From 60e7c2a3f1e8acdb847bbb555c64a6dc6955ba9c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Mar 2022 20:02:44 -0500 Subject: [PATCH 001/167] Adding pylibraft --- .gitignore | 4 +- python/.flake8 | 9 - python/.flake8.cython | 28 - python/pytest.ini | 8 - python/raft/__init__.py | 16 - python/raft/_version.py | 567 ----- python/raft/common/__init__.pxd | 0 python/raft/common/__init__.py | 17 - python/raft/common/cuda.pxd | 22 - python/raft/common/cuda.pyx | 84 - python/raft/common/handle.pxd | 48 - python/raft/common/handle.pyx | 90 - python/raft/common/interruptible.pxd | 34 - python/raft/common/interruptible.pyx | 84 - python/raft/dask/__init__.py | 16 - python/raft/dask/common/__init__.py | 34 - python/raft/dask/common/comms.py | 648 ------ python/raft/dask/common/comms_utils.pyx | 313 --- python/raft/dask/common/nccl.pyx | 253 --- python/raft/dask/common/ucx.py | 93 - python/raft/dask/common/utils.py | 39 - python/raft/include_test/__init__.py | 16 - .../raft/include_test/raft_include_test.pyx | 19 - python/raft/test/__init__.py | 14 - python/raft/test/conftest.py | 49 - python/raft/test/test_comms.py | 336 --- python/raft/test/test_interruptible.py | 54 - python/raft/test/test_raft.py | 31 - python/setup.cfg | 58 - python/setup.py | 202 -- python/setuputils.py | 65 - python/versioneer.py | 1822 ----------------- 32 files changed, 2 insertions(+), 5071 deletions(-) delete mode 100644 python/.flake8 delete mode 100644 python/.flake8.cython delete mode 100644 python/pytest.ini delete mode 100644 python/raft/__init__.py delete mode 100644 python/raft/_version.py delete mode 100644 python/raft/common/__init__.pxd delete mode 100644 python/raft/common/__init__.py delete mode 100644 python/raft/common/cuda.pxd delete mode 100644 python/raft/common/cuda.pyx delete mode 100644 python/raft/common/handle.pxd delete mode 100644 python/raft/common/handle.pyx delete mode 100644 python/raft/common/interruptible.pxd delete mode 100644 
python/raft/common/interruptible.pyx delete mode 100644 python/raft/dask/__init__.py delete mode 100644 python/raft/dask/common/__init__.py delete mode 100644 python/raft/dask/common/comms.py delete mode 100644 python/raft/dask/common/comms_utils.pyx delete mode 100644 python/raft/dask/common/nccl.pyx delete mode 100644 python/raft/dask/common/ucx.py delete mode 100644 python/raft/dask/common/utils.py delete mode 100644 python/raft/include_test/__init__.py delete mode 100644 python/raft/include_test/raft_include_test.pyx delete mode 100644 python/raft/test/__init__.py delete mode 100644 python/raft/test/conftest.py delete mode 100644 python/raft/test/test_comms.py delete mode 100644 python/raft/test/test_interruptible.py delete mode 100644 python/raft/test/test_raft.py delete mode 100644 python/setup.cfg delete mode 100644 python/setup.py delete mode 100755 python/setuputils.py delete mode 100644 python/versioneer.py diff --git a/.gitignore b/.gitignore index 60a43f6b54..742a37aa35 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,8 @@ build/ build_prims/ dist/ python/**/**/*.cpp -python/external_repositories -python/record.txt +raft/external_repositories +raft/record.txt log .ipynb_checkpoints .DS_Store diff --git a/python/.flake8 b/python/.flake8 deleted file mode 100644 index ef2e5a8495..0000000000 --- a/python/.flake8 +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -[flake8] -exclude = __init__.py -ignore = - # line break before binary operator - W503 - # whitespace before : - E203 \ No newline at end of file diff --git a/python/.flake8.cython b/python/.flake8.cython deleted file mode 100644 index 3cd436d3f3..0000000000 --- a/python/.flake8.cython +++ /dev/null @@ -1,28 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -[flake8] -filename = *.pyx, *.pxd -exclude = *.egg, build, docs, .git -ignore = E999, E225, E226, E227, W503, W504 - -# Rules ignored: -# E999: invalid syntax (works for Python, not Cython) -# E225: Missing whitespace around operators (breaks cython casting syntax like ) -# E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) -# E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) -# W503: line break before binary operator (breaks lines that start with a pointer) -# W504: line break after binary operator (breaks lines that end with a pointer) diff --git a/python/pytest.ini b/python/pytest.ini deleted file mode 100644 index e48e31a00a..0000000000 --- a/python/pytest.ini +++ /dev/null @@ -1,8 +0,0 @@ -[pytest] -markers = - unit: marks unit tests - quality: marks quality tests - stress: marks stress tests - mg: marks a test as multi-GPU - memleak: marks a test as a memory leak test - diff --git a/python/raft/__init__.py b/python/raft/__init__.py deleted file mode 100644 index b2431b4f6c..0000000000 --- a/python/raft/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from .include_test import raft_include_test diff --git a/python/raft/_version.py b/python/raft/_version.py deleted file mode 100644 index 454b0fe7aa..0000000000 --- a/python/raft/_version.py +++ /dev/null @@ -1,567 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "$Format:%d$" - git_full = "$Format:%H$" - git_date = "$Format:%ci$" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "pep440" - cfg.tag_prefix = "v" - cfg.parentdir_prefix = "raft-" - cfg.versionfile_source = "raft/_version.py" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - - return decorate - - -def run_command( - commands, args, cwd=None, verbose=False, hide_stderr=False, env=None -): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen( - [c] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - ) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % 
dispcmd) - print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r"\d", r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - out, rc = run_command( - GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True - ) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - "%s*" % tag_prefix, - ], - cwd=root, - ) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ( - "unable to parse git-describe output: '%s'" % describe_out - ) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = run_command( - GITS, ["rev-list", "HEAD", "--count"], cwd=root - ) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords( - get_keywords(), cfg.tag_prefix, verbose - ) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. 
Invert - # this to find the root from __file__. - for i in cfg.versionfile_source.split("/"): - root = os.path.dirname(root) - except NameError: - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None, - } - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } diff --git a/python/raft/common/__init__.pxd b/python/raft/common/__init__.pxd deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/python/raft/common/__init__.py b/python/raft/common/__init__.py deleted file mode 100644 index b5ef2b3079..0000000000 --- a/python/raft/common/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from .cuda import Stream -from .handle import Handle \ No newline at end of file diff --git a/python/raft/common/cuda.pxd b/python/raft/common/cuda.pxd deleted file mode 100644 index 0459cb96af..0000000000 --- a/python/raft/common/cuda.pxd +++ /dev/null @@ -1,22 +0,0 @@ -# -# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from cuda.ccudart cimport cudaStream_t - -cdef class Stream: - cdef cudaStream_t s - - cdef cudaStream_t getStream(self) diff --git a/python/raft/common/cuda.pyx b/python/raft/common/cuda.pyx deleted file mode 100644 index c3c90936aa..0000000000 --- a/python/raft/common/cuda.pyx +++ /dev/null @@ -1,84 +0,0 @@ -# -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cuda.ccudart cimport( - cudaStream_t, - cudaError_t, - cudaSuccess, - cudaStreamCreate, - cudaStreamDestroy, - cudaStreamSynchronize, - cudaGetLastError, - cudaGetErrorString, - cudaGetErrorName -) - - -class CudaRuntimeError(RuntimeError): - def __init__(self, extraMsg=None): - cdef cudaError_t e = cudaGetLastError() - cdef bytes errMsg = cudaGetErrorString(e) - cdef bytes errName = cudaGetErrorName(e) - msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode()) - if extraMsg is not None: - msg += " extraMsg='%s'" % extraMsg - super(CudaRuntimeError, self).__init__(msg) - - -cdef class Stream: - """ - Stream represents a thin-wrapper around cudaStream_t and its operations. - - Examples - -------- - - .. code-block:: python - - from raft.common.cuda import Stream - stream = Stream() - stream.sync() - del stream # optional! - """ - def __cinit__(self): - cdef cudaStream_t stream - cdef cudaError_t e = cudaStreamCreate(&stream) - if e != cudaSuccess: - raise CudaRuntimeError("Stream create") - self.s = stream - - def __dealloc__(self): - self.sync() - cdef cudaError_t e = cudaStreamDestroy(self.s) - if e != cudaSuccess: - raise CudaRuntimeError("Stream destroy") - - def sync(self): - """ - Synchronize on the cudastream owned by this object. Note that this - could raise exception due to issues with previous asynchronous - launches - """ - cdef cudaError_t e = cudaStreamSynchronize(self.s) - if e != cudaSuccess: - raise CudaRuntimeError("Stream sync") - - cdef cudaStream_t getStream(self): - return self.s diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd deleted file mode 100644 index 8415b7e3d7..0000000000 --- a/python/raft/common/handle.pxd +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - - -from libcpp.memory cimport shared_ptr -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.cuda_stream_pool cimport cuda_stream_pool -from libcpp.memory cimport shared_ptr -from libcpp.memory cimport unique_ptr - -cdef extern from "raft/mr/device/allocator.hpp" \ - namespace "raft::mr::device" nogil: - cdef cppclass allocator: - pass - -cdef extern from "raft/handle.hpp" namespace "raft" nogil: - cdef cppclass handle_t: - handle_t() except + - handle_t(cuda_stream_view stream_view) except + - handle_t(cuda_stream_view stream_view, - shared_ptr[cuda_stream_pool] stream_pool) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + - cuda_stream_view get_stream() except + - void sync_stream() except + - -cdef class Handle: - cdef unique_ptr[handle_t] c_obj - cdef shared_ptr[cuda_stream_pool] stream_pool - cdef int n_streams diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx deleted file mode 100644 index 661c5b5f23..0000000000 --- a/python/raft/common/handle.pyx +++ /dev/null @@ -1,90 +0,0 @@ -# -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -# import raft -from libcpp.memory cimport shared_ptr -from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread -from rmm._lib.cuda_stream_view cimport cuda_stream_view - -from .cuda cimport Stream -from .cuda import CudaRuntimeError - - -cdef class Handle: - """ - Handle is a lightweight python wrapper around the corresponding C++ class - of handle_t exposed by RAFT's C++ interface. Refer to the header file - raft/handle.hpp for interface level details of this struct - - Examples - -------- - - .. code-block:: python - - from raft.common import Stream, Handle - stream = Stream() - handle = Handle(stream) - - # call algos here - - # final sync of all work launched in the stream of this handle - # this is same as `raft.cuda.Stream.sync()` call, but safer in case - # the default stream inside the `handle_t` is being used - handle.sync() - del handle # optional! 
- """ - - def __cinit__(self, stream: Stream = None, n_streams=0): - self.n_streams = n_streams - if n_streams > 0: - self.stream_pool.reset(new cuda_stream_pool(n_streams)) - - cdef cuda_stream_view c_stream - if stream is None: - # this constructor will construct a "main" handle on - # per-thread default stream, which is non-blocking - self.c_obj.reset(new handle_t(cuda_stream_per_thread, - self.stream_pool)) - else: - # this constructor constructs a handle on user stream - c_stream = cuda_stream_view(stream.getStream()) - self.c_obj.reset(new handle_t(c_stream, - self.stream_pool)) - - def sync(self): - """ - Issues a sync on the stream set for this handle. - """ - self.c_obj.get()[0].sync_stream() - - def getHandle(self): - return self.c_obj.get() - - def __getstate__(self): - return self.n_streams - - def __setstate__(self, state): - self.n_streams = state - if self.n_streams > 0: - self.stream_pool.reset(new cuda_stream_pool(self.n_streams)) - - self.c_obj.reset(new handle_t(cuda_stream_per_thread, - self.stream_pool)) diff --git a/python/raft/common/interruptible.pxd b/python/raft/common/interruptible.pxd deleted file mode 100644 index a73e8c1ac7..0000000000 --- a/python/raft/common/interruptible.pxd +++ /dev/null @@ -1,34 +0,0 @@ -# -# Copyright (c) 2021-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from libcpp.memory cimport shared_ptr -from rmm._lib.cuda_stream_view cimport cuda_stream_view - -cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: - cdef cppclass interruptible: - void cancel() - -cdef extern from "raft/interruptible.hpp" \ - namespace "raft::interruptible" nogil: - cdef void inter_synchronize \ - "raft::interruptible::synchronize"(cuda_stream_view stream) except+ - cdef void inter_yield "raft::interruptible::yield"() except+ - cdef shared_ptr[interruptible] get_token() except+ diff --git a/python/raft/common/interruptible.pyx b/python/raft/common/interruptible.pyx deleted file mode 100644 index dfc95490ed..0000000000 --- a/python/raft/common/interruptible.pyx +++ /dev/null @@ -1,84 +0,0 @@ -# -# Copyright (c) 2021-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -import contextlib -import signal -from cython.operator cimport dereference - -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from cuda.ccudart cimport cudaStream_t -from .cuda cimport Stream - - -@contextlib.contextmanager -def cuda_interruptible(): - ''' - Temporarily install a keyboard interrupt handler (Ctrl+C) - that cancels the enclosed interruptible C++ thread. 
- - Use this on a long-running C++ function imported via cython: - - .. code-block:: python - - with cuda_interruptible(): - my_long_running_function(...) - - It's also recommended to release the GIL during the call, to - make sure the handler has a chance to run: - - .. code-block:: python - - with cuda_interruptible(): - with nogil: - my_long_running_function(...) - - ''' - cdef shared_ptr[interruptible] token = get_token() - - def newhr(*args, **kwargs): - with nogil: - dereference(token).cancel() - - oldhr = signal.signal(signal.SIGINT, newhr) - try: - yield - finally: - signal.signal(signal.SIGINT, oldhr) - - -def synchronize(stream: Stream): - ''' - Same as cudaStreamSynchronize, but can be interrupted - if called within a `with cuda_interruptible()` block. - ''' - cdef cuda_stream_view c_stream = cuda_stream_view(stream.getStream()) - with nogil: - inter_synchronize(c_stream) - - -def cuda_yield(): - ''' - Check for an asynchronously received interrupted_exception. - Raises the exception if a user pressed Ctrl+C within a - `with cuda_interruptible()` block before. - ''' - with nogil: - inter_yield() diff --git a/python/raft/dask/__init__.py b/python/raft/dask/__init__.py deleted file mode 100644 index 74231d256f..0000000000 --- a/python/raft/dask/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from .common.comms import Comms \ No newline at end of file diff --git a/python/raft/dask/common/__init__.py b/python/raft/dask/common/__init__.py deleted file mode 100644 index c2265f6828..0000000000 --- a/python/raft/dask/common/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from .comms import Comms -from .comms import local_handle - -from .comms_utils import inject_comms_on_handle -from .comms_utils import inject_comms_on_handle_coll_only -from .comms_utils import perform_test_comms_allreduce -from .comms_utils import perform_test_comms_send_recv -from .comms_utils import perform_test_comms_device_send_or_recv -from .comms_utils import perform_test_comms_device_sendrecv -from .comms_utils import perform_test_comms_device_multicast_sendrecv -from .comms_utils import perform_test_comms_allgather -from .comms_utils import perform_test_comms_gather -from .comms_utils import perform_test_comms_gatherv -from .comms_utils import perform_test_comms_bcast -from .comms_utils import perform_test_comms_reduce -from .comms_utils import perform_test_comms_reducescatter -from .comms_utils import perform_test_comm_split - -from .ucx import UCX diff --git a/python/raft/dask/common/comms.py b/python/raft/dask/common/comms.py deleted file mode 100644 index ee768b41ff..0000000000 --- a/python/raft/dask/common/comms.py +++ /dev/null @@ -1,648 +0,0 @@ -# Copyright (c) 2020, NVIDIA 
CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from .nccl import nccl -from .ucx import UCX - -from .comms_utils import inject_comms_on_handle -from .comms_utils import inject_comms_on_handle_coll_only - -from .utils import parse_host_port -from ...common.handle import Handle - -from dask.distributed import get_worker, default_client - -import warnings - -import logging -import time -import uuid -from collections import OrderedDict - -logger = logging.getLogger(__name__) - - -class Comms: - - """ - Initializes and manages underlying NCCL and UCX comms handles across - the workers of a Dask cluster. It is expected that `init()` will be - called explicitly. It is recommended to also call `destroy()` when - the comms are no longer needed so the underlying resources can be - cleaned up. This class is not meant to be thread-safe. - - Examples - -------- - .. code-block:: python - - # The following code block assumes we have wrapped a C++ - # function in a Python function called `run_algorithm`, - # which takes a `raft::handle_t` as a single argument. 
- # Once the `Comms` instance is successfully initialized, - # the underlying `raft::handle_t` will contain an instance - # of `raft::comms::comms_t` - - from dask_cuda import LocalCUDACluster - from dask.distributed import Client - - from raft.dask.common import Comms, local_handle - - cluster = LocalCUDACluster() - client = Client(cluster) - - def _use_comms(sessionId): - return run_algorithm(local_handle(sessionId)) - - comms = Comms(client=client) - comms.init() - - futures = [client.submit(_use_comms, - comms.sessionId, - workers=[w], - pure=False) # Don't memoize - for w in cb.worker_addresses] - wait(dfs, timeout=5) - - comms.destroy() - client.close() - cluster.close() - """ - - valid_nccl_placements = ("client", "worker", "scheduler") - - def __init__( - self, - comms_p2p=False, - client=None, - verbose=False, - streams_per_handle=0, - nccl_root_location="scheduler", - ): - """ - Construct a new CommsContext instance - - Parameters - ---------- - comms_p2p : bool - Initialize UCX endpoints? - client : dask.distributed.Client [optional] - Dask client to use - verbose : bool - Print verbose logging - nccl_root_location : string - Indicates where the NCCL's root node should be located. 
- ['client', 'worker', 'scheduler' (default)] - - """ - self.client = client if client is not None else default_client() - - self.comms_p2p = comms_p2p - - self.nccl_root_location = nccl_root_location.lower() - if self.nccl_root_location not in Comms.valid_nccl_placements: - raise ValueError( - f"nccl_root_location must be one of: " - f"{Comms.valid_nccl_placements}" - ) - - self.streams_per_handle = streams_per_handle - - self.sessionId = uuid.uuid4().bytes - - self.nccl_initialized = False - self.ucx_initialized = False - - self.verbose = verbose - - if verbose: - print("Initializing comms!") - - def __del__(self): - if self.nccl_initialized or self.ucx_initialized: - self.destroy() - - def create_nccl_uniqueid(self): - if self.nccl_root_location == "client": - self.uniqueId = nccl.get_unique_id() - elif self.nccl_root_location == "worker": - self.uniqueId = self.client.run( - _func_set_worker_as_nccl_root, - sessionId=self.sessionId, - verbose=self.verbose, - workers=[self.worker_addresses[0]], - wait=True, - )[self.worker_addresses[0]] - else: - self.uniqueId = self.client.run_on_scheduler( - _func_set_scheduler_as_nccl_root, - sessionId=self.sessionId, - verbose=self.verbose, - ) - - def worker_info(self, workers): - """ - Builds a dictionary of { (worker_address, worker_port) : - (worker_rank, worker_port ) } - """ - ranks = _func_worker_ranks(workers) - ports = ( - _func_ucp_ports(self.client, workers) if self.comms_p2p else None - ) - - output = {} - for k in ranks.keys(): - output[k] = {"rank": ranks[k]} - if self.comms_p2p: - output[k]["port"] = ports[k] - return output - - def init(self, workers=None): - """ - Initializes the underlying comms. NCCL is required but - UCX is only initialized if `comms_p2p == True` - - Parameters - ---------- - - workers : Sequence - Unique collection of workers for initializing comms. 
- """ - - self.worker_addresses = list( - OrderedDict.fromkeys( - self.client.scheduler_info()["workers"].keys() - if workers is None - else workers - ) - ) - - if self.nccl_initialized or self.ucx_initialized: - warnings.warn("Comms have already been initialized.") - return - - worker_info = self.worker_info(self.worker_addresses) - worker_info = {w: worker_info[w] for w in self.worker_addresses} - - self.create_nccl_uniqueid() - - self.client.run( - _func_init_all, - self.sessionId, - self.uniqueId, - self.comms_p2p, - worker_info, - self.verbose, - self.streams_per_handle, - workers=self.worker_addresses, - wait=True, - ) - - self.nccl_initialized = True - - if self.comms_p2p: - self.ucx_initialized = True - - if self.verbose: - print("Initialization complete.") - - def destroy(self): - """ - Shuts down initialized comms and cleans up resources. This will - be called automatically by the Comms destructor, but may be called - earlier to save resources. - """ - self.client.run( - _func_destroy_all, - self.sessionId, - self.comms_p2p, - self.verbose, - wait=True, - workers=self.worker_addresses, - ) - - if self.nccl_root_location == "scheduler": - self.client.run_on_scheduler( - _func_destroy_scheduler_session, self.sessionId - ) - - if self.verbose: - print("Destroying comms.") - - self.nccl_initialized = False - self.ucx_initialized = False - - -def local_handle(sessionId): - """ - Simple helper function for retrieving the local handle_t instance - for a comms session on a worker. - - Parameters - ---------- - sessionId : str - session identifier from an initialized comms instance - - Returns - ------- - - handle : raft.Handle or None - """ - state = get_raft_comm_state(sessionId, get_worker()) - return state["handle"] if "handle" in state else None - - -def get_raft_comm_state(sessionId, state_object=None): - """ - Retrieves cuML comms state on the scheduler node, for the given sessionId, - creating a new session if it does not exist. 
If no session id is given, - returns the state dict for all sessions. - - Parameters - ---------- - sessionId : SessionId value to retrieve from the dask_scheduler instances - state_object : Object (either Worker, or Scheduler) on which the raft - comm state will retrieved (or created) - - Returns - ------- - - session state : str - session state associated with sessionId - """ - state_object = state_object if state_object is not None else get_worker() - - if not hasattr(state_object, "_raft_comm_state"): - state_object._raft_comm_state = {} - - if ( - sessionId is not None - and sessionId not in state_object._raft_comm_state - ): - state_object._raft_comm_state[sessionId] = {"ts": time.time()} - - if sessionId is not None: - return state_object._raft_comm_state[sessionId] - - return state_object._raft_comm_state - - -def set_nccl_root(sessionId, state_object): - if sessionId is None: - raise ValueError("sessionId cannot be None.") - - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=state_object - ) - - if "nccl_uid" not in raft_comm_state: - raft_comm_state["nccl_uid"] = nccl.get_unique_id() - - return raft_comm_state["nccl_uid"] - - -def get_ucx(): - """ - A simple convenience wrapper to make sure UCP listener and - endpoints are only ever assigned once per worker. - """ - raft_comm_state = get_raft_comm_state( - sessionId="ucp", state_object=get_worker() - ) - if "ucx" not in raft_comm_state: - raft_comm_state["ucx"] = UCX.get() - - return raft_comm_state["ucx"] - - -def _func_destroy_scheduler_session(sessionId, dask_scheduler): - """ - Remove session date from _raft_comm_state, associated with sessionId - - Parameters - ---------- - sessionId : session Id to be destroyed. 
- dask_scheduler : dask_scheduler object - (Note: this is supplied by DASK, not the client) - """ - if sessionId is not None and sessionId in dask_scheduler._raft_comm_state: - del dask_scheduler._raft_comm_state[sessionId] - else: - return 1 - - return 0 - - -def _func_set_scheduler_as_nccl_root(sessionId, verbose, dask_scheduler): - """ - Creates a persistent nccl uniqueId on the scheduler node. - - - Parameters - ---------- - sessionId : Associated session to attach the unique ID to. - verbose : Indicates whether or not to emit additional information - dask_scheduler : dask scheduler object, - (Note: this is supplied by DASK, not the client) - - Return - ------ - uniqueId : byte str - NCCL uniqueId, associating the DASK scheduler as its root node. - """ - if verbose: - logger.info( - msg=f"Setting scheduler as NCCL " - f"root for sessionId, '{sessionId}'" - ) - - nccl_uid = set_nccl_root(sessionId=sessionId, state_object=dask_scheduler) - - if verbose: - logger.info("Done setting scheduler as NCCL root.") - - return nccl_uid - - -def _func_set_worker_as_nccl_root(sessionId, verbose): - """ - Creates a persistent nccl uniqueId on the scheduler node. - - - Parameters - ---------- - sessionId : Associated session to attach the unique ID to. - verbose : Indicates whether or not to emit additional information - - Return - ------ - uniqueId : byte str - NCCL uniqueId, associating this DASK worker as its root node. - """ - worker = get_worker() - if verbose: - worker.log_event( - topic="info", - msg=f"Setting worker as NCCL root for session, '{sessionId}'", - ) - - nccl_uid = set_nccl_root(sessionId=sessionId, state_object=worker) - - if verbose: - worker.log_event( - topic="info", msg="Done setting scheduler as NCCL root." 
- ) - - return nccl_uid - - -def _func_ucp_listener_port(): - return get_ucx().listener_port() - - -async def _func_init_all( - sessionId, uniqueId, comms_p2p, worker_info, verbose, streams_per_handle -): - worker = get_worker() - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker - ) - raft_comm_state["nccl_uid"] = uniqueId - raft_comm_state["wid"] = worker_info[get_worker().address]["rank"] - raft_comm_state["nworkers"] = len(worker_info) - - if verbose: - worker.log_event(topic="info", msg="Initializing NCCL.") - start = time.time() - - _func_init_nccl(sessionId, uniqueId) - - if verbose: - elapsed = time.time() - start - worker.log_event( - topic="info", msg=f"NCCL Initialization took: {elapsed} seconds." - ) - - if comms_p2p: - if verbose: - worker.log_event(topic="info", msg="Initializing UCX Endpoints") - - if verbose: - start = time.time() - await _func_ucp_create_endpoints(sessionId, worker_info) - - if verbose: - elapsed = time.time() - start - msg = ( - f"Done initializing UCX endpoints." - f"Took: {elapsed} seconds.\nBuilding handle." - ) - worker.log_event(topic="info", msg=msg) - - _func_build_handle_p2p(sessionId, streams_per_handle, verbose) - - if verbose: - worker.log_event(topic="info", msg="Done building handle.") - - else: - _func_build_handle(sessionId, streams_per_handle, verbose) - - -def _func_init_nccl(sessionId, uniqueId): - """ - Initialize ncclComm_t on worker - - Parameters - ---------- - sessionId : str - session identifier from a comms instance - uniqueId : array[byte] - The NCCL unique Id generated from the - client. 
- """ - - worker = get_worker() - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=get_worker() - ) - wid = raft_comm_state["wid"] - nWorkers = raft_comm_state["nworkers"] - - try: - n = nccl() - n.init(nWorkers, uniqueId, wid) - raft_comm_state["nccl"] = n - except Exception as e: - worker.log_event( - topic="error", msg=f"An error occurred initializing NCCL: {e}." - ) - raise - - -def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): - """ - Builds a handle_t on the current worker given the initialized comms - - Parameters - ---------- - sessionId : str id to reference state for current comms instance. - streams_per_handle : int number of internal streams to create - verbose : bool print verbose logging output - """ - worker = get_worker() - if verbose: - worker.log_event(topic="info", msg="Building p2p handle.") - - ucp_worker = get_ucx().get_worker() - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker - ) - - handle = Handle(n_streams=streams_per_handle) - nccl_comm = raft_comm_state["nccl"] - eps = raft_comm_state["ucp_eps"] - nWorkers = raft_comm_state["nworkers"] - workerId = raft_comm_state["wid"] - - if verbose: - worker.log_event(topic="info", msg="Injecting comms on handle.") - - inject_comms_on_handle( - handle, nccl_comm, ucp_worker, eps, nWorkers, workerId, verbose - ) - - if verbose: - worker.log_event( - topic="info", msg="Finished injecting comms on handle." - ) - - raft_comm_state["handle"] = handle - - -def _func_build_handle(sessionId, streams_per_handle, verbose): - """ - Builds a handle_t on the current worker given the initialized comms - - Parameters - ---------- - sessionId : str id to reference state for current comms instance. - streams_per_handle : int number of internal streams to create - verbose : bool print verbose logging output - """ - worker = get_worker() - if verbose: - worker.log_event( - topic="info", msg="Finished injecting comms on handle." 
- ) - - handle = Handle(n_streams=streams_per_handle) - - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker - ) - - workerId = raft_comm_state["wid"] - nWorkers = raft_comm_state["nworkers"] - - nccl_comm = raft_comm_state["nccl"] - inject_comms_on_handle_coll_only( - handle, nccl_comm, nWorkers, workerId, verbose - ) - raft_comm_state["handle"] = handle - - -def _func_store_initial_state(nworkers, sessionId, uniqueId, wid): - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=get_worker() - ) - raft_comm_state["nccl_uid"] = uniqueId - raft_comm_state["wid"] = wid - raft_comm_state["nworkers"] = nworkers - - -async def _func_ucp_create_endpoints(sessionId, worker_info): - """ - Runs on each worker to create ucp endpoints to all other workers - - Parameters - ---------- - sessionId : str - uuid unique id for this instance - worker_info : dict - Maps worker addresses to NCCL ranks & UCX ports - """ - eps = [None] * len(worker_info) - count = 1 - - for k in worker_info: - ip, port = parse_host_port(k) - - ep = await get_ucx().get_endpoint(ip, worker_info[k]["port"]) - - eps[worker_info[k]["rank"]] = ep - count += 1 - - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=get_worker() - ) - raft_comm_state["ucp_eps"] = eps - - -async def _func_destroy_all(sessionId, comms_p2p, verbose=False): - worker = get_worker() - if verbose: - worker.log_event(topic="info", msg="Destroying NCCL session state.") - - raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker - ) - if "nccl" in raft_comm_state: - raft_comm_state["nccl"].destroy() - del raft_comm_state["nccl"] - if verbose: - worker.log_event(topic="info", msg="NCCL session state destroyed.") - else: - if verbose: - worker.log_event( - topic="warning", - msg=f"Session state for, '{sessionId}', " - f"does not contain expected 'nccl' element", - ) - - if verbose: - worker.log_event( - topic="info", - msg=f"Destroying 
CUDA handle for sessionId, '{sessionId}.'", - ) - - if "handle" in raft_comm_state: - del raft_comm_state["handle"] - else: - if verbose: - worker.log_event( - topic="warning", - msg=f"Session state for, '{sessionId}', " - f"does not contain expected 'handle' element", - ) - - -def _func_ucp_ports(client, workers): - return client.run(_func_ucp_listener_port, workers=workers) - - -def _func_worker_ranks(workers): - """ - Builds a dictionary of { (worker_address, worker_port) : worker_rank } - """ - return dict(list(zip(workers, range(len(workers))))) diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx deleted file mode 100644 index 38c5670372..0000000000 --- a/python/raft/dask/common/comms_utils.pyx +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from libc.stdlib cimport malloc, free -from cython.operator cimport dereference as deref - -from cpython.long cimport PyLong_AsVoidPtr - -from libcpp cimport bool - -from libc.stdint cimport uintptr_t - -cdef extern from "nccl.h": - - cdef struct ncclComm - ctypedef ncclComm *ncclComm_t - -cdef extern from "raft/handle.hpp" namespace "raft": - cdef cppclass handle_t: - handle_t() except + - -cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": - - void build_comms_nccl_ucx(handle_t *handle, - ncclComm_t comm, - void *ucp_worker, - void *eps, - int size, - int rank) except + - - void build_comms_nccl_only(handle_t *handle, - ncclComm_t comm, - int size, - int rank) except + - -cdef extern from "raft/comms/comms_test.hpp" namespace "raft::comms": - - bool test_collective_allreduce(const handle_t &h, int root) except + - bool test_collective_broadcast(const handle_t &h, int root) except + - bool test_collective_reduce(const handle_t &h, int root) except + - bool test_collective_allgather(const handle_t &h, int root) except + - bool test_collective_gather(const handle_t &h, int root) except + - bool test_collective_gatherv(const handle_t &h, int root) except + - bool test_collective_reducescatter(const handle_t &h, int root) except + - bool test_pointToPoint_simple_send_recv(const handle_t &h, - int numTrials) except + - bool test_pointToPoint_device_send_or_recv(const handle_t &h, - int numTrials) except + - bool test_pointToPoint_device_sendrecv(const handle_t &h, - int numTrials) except + - bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, - int numTrials) except + - bool test_commsplit(const handle_t &h, int n_colors) except + - - -def perform_test_comms_allreduce(handle, root): - """ - Performs an allreduce on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing 
comms_t to use - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_allreduce(deref(h), root) - - -def perform_test_comms_reduce(handle, root): - """ - Performs an allreduce on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_reduce(deref(h), root) - - -def perform_test_comms_reducescatter(handle, root): - """ - Performs an allreduce on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_reducescatter(deref(h), root) - - -def perform_test_comms_bcast(handle, root): - """ - Performs an broadcast on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_broadcast(deref(h), root) - - -def perform_test_comms_allgather(handle, root): - """ - Performs an broadcast on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_allgather(deref(h), root) - - -def perform_test_comms_gather(handle, root): - """ - Performs a gather on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - root : int - Rank of the root worker - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_gather(deref(h), root) - - -def perform_test_comms_gatherv(handle, root): - """ - Performs a gatherv on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - root : int - Rank of the root worker - """ - cdef const handle_t* h = handle.getHandle() - return test_collective_gatherv(deref(h), root) - - -def 
perform_test_comms_send_recv(handle, n_trials): - """ - Performs a p2p send/recv on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - n_trilas : int - Number of test trials - """ - cdef const handle_t *h = handle.getHandle() - return test_pointToPoint_simple_send_recv(deref(h), n_trials) - - -def perform_test_comms_device_send_or_recv(handle, n_trials): - """ - Performs a p2p device send or recv on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - n_trilas : int - Number of test trials - """ - cdef const handle_t *h = handle.getHandle() - return test_pointToPoint_device_send_or_recv(deref(h), n_trials) - - -def perform_test_comms_device_sendrecv(handle, n_trials): - """ - Performs a p2p device concurrent send&recv on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - n_trilas : int - Number of test trials - """ - cdef const handle_t *h = handle.getHandle() - return test_pointToPoint_device_sendrecv(deref(h), n_trials) - - -def perform_test_comms_device_multicast_sendrecv(handle, n_trials): - """ - Performs a p2p device concurrent multicast send&recv on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - n_trilas : int - Number of test trials - """ - cdef const handle_t *h = handle.getHandle() - return test_pointToPoint_device_multicast_sendrecv(deref(h), n_trials) - - -def perform_test_comm_split(handle, n_colors): - """ - Performs a p2p send/recv on the current worker - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - """ - cdef const handle_t * h = < handle_t * > < size_t > handle.getHandle() - return test_commsplit(deref(h), < int > n_colors) - - -def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): - """ - Given a handle and initialized nccl 
comm, creates a comms_t - instance and injects it into the handle. - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - nccl_inst : raft.dask.common.nccl - Initialized nccl comm to use - size : int - Number of workers in cluster - rank : int - Rank of current worker - - """ - - cdef size_t handle_size_t = handle.getHandle() - handle_ = handle_size_t - - cdef size_t nccl_comm_size_t = nccl_inst.get_comm() - nccl_comm_ = nccl_comm_size_t - - build_comms_nccl_only(handle_, - deref(nccl_comm_), - size, - rank) - - -def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size, - rank, verbose): - """ - Given a handle and initialized comms, creates a comms_t instance - and injects it into the handle. - - Parameters - ---------- - handle : raft.common.Handle - handle containing comms_t to use - nccl_inst : raft.dask.common.nccl - Initialized nccl comm to use - ucp_worker : size_t pointer to initialized ucp_worker_h instance - eps: size_t pointer to array of initialized ucp_ep_h instances - size : int - Number of workers in cluster - rank : int - Rank of current worker - """ - cdef size_t *ucp_eps = malloc(len(eps)*sizeof(size_t)) - - for i in range(len(eps)): - if eps[i] is not None: - ep_st = eps[i].get_ucp_endpoint() - ucp_eps[i] = ep_st - else: - ucp_eps[i] = 0 - - cdef void* ucp_worker_st = ucp_worker - - cdef size_t handle_size_t = handle.getHandle() - handle_ = handle_size_t - - cdef size_t nccl_comm_size_t = nccl_inst.get_comm() - nccl_comm_ = nccl_comm_size_t - - build_comms_nccl_ucx(handle_, - deref(nccl_comm_), - ucp_worker_st, - ucp_eps, - size, - rank) - - free(ucp_eps) diff --git a/python/raft/dask/common/nccl.pyx b/python/raft/dask/common/nccl.pyx deleted file mode 100644 index fd113e2222..0000000000 --- a/python/raft/dask/common/nccl.pyx +++ /dev/null @@ -1,253 +0,0 @@ -# -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from libc.stdint cimport uintptr_t -from cython.operator cimport dereference as deref - -from libcpp cimport bool -from libc.stdlib cimport malloc, free - -cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": - void get_unique_id(char *uid, int size) except + - void nccl_unique_id_from_char(ncclUniqueId *id, - char *uniqueId, - int size) except + - -cdef extern from "nccl.h": - - cdef struct ncclComm - - ctypedef struct ncclUniqueId: - char *internal[128] - - ctypedef ncclComm *ncclComm_t - - ctypedef enum ncclResult_t: - ncclSuccess - ncclUnhandledCudaError - ncclSystemError - ncclInternalError - ncclInvalidArgument - ncclInvalidUsage - ncclNumResults - - ncclResult_t ncclCommInitRank(ncclComm_t *comm, - int nranks, - ncclUniqueId commId, - int rank) nogil - - ncclResult_t ncclGetUniqueId(ncclUniqueId *uniqueId) nogil - - ncclResult_t ncclCommUserRank(const ncclComm_t comm, int *rank) nogil - - ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int *count) nogil - - const char *ncclGetErrorString(ncclResult_t result) nogil - - ncclResult_t ncclCommAbort(ncclComm_t comm) nogil - - ncclResult_t ncclCommDestroy(ncclComm_t comm) nogil - -NCCL_UNIQUE_ID_BYTES = 128 - - -def unique_id(): - """ - Returns a new ncclUniqueId converted to a - character array that can be safely serialized - 
and shared to a remote worker. - - Returns - ------- - 128-byte unique id : str - """ - cdef char *uid = malloc(NCCL_UNIQUE_ID_BYTES * sizeof(char)) - get_unique_id(uid, NCCL_UNIQUE_ID_BYTES) - c_str = uid[:NCCL_UNIQUE_ID_BYTES-1] - free(uid) - return c_str - - -cdef class nccl: - """ - A NCCL wrapper for initializing and closing NCCL comms - in Python. - """ - cdef ncclComm_t *comm - - cdef int size - cdef int rank - - def __cinit__(self): - self.comm = malloc(sizeof(ncclComm_t)) - - def __dealloc__(self): - - comm_ = self.comm - - if comm_ != NULL: - free(self.comm) - self.comm = NULL - - @staticmethod - def get_unique_id(): - """ - Returns a new nccl unique id - - Returns - ------- - nccl unique id : str - """ - return unique_id() - - def init(self, nranks, commId, rank): - """ - Construct a nccl-py object - - Parameters - ---------- - nranks : int size of clique - commId : string unique id from client - rank : int rank of current worker - """ - self.size = nranks - self.rank = rank - - cdef ncclUniqueId *ident = malloc(sizeof(ncclUniqueId)) - nccl_unique_id_from_char(ident, commId, NCCL_UNIQUE_ID_BYTES) - - comm_ = self.comm - - cdef int nr = nranks - cdef int r = rank - cdef ncclResult_t result - - with nogil: - result = ncclCommInitRank(comm_, nr, - deref(ident), r) - - if result != ncclSuccess: - with nogil: - err_str = ncclGetErrorString(result) - - raise RuntimeError("NCCL_ERROR: %s" % err_str) - - def destroy(self): - """ - Call destroy on the underlying NCCL comm - """ - comm_ = self.comm - - cdef ncclResult_t result - if comm_ != NULL: - with nogil: - result = ncclCommDestroy(deref(comm_)) - - free(self.comm) - self.comm = NULL - - if result != ncclSuccess: - with nogil: - err_str = ncclGetErrorString(result) - - raise RuntimeError("NCCL_ERROR: %s" % err_str) - - def abort(self): - """ - Call abort on the underlying nccl comm - """ - comm_ = self.comm - cdef ncclResult_t result - if comm_ != NULL: - with nogil: - result = ncclCommAbort(deref(comm_)) - - 
free(comm_) - self.comm = NULL - - if result != ncclSuccess: - with nogil: - err_str = ncclGetErrorString(result) - raise RuntimeError("NCCL_ERROR: %s" % err_str) - - def cu_device(self): - """ - Get the device backing the underlying comm - - Returns - ------- - device id : int - """ - cdef int *dev = malloc(sizeof(int)) - - comm_ = self.comm - cdef ncclResult_t result - with nogil: - result = ncclCommCuDevice(deref(comm_), dev) - - ret = dev[0] - free(dev) - - if result != ncclSuccess: - with nogil: - err_str = ncclGetErrorString(result) - - raise RuntimeError("NCCL_ERROR: %s" % err_str) - - return ret - - def user_rank(self): - """ - Get the rank id of the current comm - - Returns - ------- - rank : int - """ - - cdef int *rank = malloc(sizeof(int)) - - comm_ = self.comm - - cdef ncclResult_t result - with nogil: - result = ncclCommUserRank(deref(comm_), rank) - - ret = rank[0] - free(rank) - - if result != ncclSuccess: - with nogil: - err_str = ncclGetErrorString(result) - raise RuntimeError("NCCL_ERROR: %s" % err_str) - - return ret - - def get_comm(self): - """ - Returns the underlying nccl comm in a size_t (similar to void*). - This can be safely typecasted from size_t into ncclComm_t* - - Returns - ------- - ncclComm_t instance pointer : size_t - """ - return self.comm diff --git a/python/raft/dask/common/ucx.py b/python/raft/dask/common/ucx.py deleted file mode 100644 index f61479a0eb..0000000000 --- a/python/raft/dask/common/ucx.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import ucp - - -async def _connection_func(ep): - UCX.get().add_server_endpoint(ep) - - -class UCX: - """ - Singleton UCX context to encapsulate all interactions with the - UCX-py API and guarantee only a single listener & endpoints are - created by RAFT Comms on a single process. - """ - - __instance = None - - def __init__(self, listener_callback): - - self.listener_callback = listener_callback - - self._create_listener() - self._endpoints = {} - self._server_endpoints = [] - - assert UCX.__instance is None - - UCX.__instance = self - - @staticmethod - def get(listener_callback=_connection_func): - if UCX.__instance is None: - UCX(listener_callback) - return UCX.__instance - - def get_worker(self): - return ucp.get_ucp_worker() - - def _create_listener(self): - self._listener = ucp.create_listener(self.listener_callback) - - def listener_port(self): - return self._listener.port - - async def _create_endpoint(self, ip, port): - ep = await ucp.create_endpoint(ip, port) - self._endpoints[(ip, port)] = ep - return ep - - def add_server_endpoint(self, ep): - self._server_endpoints.append(ep) - - async def get_endpoint(self, ip, port): - if (ip, port) not in self._endpoints: - ep = await self._create_endpoint(ip, port) - else: - ep = self._endpoints[(ip, port)] - - return ep - - async def close_endpoints(self): - for k, ep in self._endpoints.items(): - await ep.close() - - for ep in self._server_endpoints: - ep.close() - - def __del__(self): - for ip_port, ep in self._endpoints.items(): - if not ep.closed(): - ep.abort() - del ep - - for ep in self._server_endpoints: - if not ep.closed(): - ep.abort() - del ep - - self._listener.close() diff --git a/python/raft/dask/common/utils.py b/python/raft/dask/common/utils.py deleted file mode 100644 index fdb5acfb5d..0000000000 --- a/python/raft/dask/common/utils.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2020, 
NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from dask.distributed import default_client - - -def get_client(client=None): - return default_client() if client is None else client - - -def parse_host_port(address): - """ - Given a string address with host/port, build a tuple(host, port) - - Parameters - ---------- - address: string address to parse - - Returns - ------- - tuple with host and port info : tuple(host, port) - """ - if '://' in address: - address = address.rsplit('://', 1)[1] - host, port = address.split(':') - port = int(port) - return host, port diff --git a/python/raft/include_test/__init__.py b/python/raft/include_test/__init__.py deleted file mode 100644 index 2b81c05b26..0000000000 --- a/python/raft/include_test/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from .raft_include_test import raft_include_test diff --git a/python/raft/include_test/raft_include_test.pyx b/python/raft/include_test/raft_include_test.pyx deleted file mode 100644 index 6ebcb79256..0000000000 --- a/python/raft/include_test/raft_include_test.pyx +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -def raft_include_test(): - print("RAFT Setup succesfully") - return True diff --git a/python/raft/test/__init__.py b/python/raft/test/__init__.py deleted file mode 100644 index df8a4ae3b9..0000000000 --- a/python/raft/test/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# \ No newline at end of file diff --git a/python/raft/test/conftest.py b/python/raft/test/conftest.py deleted file mode 100644 index 7ba0e36b0e..0000000000 --- a/python/raft/test/conftest.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest - -from dask.distributed import Client - -from dask_cuda import initialize -from dask_cuda import LocalCUDACluster - -import os -os.environ["UCX_LOG_LEVEL"] = "error" - - -enable_tcp_over_ucx = True -enable_nvlink = False -enable_infiniband = False - - -@pytest.fixture(scope="session") -def cluster(): - cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) - yield cluster - cluster.close() - - -@pytest.fixture(scope="session") -def ucx_cluster(): - initialize.initialize(create_cuda_context=True, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband) - cluster = LocalCUDACluster(protocol="ucx", - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband) - yield cluster - cluster.close() - - -@pytest.fixture(scope="session") -def client(cluster): - client = Client(cluster) - yield client - client.close() - - -@pytest.fixture() -def ucx_client(ucx_cluster): - client = Client(cluster) - yield client - client.close() diff --git a/python/raft/test/test_comms.py b/python/raft/test/test_comms.py deleted file mode 100644 index a540e8db10..0000000000 --- a/python/raft/test/test_comms.py +++ /dev/null @@ -1,336 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import pytest - -from collections import OrderedDict - -from dask.distributed import Client -from dask.distributed import wait - -try: - from raft.dask import Comms - from raft.dask.common import local_handle - from raft.dask.common import perform_test_comms_send_recv - from raft.dask.common import perform_test_comms_device_send_or_recv - from raft.dask.common import perform_test_comms_device_sendrecv - from raft.dask.common import perform_test_comms_device_multicast_sendrecv - from raft.dask.common import perform_test_comms_allreduce - from raft.dask.common import perform_test_comms_bcast - from raft.dask.common import perform_test_comms_reduce - from raft.dask.common import perform_test_comms_allgather - from raft.dask.common import perform_test_comms_gather - from raft.dask.common import perform_test_comms_gatherv - from raft.dask.common import perform_test_comms_reducescatter - from raft.dask.common import perform_test_comm_split - - pytestmark = pytest.mark.mg -except ImportError: - pytestmark = pytest.mark.skip - - -def test_comms_init_no_p2p(cluster): - - client = Client(cluster) - - try: - cb = Comms(verbose=True) - cb.init() - - assert cb.nccl_initialized is True - assert cb.ucx_initialized is False - - finally: - - cb.destroy() - client.close() - - -def func_test_collective(func, sessionId, root): - handle = local_handle(sessionId) - return func(handle, root) - - -def func_test_send_recv(sessionId, n_trials): - handle = local_handle(sessionId) - return perform_test_comms_send_recv(handle, n_trials) - - -def func_test_device_send_or_recv(sessionId, n_trials): - handle = local_handle(sessionId) - return perform_test_comms_device_send_or_recv(handle, n_trials) - - -def func_test_device_sendrecv(sessionId, n_trials): - handle = local_handle(sessionId) - return perform_test_comms_device_sendrecv(handle, n_trials) - - -def 
func_test_device_multicast_sendrecv(sessionId, n_trials): - handle = local_handle(sessionId) - return perform_test_comms_device_multicast_sendrecv(handle, n_trials) - - -def func_test_comm_split(sessionId, n_trials): - handle = local_handle(sessionId) - return perform_test_comm_split(handle, n_trials) - - -def func_check_uid(sessionId, uniqueId, state_object): - if not hasattr(state_object, "_raft_comm_state"): - return 1 - - state = state_object._raft_comm_state - if sessionId not in state: - return 2 - - session_state = state[sessionId] - if "nccl_uid" not in session_state: - return 3 - - nccl_uid = session_state["nccl_uid"] - if nccl_uid != uniqueId: - return 4 - - return 0 - - -def func_check_uid_on_scheduler(sessionId, uniqueId, dask_scheduler): - return func_check_uid( - sessionId=sessionId, uniqueId=uniqueId, state_object=dask_scheduler - ) - - -def func_check_uid_on_worker(sessionId, uniqueId): - from dask.distributed import get_worker - - return func_check_uid( - sessionId=sessionId, uniqueId=uniqueId, state_object=get_worker() - ) - - -def test_handles(cluster): - - client = Client(cluster) - - def _has_handle(sessionId): - return local_handle(sessionId) is not None - - try: - cb = Comms(verbose=True) - cb.init() - - dfs = [ - client.submit(_has_handle, cb.sessionId, pure=False, workers=[w]) - for w in cb.worker_addresses - ] - wait(dfs, timeout=5) - - assert all(client.compute(dfs, sync=True)) - - finally: - cb.destroy() - client.close() - - -if pytestmark.markname != "skip": - functions = [ - perform_test_comms_allgather, - perform_test_comms_allreduce, - perform_test_comms_bcast, - perform_test_comms_gather, - perform_test_comms_gatherv, - perform_test_comms_reduce, - perform_test_comms_reducescatter, - ] -else: - functions = [None] - - -@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"]) -def test_nccl_root_placement(client, root_location): - - cb = None - try: - cb = Comms( - verbose=True, client=client, 
nccl_root_location=root_location - ) - cb.init() - - worker_addresses = list( - OrderedDict.fromkeys(client.scheduler_info()["workers"].keys()) - ) - - if root_location in ("worker",): - result = client.run( - func_check_uid_on_worker, - cb.sessionId, - cb.uniqueId, - workers=[worker_addresses[0]], - )[worker_addresses[0]] - elif root_location in ("scheduler",): - result = client.run_on_scheduler( - func_check_uid_on_scheduler, cb.sessionId, cb.uniqueId - ) - else: - result = int(cb.uniqueId is None) - - assert result == 0 - - finally: - if cb: - cb.destroy() - - -@pytest.mark.parametrize("func", functions) -@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"]) -@pytest.mark.nccl -def test_collectives(client, func, root_location): - - try: - cb = Comms( - verbose=True, client=client, nccl_root_location=root_location - ) - cb.init() - - for k, v in cb.worker_info(cb.worker_addresses).items(): - - dfs = [ - client.submit( - func_test_collective, - func, - cb.sessionId, - v["rank"], - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - wait(dfs, timeout=5) - - assert all([x.result() for x in dfs]) - finally: - if cb: - cb.destroy() - - -@pytest.mark.nccl -def test_comm_split(client): - - cb = Comms(comms_p2p=True, verbose=True) - cb.init() - - dfs = [ - client.submit( - func_test_comm_split, cb.sessionId, 3, pure=False, workers=[w] - ) - for w in cb.worker_addresses - ] - - wait(dfs, timeout=5) - - assert all([x.result() for x in dfs]) - - -@pytest.mark.ucx -@pytest.mark.parametrize("n_trials", [1, 5]) -def test_send_recv(n_trials, client): - - cb = Comms(comms_p2p=True, verbose=True) - cb.init() - - dfs = [ - client.submit( - func_test_send_recv, - cb.sessionId, - n_trials, - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - - wait(dfs, timeout=5) - - assert list(map(lambda x: x.result(), dfs)) - - -@pytest.mark.nccl -@pytest.mark.parametrize("n_trials", [1, 5]) -def test_device_send_or_recv(n_trials, client): 
- - cb = Comms(comms_p2p=True, verbose=True) - cb.init() - - dfs = [ - client.submit( - func_test_device_send_or_recv, - cb.sessionId, - n_trials, - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - - wait(dfs, timeout=5) - - assert list(map(lambda x: x.result(), dfs)) - - -@pytest.mark.nccl -@pytest.mark.parametrize("n_trials", [1, 5]) -def test_device_sendrecv(n_trials, client): - - cb = Comms(comms_p2p=True, verbose=True) - cb.init() - - dfs = [ - client.submit( - func_test_device_sendrecv, - cb.sessionId, - n_trials, - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - - wait(dfs, timeout=5) - - assert list(map(lambda x: x.result(), dfs)) - - -@pytest.mark.nccl -@pytest.mark.parametrize("n_trials", [1, 5]) -def test_device_multicast_sendrecv(n_trials, client): - - cb = Comms(comms_p2p=True, verbose=True) - cb.init() - - dfs = [ - client.submit( - func_test_device_multicast_sendrecv, - cb.sessionId, - n_trials, - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - - wait(dfs, timeout=5) - - assert list(map(lambda x: x.result(), dfs)) diff --git a/python/raft/test/test_interruptible.py b/python/raft/test/test_interruptible.py deleted file mode 100644 index 81f4f99ed8..0000000000 --- a/python/raft/test/test_interruptible.py +++ /dev/null @@ -1,54 +0,0 @@ - -import os -import pytest -import signal -import time -from raft.common.interruptible import cuda_interruptible, cuda_yield - - -def send_ctrl_c(): - # signal.raise_signal(signal.SIGINT) available only since python 3.8 - os.kill(os.getpid(), signal.SIGINT) - - -def test_should_cancel_via_interruptible(): - start_time = time.monotonic() - with pytest.raises(RuntimeError, match='this thread was cancelled'): - with cuda_interruptible(): - send_ctrl_c() - cuda_yield() - time.sleep(1.0) - end_time = time.monotonic() - assert end_time < start_time + 0.5, \ - "The process seems to have waited, while it shouldn't have." 
- - -def test_should_cancel_via_python(): - start_time = time.monotonic() - with pytest.raises(KeyboardInterrupt): - send_ctrl_c() - cuda_yield() - time.sleep(1.0) - end_time = time.monotonic() - assert end_time < start_time + 0.5, \ - "The process seems to have waited, while it shouldn't have." - - -def test_should_wait_no_interrupt(): - start_time = time.monotonic() - with cuda_interruptible(): - cuda_yield() - time.sleep(1.0) - end_time = time.monotonic() - assert end_time > start_time + 0.5, \ - "The process seems to be cancelled, while it shouldn't be." - - -def test_should_wait_no_yield(): - start_time = time.monotonic() - with cuda_interruptible(): - send_ctrl_c() - time.sleep(1.0) - end_time = time.monotonic() - assert end_time > start_time + 0.5, \ - "The process seems to be cancelled, while it shouldn't be." diff --git a/python/raft/test/test_raft.py b/python/raft/test/test_raft.py deleted file mode 100644 index 9f0524e198..0000000000 --- a/python/raft/test/test_raft.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import pytest -import sys - -try: - import raft -except ImportError: - print("Skipping RAFT tests") - pytestmart = pytest.mark.skip - -pytestmark = pytest.mark.skipif( - 'raft' not in sys.argv, reason="marker to allow integration of RAFT" -) - - -def test_raft(): - assert raft.raft_include_test() diff --git a/python/setup.cfg b/python/setup.cfg deleted file mode 100644 index f6c096818b..0000000000 --- a/python/setup.cfg +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. - -[flake8] -exclude = __init__.py,versioneer.py -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. - -[versioneer] -VCS = git -style = pep440 -versionfile_source = raft/_version.py -versionfile_build = raft/_version.py -tag_prefix = v -parentdir_prefix = raft- - -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - nvtext - cudf - cuml - cugraph - dask_cudf - rmm -known_first_party= - raft -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py - -[options] -packages = find: -install_requires = - numpy - numba>=0.49 -python_requires = >=3.7,<3.9 diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index 10beca1eb4..0000000000 --- a/python/setup.py +++ /dev/null @@ -1,202 +0,0 @@ -# -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy -import os -import shutil -import sys -import sysconfig - -# Must import in this order: -# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext -# Otherwise, setuptools.command.build_ext ends up inheriting from -# Cython.Distutils.old_build_ext which we do not want -import setuptools - -try: - from Cython.Distutils.build_ext import new_build_ext as _build_ext -except ImportError: - from setuptools.command.build_ext import build_ext as _build_ext - -from distutils.sysconfig import get_python_lib - -import setuptools.command.build_ext -from setuptools import find_packages, setup -from setuptools.extension import Extension - -from setuputils import clean_folder -from setuputils import get_environment_option -from setuputils import get_cli_option - -from pathlib import Path - -import versioneer - - -############################################################################## -# - Dependencies include and lib folder setup -------------------------------- - -install_requires = [ - 'cython' -] - -cuda_home = get_environment_option("CUDA_HOME") - -clean_artifacts = get_cli_option('clean') -single_gpu_build = get_cli_option('--singlegpu') - - -if not cuda_home: - cuda_home = ( - os.popen('echo "$(dirname $(dirname $(which nvcc)))"').read().strip() - ) - print("-- Using nvcc to detect CUDA, found at " + str(cuda_home)) -cuda_include_dir = os.path.join(cuda_home, "include") -cuda_lib_dir = os.path.join(cuda_home, "lib64") - -############################################################################## -# - Clean target 
------------------------------------------------------------- - -if clean_artifacts: - print("-- Cleaning all Python and Cython build artifacts...") - - try: - setup_file_path = str(Path(__file__).parent.absolute()) - shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) - shutil.rmtree(setup_file_path + '/_external_repositories', - ignore_errors=True) - shutil.rmtree(setup_file_path + '/raft.egg-info', ignore_errors=True) - shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True) - - clean_folder(setup_file_path + '/raft') - shutil.rmtree(setup_file_path + '/build') - - except IOError: - pass - - # need to terminate script so cythonizing doesn't get triggered after - # cleanup unintendedly - sys.argv.remove("clean") - - if "--all" in sys.argv: - sys.argv.remove("--all") - - if len(sys.argv) == 1: - sys.exit(0) - - -############################################################################## -# - Cython extensions build and parameters ----------------------------------- - - -libs = ['cudart', "nccl", "cusolver", "cusparse", "cublas"] - -include_dirs = [cuda_include_dir, - numpy.get_include(), - "../cpp/include/", - os.path.dirname(sysconfig.get_path("include"))] - -extensions = [ - Extension("*", - sources=["raft/**/*.pyx"], - include_dirs=include_dirs, - library_dirs=[get_python_lib()], - runtime_library_dirs=[cuda_lib_dir, - os.path.join(os.sys.prefix, "lib")], - libraries=libs, - language='c++', - extra_compile_args=['-std=c++17']) -] - - -class build_ext_no_debug(_build_ext): - - def build_extensions(self): - def remove_flags(compiler, *flags): - for flag in flags: - try: - compiler.compiler_so = list( - filter((flag).__ne__, compiler.compiler_so) - ) - except Exception: - pass - - # Full optimization - self.compiler.compiler_so.append("-O3") - - # Ignore deprecation declaration warnings - self.compiler.compiler_so.append("-Wno-deprecated-declarations") - - # No debug symbols, full optimization, no '-Wstrict-prototypes' warning - 
remove_flags( - self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes" - ) - super().build_extensions() - - def finalize_options(self): - if self.distribution.ext_modules: - # Delay import this to allow for Cython-less installs - from Cython.Build.Dependencies import cythonize - - nthreads = getattr(self, "parallel", None) # -j option in Py3.5+ - nthreads = int(nthreads) if nthreads else None - self.distribution.ext_modules = cythonize( - self.distribution.ext_modules, - nthreads=nthreads, - force=self.force, - gdb_debug=False, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ) - # Skip calling super() and jump straight to setuptools - setuptools.command.build_ext.build_ext.finalize_options(self) - - -cmdclass = dict() -cmdclass.update(versioneer.get_cmdclass()) -cmdclass["build_ext"] = build_ext_no_debug - - -############################################################################## -# - Python package generation ------------------------------------------------ - - -setup(name='raft', - description="RAPIDS Analytics Frameworks Toolset", - version=versioneer.get_version(), - classifiers=[ - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7" - ], - author="NVIDIA Corporation", - setup_requires=['cython'], - ext_modules=extensions, - package_data=dict.fromkeys( - find_packages(include=["raft.dask.common", - "raft.dask.common.includes", - "raft.common", - "raft.common.includes"]), - ["*.hpp", "*.pxd"], - ), - packages=find_packages(include=['raft', 'raft.*']), - install_requires=install_requires, - license="Apache", - cmdclass=cmdclass, - zip_safe=False - ) diff --git a/python/setuputils.py b/python/setuputils.py deleted file mode 100755 index 61cb2da273..0000000000 --- a/python/setuputils.py +++ /dev/null @@ -1,65 +0,0 @@ -# -# Copyright (c) 2018-2020, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import glob -import os -import shutil -import sys - - -def get_environment_option(name): - ENV_VARIABLE = os.environ.get(name, False) - - if not ENV_VARIABLE: - print("-- " + name + " environment variable not set.") - - else: - print("-- " + name + " detected with value: " + str(ENV_VARIABLE)) - - return ENV_VARIABLE - - -def get_cli_option(name): - if name in sys.argv: - print("-- Detected " + str(name) + " build option.") - return True - - else: - return False - - -def clean_folder(path): - """ - Function to clean all Cython and Python artifacts and cache folders. It - clean the folder as well as its direct children recursively. - - Parameters - ---------- - path : String - Path to the folder to be cleaned. - """ - shutil.rmtree(path + '/__pycache__', ignore_errors=True) - - folders = glob.glob(path + '/*/') - for folder in folders: - shutil.rmtree(folder + '/__pycache__', ignore_errors=True) - - clean_folder(folder) - - cython_exts = glob.glob(folder + '/*.cpp') - cython_exts.extend(glob.glob(folder + '/*.cpython*')) - for file in cython_exts: - os.remove(file) diff --git a/python/versioneer.py b/python/versioneer.py deleted file mode 100644 index 64fea1c892..0000000000 --- a/python/versioneer.py +++ /dev/null @@ -1,1822 +0,0 @@ - -# Version: 0.18 - -"""The Versioneer - like a rocketeer, but for versions. - -The Versioneer -============== - -* like a rocketeer, but for versions! 
-* https://github.com/warner/python-versioneer -* Brian Warner -* License: Public Domain -* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy -* [![Latest Version] -(https://pypip.in/version/versioneer/badge.svg?style=flat) -](https://pypi.python.org/pypi/versioneer/) -* [![Build Status] -(https://travis-ci.org/warner/python-versioneer.png?branch=master) -](https://travis-ci.org/warner/python-versioneer) - -This is a tool for managing a recorded version number in distutils-based -python projects. The goal is to remove the tedious and error-prone "update -the embedded version string" step from your release process. Making a new -release should be as easy as recording a new tag in your version-control -system, and maybe making new tarballs. - - -## Quick Install - -* `pip install versioneer` to somewhere to your $PATH -* add a `[versioneer]` section to your setup.cfg (see below) -* run `versioneer install` in your source tree, commit the results - -## Version Identifiers - -Source trees come from a variety of places: - -* a version-control system checkout (mostly used by developers) -* a nightly tarball, produced by build automation -* a snapshot tarball, produced by a web-based VCS browser, like github's - "tarball from tag" feature -* a release tarball, produced by "setup.py sdist", distributed through PyPI - -Within each source tree, the version identifier (either a string or a number, -this tool is format-agnostic) can come from a variety of places: - -* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows - about recent "tags" and an absolute revision-id -* the name of the directory into which the tarball was unpacked -* an expanded VCS keyword ($Id$, etc) -* a `_version.py` created by some earlier build step - -For released software, the version identifier is closely related to a VCS -tag. Some projects use tag names that include more than just the version -string (e.g. 
"myproject-1.2" instead of just "1.2"), in which case the tool -needs to strip the tag prefix to extract the version identifier. For -unreleased software (between tags), the version identifier should provide -enough information to help developers recreate the same tree, while also -giving them an idea of roughly how old the tree is (after version 1.2, before -version 1.3). Many VCS systems can report a description that captures this, -for example `git describe --tags --dirty --always` reports things like -"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the -0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes. - -The version identifier is used for multiple purposes: - -* to allow the module to self-identify its version: `myproject.__version__` -* to choose a name and prefix for a 'setup.py sdist' tarball - -## Theory of Operation - -Versioneer works by adding a special `_version.py` file into your source -tree, where your `__init__.py` can import it. This `_version.py` knows how to -dynamically ask the VCS tool for version information at import time. - -`_version.py` also contains `$Revision$` markers, and the installation -process marks `_version.py` to have this marker rewritten with a tag name -during the `git archive` command. As a result, generated tarballs will -contain enough information to get the proper version. - -To allow `setup.py` to compute a version too, a `versioneer.py` is added to -the top level of your source tree, next to `setup.py` and the `setup.cfg` -that configures it. This overrides several distutils/setuptools commands to -compute the version when invoked, and changes `setup.py build` and `setup.py -sdist` to replace `_version.py` with a small static file that contains just -the generated version data. - -## Installation - -See [INSTALL.md](./INSTALL.md) for detailed installation instructions. 
- -## Version-String Flavors - -Code which uses Versioneer can learn about its version string at runtime by -importing `_version` from your main `__init__.py` file and running the -`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can -import the top-level `versioneer.py` and run `get_versions()`. - -Both functions return a dictionary with different flavors of version -information: - -* `['version']`: A condensed version string, rendered using the selected - style. This is the most commonly used value for the project's version - string. The default "pep440" style yields strings like `0.11`, - `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section - below for alternative styles. - -* `['full-revisionid']`: detailed revision identifier. For Git, this is the - full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". - -* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the - commit date in ISO 8601 format. This will be None if the date is not - available. - -* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that - this is only accurate if run in a VCS checkout, otherwise it is likely to - be False or None - -* `['error']`: if the version string could not be computed, this will be set - to a string describing the problem, otherwise it will be None. It may be - useful to throw an exception in setup.py if this is set, to avoid e.g. - creating tarballs with a version string of "unknown". - -Some variants are more useful than others. Including `full-revisionid` in a -bug report should allow developers to reconstruct the exact code being tested -(or indicate the presence of local changes that should be shared with the -developers). `version` is suitable for display in an "about" box or a CLI -`--version` output: it can be easily compared against release notes and lists -of bugs fixed in various releases. 
- -The installer adds the following text to your `__init__.py` to place a basic -version in `YOURPROJECT.__version__`: - - from ._version import get_versions - __version__ = get_versions()['version'] - del get_versions - -## Styles - -The setup.cfg `style=` configuration controls how the VCS information is -rendered into a version string. - -The default style, "pep440", produces a PEP440-compliant string, equal to the -un-prefixed tag name for actual releases, and containing an additional "local -version" section with more detail for in-between builds. For Git, this is -TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags ---dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the -tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and -that this commit is two revisions ("+2") beyond the "0.11" tag. For released -software (exactly equal to a known tag), the identifier will only contain the -stripped tag, e.g. "0.11". - -Other styles are available. See [details.md](details.md) in the Versioneer -source tree for descriptions. - -## Debugging - -Versioneer tries to avoid fatal errors: if something goes wrong, it will tend -to return a version of "0+unknown". To investigate the problem, run `setup.py -version`, which will run the version-lookup code in a verbose mode, and will -display the full contents of `get_versions()` (including the `error` string, -which may help identify what went wrong). - -## Known Limitations - -Some situations are known to cause problems for Versioneer. This details the -most significant ones. More can be found on Github -[issues page](https://github.com/warner/python-versioneer/issues). - -### Subprojects - -Versioneer has limited support for source trees in which `setup.py` is not in -the root directory (e.g. `setup.py` and `.git/` are *not* siblings). 
The are -two common reasons why `setup.py` might not be in the root: - -* Source trees which contain multiple subprojects, such as - [Buildbot](https://github.com/buildbot/buildbot), which contains both - "master" and "slave" subprojects, each with their own `setup.py`, - `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI - distributions (and upload multiple independently-installable tarballs). -* Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other langauges) in subdirectories. - -Versioneer will look for `.git` in parent directories, and most operations -should get the right version string. However `pip` and `setuptools` have bugs -and implementation details which frequently cause `pip install .` from a -subproject directory to fail to find a correct version string (so it usually -defaults to `0+unknown`). - -`pip install --editable .` should work correctly. `setup.py install` might -work too. - -Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in -some later version. - -[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking -this issue. The discussion in -[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the -issue from the Versioneer side in more detail. -[pip PR#3176](https://github.com/pypa/pip/pull/3176) and -[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve -pip to let Versioneer work correctly. - -Versioneer-0.16 and earlier only looked for a `.git` directory next to the -`setup.cfg`, so subprojects were completely unsupported with those releases. - -### Editable installs with setuptools <= 18.5 - -`setup.py develop` and `pip install --editable .` allow you to install a -project into a virtualenv once, then continue editing the source code (and -test) without re-installing after every change. 
- -"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a -convenient way to specify executable scripts that should be installed along -with the python package. - -These both work as expected when using modern setuptools. When using -setuptools-18.5 or earlier, however, certain operations will cause -`pkg_resources.DistributionNotFound` errors when running the entrypoint -script, which must be resolved by re-installing the package. This happens -when the install happens with one version, then the egg_info data is -regenerated while a different version is checked out. Many setup.py commands -cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into -a different virtualenv), so this can be surprising. - -[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes -this one, but upgrading to a newer version of setuptools should probably -resolve it. - -### Unicode version strings - -While Versioneer works (and is continually tested) with both Python 2 and -Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. -Newer releases probably generate unicode version strings on py2. It's not -clear that this is wrong, but it may be surprising for applications when then -write these strings to a network connection or include them in bytes-oriented -APIs like cryptographic checksums. - -[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates -this question. - - -## Updating Versioneer - -To upgrade your project to a new release of Versioneer, do the following: - -* install the new Versioneer (`pip install -U versioneer` or equivalent) -* edit `setup.cfg`, if necessary, to include any new configuration settings - indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. 
-* re-run `versioneer install` in your source tree, to replace - `SRC/_version.py` -* commit any changed files - -## Future Directions - -This tool is designed to make it easily extended to other version-control -systems: all VCS-specific components are in separate directories like -src/git/ . The top-level `versioneer.py` script is assembled from these -components by running make-versioneer.py . In the future, make-versioneer.py -will take a VCS name as an argument, and will construct a version of -`versioneer.py` that is specific to the given VCS. It might also take the -configuration arguments that are currently provided manually during -installation by editing setup.py . Alternatively, it might go the other -direction and include code from all supported VCS systems, reducing the -number of intermediate scripts. - - -## License - -To make Versioneer easier to embed, all its code is dedicated to the public -domain. The `_version.py` that it creates is also in the public domain. -Specifically, both are released under the Creative Commons "Public Domain -Dedication" license (CC0-1.0), as described in -https://creativecommons.org/publicdomain/zero/1.0/ . - -""" - -from __future__ import print_function -try: - import configparser -except ImportError: - import ConfigParser as configparser -import errno -import json -import os -import re -import subprocess -import sys - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_root(): - """Get the project root directory. - - We require that all commands are run from the project root, i.e. the - directory that contains setup.py, setup.cfg, and versioneer.py . 
- """ - root = os.path.realpath(os.path.abspath(os.getcwd())) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - # allow 'python path/to/setup.py COMMAND' - root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") - raise VersioneerBadRootError(err) - try: - # Certain runtime workflows (setup.py install/develop in a setuptools - # tree) execute all dependencies in a single python process, so - # "versioneer" may be imported multiple times, and python's shared - # module-import table will cache the first one. So we can't use - # os.path.dirname(__file__), as that will find whichever - # versioneer.py was first imported, even in later projects. - me = os.path.realpath(os.path.abspath(__file__)) - me_dir = os.path.normcase(os.path.splitext(me)[0]) - vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) - if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) - except NameError: - pass - return root - - -def get_config_from_root(root): - """Read the project setup.cfg file to determine Versioneer config.""" - # This might raise EnvironmentError (if setup.cfg is missing), or - # configparser.NoSectionError (if it lacks a [versioneer] section), or - # configparser.NoOptionError (if it lacks "VCS="). See the docstring at - # the top of versioneer.py for instructions on writing your setup.cfg . 
- setup_cfg = os.path.join(root, "setup.cfg") - parser = configparser.SafeConfigParser() - with open(setup_cfg, "r") as f: - parser.readfp(f) - VCS = parser.get("versioneer", "VCS") # mandatory - - def get(parser, name): - if parser.has_option("versioneer", name): - return parser.get("versioneer", name) - return None - cfg = VersioneerConfig() - cfg.VCS = VCS - cfg.style = get(parser, "style") or "" - cfg.versionfile_source = get(parser, "versionfile_source") - cfg.versionfile_build = get(parser, "versionfile_build") - cfg.tag_prefix = get(parser, "tag_prefix") - if cfg.tag_prefix in ("''", '""'): - cfg.tag_prefix = "" - cfg.parentdir_prefix = get(parser, "parentdir_prefix") - cfg.verbose = get(parser, "verbose") - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -# these dictionaries contain VCS-specific tools -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = p.communicate()[0].strip() - if 
sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode - - -LONG_VERSION_PY['git'] = ''' -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" - git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" - git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "%(STYLE)s" - cfg.tag_prefix = "%(TAG_PREFIX)s" - cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" - cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %%s" %% dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %%s" %% (commands,)) - return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = 
stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %%s (error)" %% dispcmd) - print("stdout was %%s" %% stdout) - return None, p.returncode - return stdout, p.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %%s but none started with prefix %%s" %% - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %%d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%%s', no digits" %% ",".join(refs - tags)) - if verbose: - print("likely tags: %%s" %% ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %%s" %% r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %%s not under git control" %% root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%%s*" %% tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%%s'" - %% describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%%s' doesn't start with prefix '%%s'" - print(fmt %% (full_tag, tag_prefix)) - pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" - %% (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], - cwd=root)[0].strip() - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%%d" %% pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%%d" %% pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%%s'" %% style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} -''' - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. 
However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. 
"2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse 
describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def do_vcs_install(manifest_in, versionfile_source, ipy): - """Git-specific installation logic for Versioneer. - - For Git, this means creating/changing .gitattributes to mark _version.py - for export-subst keyword substitution. 
- """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - files = [manifest_in, versionfile_source] - if ipy: - files.append(ipy) - try: - me = __file__ - if me.endswith(".pyc") or me.endswith(".pyo"): - me = os.path.splitext(me)[0] + ".py" - versioneer_file = os.path.relpath(me) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) - present = False - try: - f = open(".gitattributes", "r") - for line in f.readlines(): - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - f.close() - except EnvironmentError: - pass - if not present: - f = open(".gitattributes", "a+") - f.write("%s export-subst\n" % versionfile_source) - f.close() - files.append(".gitattributes") - run_command(GITS, ["add", "--"] + files) - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.18) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. 
- -import json - -version_json = ''' -%s -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) -""" - - -def versions_from_file(filename): - """Try to determine the version from _version.py if present.""" - try: - with open(filename) as f: - contents = f.read() - except EnvironmentError: - raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) - if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) - if not mo: - raise NotThisMethod("no version_json in _version.py") - return json.loads(mo.group(1)) - - -def write_to_version_file(filename, versions): - """Write the given version number to the given _version.py file.""" - os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) - with open(filename, "w") as f: - f.write(SHORT_VERSION_PY % contents) - - print("set %s to '%s'" % (filename, versions["version"])) - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -class VersioneerBadRootError(Exception): - """The project root directory is unknown or missing key files.""" - - -def get_versions(verbose=False): - """Get the project version from whatever source is available. - - Returns dict with two keys: 'version' and 'full'. 
- """ - if "versioneer" in sys.modules: - # see the discussion in cmdclass.py:get_cmdclass() - del sys.modules["versioneer"] - - root = get_root() - cfg = get_config_from_root(root) - - assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" - handlers = HANDLERS.get(cfg.VCS) - assert handlers, "unrecognized VCS '%s'" % cfg.VCS - verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" - assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" - - versionfile_abs = os.path.join(root, cfg.versionfile_source) - - # extract version from first of: _version.py, VCS command (e.g. 'git - # describe'), parentdir. This is meant to work for developers using a - # source checkout, for users of a tarball created by 'setup.py sdist', - # and for users of a tarball/zipball created by 'git archive' or github's - # download-from-tag feature or the equivalent in other VCSes. - - get_keywords_f = handlers.get("get_keywords") - from_keywords_f = handlers.get("keywords") - if get_keywords_f and from_keywords_f: - try: - keywords = get_keywords_f(versionfile_abs) - ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) - if verbose: - print("got version from expanded keyword %s" % ver) - return ver - except NotThisMethod: - pass - - try: - ver = versions_from_file(versionfile_abs) - if verbose: - print("got version from file %s %s" % (versionfile_abs, ver)) - return ver - except NotThisMethod: - pass - - from_vcs_f = handlers.get("pieces_from_vcs") - if from_vcs_f: - try: - pieces = from_vcs_f(cfg.tag_prefix, root, verbose) - ver = render(pieces, cfg.style) - if verbose: - print("got version from VCS %s" % ver) - return ver - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - if verbose: - print("got version from parentdir %s" % ver) - return ver - except NotThisMethod: - pass - - if verbose: - 
print("unable to compute version") - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} - - -def get_version(): - """Get the short version string for this project.""" - return get_versions()["version"] - - -def get_cmdclass(): - """Get the custom setuptools/distutils subclasses used by Versioneer.""" - if "versioneer" in sys.modules: - del sys.modules["versioneer"] - # this fixes the "python setup.py develop" case (also 'install' and - # 'easy_install .'), in which subdependencies of the main project are - # built (using setup.py bdist_egg) in the same python process. Assume - # a main project A and a dependency B, which use different versions - # of Versioneer. A's setup.py imports A's Versioneer, leaving it in - # sys.modules by the time B's setup.py is executed, causing B to run - # with the wrong versioneer. Setuptools wraps the sub-dep builds in a - # sandbox that restores sys.modules to it's pre-build state, so the - # parent is protected against the child's "import versioneer". By - # removing ourselves from sys.modules here, before the child build - # happens, we protect the child from the parent's versioneer too. 
- # Also see https://github.com/warner/python-versioneer/issues/52 - - cmds = {} - - # we add "version" to both distutils and setuptools - from distutils.core import Command - - class cmd_version(Command): - description = "report generated version string" - user_options = [] - boolean_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - vers = get_versions(verbose=True) - print("Version: %s" % vers["version"]) - print(" full-revisionid: %s" % vers.get("full-revisionid")) - print(" dirty: %s" % vers.get("dirty")) - print(" date: %s" % vers.get("date")) - if vers["error"]: - print(" error: %s" % vers["error"]) - cmds["version"] = cmd_version - - # we override "build_py" in both distutils and setuptools - # - # most invocation pathways end up running build_py: - # distutils/build -> build_py - # distutils/install -> distutils/build ->.. - # setuptools/bdist_wheel -> distutils/install ->.. - # setuptools/bdist_egg -> distutils/install_lib -> build_py - # setuptools/install -> bdist_egg ->.. - # setuptools/develop -> ? - # pip install: - # copies source tree to a tempdir before running egg_info/etc - # if .git isn't copied too, 'git describe' will fail - # then does setup.py bdist_wheel, or sometimes setup.py install - # setup.py egg_info -> ? 
- - # we override different "build_py" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.build_py import build_py as _build_py - else: - from distutils.command.build_py import build_py as _build_py - - class cmd_build_py(_build_py): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_py.run(self) - # now locate _version.py in the new build/ directory and replace - # it with an updated value - if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - cmds["build_py"] = cmd_build_py - - if "cx_Freeze" in sys.modules: # cx_freeze enabled? - from cx_Freeze.dist import build_exe as _build_exe - # nczeczulin reports that py2exe won't like the pep440-style string - # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. - # setup(console=[{ - # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION - # "product_version": versioneer.get_version(), - # ... - - class cmd_build_exe(_build_exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _build_exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - cmds["build_exe"] = cmd_build_exe - del cmds["build_py"] - - if 'py2exe' in sys.modules: # py2exe enabled? 
- try: - from py2exe.distutils_buildexe import py2exe as _py2exe # py3 - except ImportError: - from py2exe.build_exe import py2exe as _py2exe # py2 - - class cmd_py2exe(_py2exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _py2exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - cmds["py2exe"] = cmd_py2exe - - # we override different "sdist" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.sdist import sdist as _sdist - else: - from distutils.command.sdist import sdist as _sdist - - class cmd_sdist(_sdist): - def run(self): - versions = get_versions() - self._versioneer_generated_versions = versions - # unless we update this, the command will keep using the old - # version - self.distribution.metadata.version = versions["version"] - return _sdist.run(self) - - def make_release_tree(self, base_dir, files): - root = get_root() - cfg = get_config_from_root(root) - _sdist.make_release_tree(self, base_dir, files) - # now locate _version.py in the new base_dir directory - # (remembering that it may be a hardlink) and replace it with an - # updated value - target_versionfile = os.path.join(base_dir, cfg.versionfile_source) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) - cmds["sdist"] = cmd_sdist - - return cmds - - -CONFIG_ERROR = """ -setup.cfg is missing the necessary Versioneer configuration. 
You need -a section like: - - [versioneer] - VCS = git - style = pep440 - versionfile_source = src/myproject/_version.py - versionfile_build = myproject/_version.py - tag_prefix = - parentdir_prefix = myproject- - -You will also need to edit your setup.py to use the results: - - import versioneer - setup(version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), ...) - -Please read the docstring in ./versioneer.py for configuration instructions, -edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. -""" - -SAMPLE_CONFIG = """ -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. - -[versioneer] -#VCS = git -#style = pep440 -#versionfile_source = -#versionfile_build = -#tag_prefix = -#parentdir_prefix = - -""" - -INIT_PY_SNIPPET = """ -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions -""" - - -def do_setup(): - """Main VCS-independent setup function for installing Versioneer.""" - root = get_root() - try: - cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: - if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) - with open(os.path.join(root, "setup.cfg"), "a") as f: - f.write(SAMPLE_CONFIG) - print(CONFIG_ERROR, file=sys.stderr) - return 1 - - print(" creating %s" % cfg.versionfile_source) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") - if os.path.exists(ipy): - try: - with open(ipy, "r") as f: - old = f.read() - 
except EnvironmentError: - old = "" - if INIT_PY_SNIPPET not in old: - print(" appending to %s" % ipy) - with open(ipy, "a") as f: - f.write(INIT_PY_SNIPPET) - else: - print(" %s unmodified" % ipy) - else: - print(" %s doesn't exist, ok" % ipy) - ipy = None - - # Make sure both the top-level "versioneer.py" and versionfile_source - # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to - # install the package without this. - manifest_in = os.path.join(root, "MANIFEST.in") - simple_includes = set() - try: - with open(manifest_in, "r") as f: - for line in f: - if line.startswith("include "): - for include in line.split()[1:]: - simple_includes.add(include) - except EnvironmentError: - pass - # That doesn't cover everything MANIFEST.in can do - # (http://docs.python.org/2/distutils/sourcedist.html#commands), so - # it might give some false negatives. Appending redundant 'include' - # lines is safe, though. - if "versioneer.py" not in simple_includes: - print(" appending 'versioneer.py' to MANIFEST.in") - with open(manifest_in, "a") as f: - f.write("include versioneer.py\n") - else: - print(" 'versioneer.py' already in MANIFEST.in") - if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) - with open(manifest_in, "a") as f: - f.write("include %s\n" % cfg.versionfile_source) - else: - print(" versionfile_source already in MANIFEST.in") - - # Make VCS-specific changes. For git, this means creating/changing - # .gitattributes to mark _version.py for export-subst keyword - # substitution. 
- do_vcs_install(manifest_in, cfg.versionfile_source, ipy) - return 0 - - -def scan_setup_py(): - """Validate the contents of setup.py against Versioneer's expectations.""" - found = set() - setters = False - errors = 0 - with open("setup.py", "r") as f: - for line in f.readlines(): - if "import versioneer" in line: - found.add("import") - if "versioneer.get_cmdclass()" in line: - found.add("cmdclass") - if "versioneer.get_version()" in line: - found.add("get_version") - if "versioneer.VCS" in line: - setters = True - if "versioneer.versionfile_source" in line: - setters = True - if len(found) != 3: - print("") - print("Your setup.py appears to be missing some important items") - print("(but I might be wrong). Please make sure it has something") - print("roughly like the following:") - print("") - print(" import versioneer") - print(" setup( version=versioneer.get_version(),") - print(" cmdclass=versioneer.get_cmdclass(), ...)") - print("") - errors += 1 - if setters: - print("You should remove lines like 'versioneer.VCS = ' and") - print("'versioneer.versionfile_source = ' . This configuration") - print("now lives in setup.cfg, and should be removed from setup.py") - print("") - errors += 1 - return errors - - -if __name__ == "__main__": - cmd = sys.argv[1] - if cmd == "setup": - errors = do_setup() - errors += scan_setup_py() - if errors: - sys.exit(1) From 54cac54a97fd8d4cda9466f449b9fb4636091a08 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Mar 2022 20:07:36 -0500 Subject: [PATCH 002/167] Adding distance --- python/.flake8 | 9 + python/.flake8.cython | 28 + python/pylibraft/pylibraft/__init__.py | 14 + python/pylibraft/pylibraft/_version.py | 567 +++++ .../pylibraft/pylibraft/common/__init__.pxd | 14 + python/pylibraft/pylibraft/common/__init__.py | 14 + python/pylibraft/pylibraft/common/cuda.pxd | 22 + python/pylibraft/pylibraft/common/cuda.pyx | 84 + python/pylibraft/pylibraft/common/handle.pxd | 47 + python/pylibraft/pylibraft/common/handle.pyx | 90 + .../pylibraft/common/interruptible.pxd | 34 + .../pylibraft/common/interruptible.pyx | 84 + .../pylibraft/pylibraft/distance/__init__.pxd | 14 + .../pylibraft/pylibraft/distance/__init__.py | 14 + .../pylibraft/pylibraft/distance/distance.pxd | 14 + python/pylibraft/pylibraft/test/__init__.py | 14 + python/pylibraft/setup.cfg | 58 + python/pylibraft/setup.py | 198 ++ python/pylibraft/setuputils.py | 65 + python/pylibraft/versioneer.py | 1822 +++++++++++++++++ python/raft/pytest.ini | 8 + python/raft/raft/__init__.py | 16 + python/raft/raft/_version.py | 567 +++++ python/raft/raft/common/__init__.pxd | 0 python/raft/raft/common/__init__.py | 17 + python/raft/raft/common/cuda.pxd | 22 + python/raft/raft/common/cuda.pyx | 84 + python/raft/raft/common/handle.pxd | 48 + python/raft/raft/common/handle.pyx | 90 + python/raft/raft/common/interruptible.pxd | 34 + python/raft/raft/common/interruptible.pyx | 84 + python/raft/raft/dask/__init__.py | 16 + python/raft/raft/dask/common/__init__.py | 34 + python/raft/raft/dask/common/comms.py | 648 ++++++ python/raft/raft/dask/common/comms_utils.pyx | 313 +++ python/raft/raft/dask/common/nccl.pyx | 253 +++ python/raft/raft/dask/common/ucx.py | 93 + python/raft/raft/dask/common/utils.py | 39 + python/raft/raft/include_test/__init__.py | 16 + .../raft/include_test/raft_include_test.pyx | 19 + python/raft/raft/test/__init__.py | 14 + python/raft/raft/test/conftest.py | 49 + 
python/raft/raft/test/test_comms.py | 336 +++ python/raft/raft/test/test_interruptible.py | 54 + python/raft/raft/test/test_raft.py | 31 + python/raft/record.txt | 44 + python/raft/setup.cfg | 58 + python/raft/setup.py | 202 ++ python/raft/setuputils.py | 65 + python/raft/versioneer.py | 1822 +++++++++++++++++ 50 files changed, 8282 insertions(+) create mode 100644 python/.flake8 create mode 100644 python/.flake8.cython create mode 100644 python/pylibraft/pylibraft/__init__.py create mode 100644 python/pylibraft/pylibraft/_version.py create mode 100644 python/pylibraft/pylibraft/common/__init__.pxd create mode 100644 python/pylibraft/pylibraft/common/__init__.py create mode 100644 python/pylibraft/pylibraft/common/cuda.pxd create mode 100644 python/pylibraft/pylibraft/common/cuda.pyx create mode 100644 python/pylibraft/pylibraft/common/handle.pxd create mode 100644 python/pylibraft/pylibraft/common/handle.pyx create mode 100644 python/pylibraft/pylibraft/common/interruptible.pxd create mode 100644 python/pylibraft/pylibraft/common/interruptible.pyx create mode 100644 python/pylibraft/pylibraft/distance/__init__.pxd create mode 100644 python/pylibraft/pylibraft/distance/__init__.py create mode 100644 python/pylibraft/pylibraft/distance/distance.pxd create mode 100644 python/pylibraft/pylibraft/test/__init__.py create mode 100644 python/pylibraft/setup.cfg create mode 100644 python/pylibraft/setup.py create mode 100755 python/pylibraft/setuputils.py create mode 100644 python/pylibraft/versioneer.py create mode 100644 python/raft/pytest.ini create mode 100644 python/raft/raft/__init__.py create mode 100644 python/raft/raft/_version.py create mode 100644 python/raft/raft/common/__init__.pxd create mode 100644 python/raft/raft/common/__init__.py create mode 100644 python/raft/raft/common/cuda.pxd create mode 100644 python/raft/raft/common/cuda.pyx create mode 100644 python/raft/raft/common/handle.pxd create mode 100644 python/raft/raft/common/handle.pyx create mode 
100644 python/raft/raft/common/interruptible.pxd create mode 100644 python/raft/raft/common/interruptible.pyx create mode 100644 python/raft/raft/dask/__init__.py create mode 100644 python/raft/raft/dask/common/__init__.py create mode 100644 python/raft/raft/dask/common/comms.py create mode 100644 python/raft/raft/dask/common/comms_utils.pyx create mode 100644 python/raft/raft/dask/common/nccl.pyx create mode 100644 python/raft/raft/dask/common/ucx.py create mode 100644 python/raft/raft/dask/common/utils.py create mode 100644 python/raft/raft/include_test/__init__.py create mode 100644 python/raft/raft/include_test/raft_include_test.pyx create mode 100644 python/raft/raft/test/__init__.py create mode 100644 python/raft/raft/test/conftest.py create mode 100644 python/raft/raft/test/test_comms.py create mode 100644 python/raft/raft/test/test_interruptible.py create mode 100644 python/raft/raft/test/test_raft.py create mode 100644 python/raft/record.txt create mode 100644 python/raft/setup.cfg create mode 100644 python/raft/setup.py create mode 100755 python/raft/setuputils.py create mode 100644 python/raft/versioneer.py diff --git a/python/.flake8 b/python/.flake8 new file mode 100644 index 0000000000..ef2e5a8495 --- /dev/null +++ b/python/.flake8 @@ -0,0 +1,9 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +[flake8] +exclude = __init__.py +ignore = + # line break before binary operator + W503 + # whitespace before : + E203 \ No newline at end of file diff --git a/python/.flake8.cython b/python/.flake8.cython new file mode 100644 index 0000000000..3cd436d3f3 --- /dev/null +++ b/python/.flake8.cython @@ -0,0 +1,28 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +[flake8] +filename = *.pyx, *.pxd +exclude = *.egg, build, docs, .git +ignore = E999, E225, E226, E227, W503, W504 + +# Rules ignored: +# E999: invalid syntax (works for Python, not Cython) +# E225: Missing whitespace around operators (breaks cython casting syntax like ) +# E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) +# E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) +# W503: line break before binary operator (breaks lines that start with a pointer) +# W504: line break after binary operator (breaks lines that end with a pointer) diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/python/pylibraft/pylibraft/_version.py b/python/pylibraft/pylibraft/_version.py new file mode 100644 index 0000000000..58cd44da3b --- /dev/null +++ b/python/pylibraft/pylibraft/_version.py @@ -0,0 +1,567 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "pylibraft-" + cfg.versionfile_source = "pylibraft/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s 
(error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r"\d", r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command( + GITS, ["rev-list", "HEAD", "--count"], cwd=root + ) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. 
Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/python/pylibraft/pylibraft/common/__init__.pxd b/python/pylibraft/pylibraft/common/__init__.pxd new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/common/__init__.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/common/__init__.py b/python/pylibraft/pylibraft/common/__init__.py new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/common/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/common/cuda.pxd b/python/pylibraft/pylibraft/common/cuda.pxd new file mode 100644 index 0000000000..ae6246dee1 --- /dev/null +++ b/python/pylibraft/pylibraft/common/cuda.pxd @@ -0,0 +1,22 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from cuda.ccudart cimport cudaStream_t + +cdef class Stream: + cdef cudaStream_t s + + cdef cudaStream_t getStream(self) diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx new file mode 100644 index 0000000000..eb48f64cf1 --- /dev/null +++ b/python/pylibraft/pylibraft/common/cuda.pyx @@ -0,0 +1,84 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from cuda.ccudart cimport( + cudaStream_t, + cudaError_t, + cudaSuccess, + cudaStreamCreate, + cudaStreamDestroy, + cudaStreamSynchronize, + cudaGetLastError, + cudaGetErrorString, + cudaGetErrorName +) + + +class CudaRuntimeError(RuntimeError): + def __init__(self, extraMsg=None): + cdef cudaError_t e = cudaGetLastError() + cdef bytes errMsg = cudaGetErrorString(e) + cdef bytes errName = cudaGetErrorName(e) + msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode()) + if extraMsg is not None: + msg += " extraMsg='%s'" % extraMsg + super(CudaRuntimeError, self).__init__(msg) + + +cdef class Stream: + """ + Stream represents a thin-wrapper around cudaStream_t and its operations. + + Examples + -------- + + .. code-block:: python + + from raft.common.cuda import Stream + stream = Stream() + stream.sync() + del stream # optional! + """ + def __cinit__(self): + cdef cudaStream_t stream + cdef cudaError_t e = cudaStreamCreate(&stream) + if e != cudaSuccess: + raise CudaRuntimeError("Stream create") + self.s = stream + + def __dealloc__(self): + self.sync() + cdef cudaError_t e = cudaStreamDestroy(self.s) + if e != cudaSuccess: + raise CudaRuntimeError("Stream destroy") + + def sync(self): + """ + Synchronize on the cudastream owned by this object. 
Note that this + could raise exception due to issues with previous asynchronous + launches + """ + cdef cudaError_t e = cudaStreamSynchronize(self.s) + if e != cudaSuccess: + raise CudaRuntimeError("Stream sync") + + cdef cudaStream_t getStream(self): + return self.s diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd new file mode 100644 index 0000000000..ed8b11dca0 --- /dev/null +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -0,0 +1,47 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libcpp.memory cimport shared_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.cuda_stream_pool cimport cuda_stream_pool +from libcpp.memory cimport shared_ptr +from libcpp.memory cimport unique_ptr + +cdef extern from "raft/mr/device/allocator.hpp" \ + namespace "raft::mr::device" nogil: + cdef cppclass allocator: + pass + +cdef extern from "raft/handle.hpp" namespace "raft" nogil: + cdef cppclass handle_t: + handle_t() except + + handle_t(cuda_stream_view stream_view) except + + handle_t(cuda_stream_view stream_view, + shared_ptr[cuda_stream_pool] stream_pool) except + + void set_device_allocator(shared_ptr[allocator] a) except + + shared_ptr[allocator] get_device_allocator() except + + cuda_stream_view get_stream() except + + void sync_stream() except + + +cdef class Handle: + cdef unique_ptr[handle_t] c_obj + cdef shared_ptr[cuda_stream_pool] stream_pool + cdef int n_streams diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx new file mode 100644 index 0000000000..f4db60f794 --- /dev/null +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -0,0 +1,90 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +# import raft +from libcpp.memory cimport shared_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread +from rmm._lib.cuda_stream_view cimport cuda_stream_view + +from .cuda cimport Stream +from .cuda import CudaRuntimeError + + +cdef class Handle: + """ + Handle is a lightweight python wrapper around the corresponding C++ class + of handle_t exposed by RAFT's C++ interface. Refer to the header file + raft/handle.hpp for interface level details of this struct + + Examples + -------- + + .. code-block:: python + + from raft.common import Stream, Handle + stream = Stream() + handle = Handle(stream) + + # call algos here + + # final sync of all work launched in the stream of this handle + # this is same as `raft.cuda.Stream.sync()` call, but safer in case + # the default stream inside the `handle_t` is being used + handle.sync() + del handle # optional! + """ + + def __cinit__(self, stream: Stream = None, n_streams=0): + self.n_streams = n_streams + if n_streams > 0: + self.stream_pool.reset(new cuda_stream_pool(n_streams)) + + cdef cuda_stream_view c_stream + if stream is None: + # this constructor will construct a "main" handle on + # per-thread default stream, which is non-blocking + self.c_obj.reset(new handle_t(cuda_stream_per_thread, + self.stream_pool)) + else: + # this constructor constructs a handle on user stream + c_stream = cuda_stream_view(stream.getStream()) + self.c_obj.reset(new handle_t(c_stream, + self.stream_pool)) + + def sync(self): + """ + Issues a sync on the stream set for this handle. 
+ """ + self.c_obj.get()[0].sync_stream() + + def getHandle(self): + return self.c_obj.get() + + def __getstate__(self): + return self.n_streams + + def __setstate__(self, state): + self.n_streams = state + if self.n_streams > 0: + self.stream_pool.reset(new cuda_stream_pool(self.n_streams)) + + self.c_obj.reset(new handle_t(cuda_stream_per_thread, + self.stream_pool)) diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd new file mode 100644 index 0000000000..cb639c0f72 --- /dev/null +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -0,0 +1,34 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libcpp.memory cimport shared_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_view + +cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: + cdef cppclass interruptible: + void cancel() + +cdef extern from "raft/interruptible.hpp" \ + namespace "raft::interruptible" nogil: + cdef void inter_synchronize \ + "raft::interruptible::synchronize"(cuda_stream_view stream) except+ + cdef void inter_yield "raft::interruptible::yield"() except+ + cdef shared_ptr[interruptible] get_token() except+ diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx new file mode 100644 index 0000000000..4dd337649b --- /dev/null +++ b/python/pylibraft/pylibraft/common/interruptible.pyx @@ -0,0 +1,84 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import contextlib +import signal +from cython.operator cimport dereference + +from rmm._lib.cuda_stream_view cimport cuda_stream_view +from cuda.ccudart cimport cudaStream_t +from .cuda cimport Stream + + +@contextlib.contextmanager +def cuda_interruptible(): + ''' + Temporarily install a keyboard interrupt handler (Ctrl+C) + that cancels the enclosed interruptible C++ thread. + + Use this on a long-running C++ function imported via cython: + + .. code-block:: python + + with cuda_interruptible(): + my_long_running_function(...) + + It's also recommended to release the GIL during the call, to + make sure the handler has a chance to run: + + .. code-block:: python + + with cuda_interruptible(): + with nogil: + my_long_running_function(...) + + ''' + cdef shared_ptr[interruptible] token = get_token() + + def newhr(*args, **kwargs): + with nogil: + dereference(token).cancel() + + oldhr = signal.signal(signal.SIGINT, newhr) + try: + yield + finally: + signal.signal(signal.SIGINT, oldhr) + + +def synchronize(stream: Stream): + ''' + Same as cudaStreamSynchronize, but can be interrupted + if called within a `with cuda_interruptible()` block. + ''' + cdef cuda_stream_view c_stream = cuda_stream_view(stream.getStream()) + with nogil: + inter_synchronize(c_stream) + + +def cuda_yield(): + ''' + Check for an asynchronously received interrupted_exception. + Raises the exception if a user pressed Ctrl+C within a + `with cuda_interruptible()` block before. + ''' + with nogil: + inter_yield() diff --git a/python/pylibraft/pylibraft/distance/__init__.pxd b/python/pylibraft/pylibraft/distance/__init__.pxd new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/distance/__init__.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/distance/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/distance/distance.pxd b/python/pylibraft/pylibraft/distance/distance.pxd new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/distance/distance.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/test/__init__.py b/python/pylibraft/pylibraft/test/__init__.py new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/test/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg new file mode 100644 index 0000000000..163304670a --- /dev/null +++ b/python/pylibraft/setup.cfg @@ -0,0 +1,58 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +[flake8] +exclude = __init__.py,versioneer.py +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+
+[versioneer]
+VCS = git
+style = pep440
+versionfile_source = pylibraft/_version.py
+versionfile_build = pylibraft/_version.py
+tag_prefix = v
+parentdir_prefix = pylibraft-
+
+[isort]
+line_length=79
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+combine_as_imports=True
+order_by_type=True
+known_dask=
+    dask
+    distributed
+    dask_cuda
+known_rapids=
+    nvtext
+    cudf
+    cuml
+    cugraph
+    dask_cudf
+    rmm
+known_first_party=
+    pylibraft
+default_section=THIRDPARTY
+sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER
+skip=
+    thirdparty
+    .eggs
+    .git
+    .hg
+    .mypy_cache
+    .tox
+    .venv
+    _build
+    buck-out
+    build
+    dist
+    __init__.py
+
+[options]
+packages = find:
+install_requires =
+    numpy
+    numba>=0.49
+python_requires = >=3.7,<3.9
diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py
new file mode 100644
index 0000000000..5103ed1f83
--- /dev/null
+++ b/python/pylibraft/setup.py
@@ -0,0 +1,198 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +import numpy +import os +import shutil +import sys +import sysconfig + +# Must import in this order: +# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext +# Otherwise, setuptools.command.build_ext ends up inheriting from +# Cython.Distutils.old_build_ext which we do not want +import setuptools + +try: + from Cython.Distutils.build_ext import new_build_ext as _build_ext +except ImportError: + from setuptools.command.build_ext import build_ext as _build_ext + +from distutils.sysconfig import get_python_lib + +import setuptools.command.build_ext +from setuptools import find_packages, setup +from setuptools.extension import Extension + +from setuputils import clean_folder +from setuputils import get_environment_option +from setuputils import get_cli_option + +from pathlib import Path + +import versioneer + + +############################################################################## +# - Dependencies include and lib folder setup -------------------------------- + +install_requires = [ + 'cython' +] + +cuda_home = get_environment_option("CUDA_HOME") + +clean_artifacts = get_cli_option('clean') +single_gpu_build = get_cli_option('--singlegpu') + + +if not cuda_home: + cuda_home = ( + os.popen('echo "$(dirname $(dirname $(which nvcc)))"').read().strip() + ) + print("-- Using nvcc to detect CUDA, found at " + str(cuda_home)) +cuda_include_dir = os.path.join(cuda_home, "include") +cuda_lib_dir = os.path.join(cuda_home, "lib64") + +############################################################################## +# - Clean target ------------------------------------------------------------- + +if clean_artifacts: + print("-- Cleaning all Python and Cython build artifacts...") + + try: + setup_file_path = str(Path(__file__).parent.absolute()) + shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) + shutil.rmtree(setup_file_path + '/pylibraft.egg-info', ignore_errors=True) + shutil.rmtree(setup_file_path + '/__pycache__', 
ignore_errors=True) + + clean_folder(setup_file_path + '/pylibraft') + shutil.rmtree(setup_file_path + '/build') + + except IOError: + pass + + # need to terminate script so cythonizing doesn't get triggered after + # cleanup unintendedly + sys.argv.remove("clean") + + if "--all" in sys.argv: + sys.argv.remove("--all") + + if len(sys.argv) == 1: + sys.exit(0) + + +############################################################################## +# - Cython extensions build and parameters ----------------------------------- + + +libs = ['cudart', "cusolver", "cusparse", "cublas"] + +include_dirs = [cuda_include_dir, + numpy.get_include(), + "../cpp/include/", + os.path.dirname(sysconfig.get_path("include"))] + +extensions = [ + Extension("*", + sources=["pylibraft/**/*.pyx"], + include_dirs=include_dirs, + library_dirs=[get_python_lib()], + runtime_library_dirs=[cuda_lib_dir, + os.path.join(os.sys.prefix, "lib")], + libraries=libs, + language='c++', + extra_compile_args=['-std=c++17']) +] + + +class build_ext_no_debug(_build_ext): + + def build_extensions(self): + def remove_flags(compiler, *flags): + for flag in flags: + try: + compiler.compiler_so = list( + filter((flag).__ne__, compiler.compiler_so) + ) + except Exception: + pass + + # Full optimization + self.compiler.compiler_so.append("-O3") + + # Ignore deprecation declaration warnings + self.compiler.compiler_so.append("-Wno-deprecated-declarations") + + # No debug symbols, full optimization, no '-Wstrict-prototypes' warning + remove_flags( + self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes" + ) + super().build_extensions() + + def finalize_options(self): + if self.distribution.ext_modules: + # Delay import this to allow for Cython-less installs + from Cython.Build.Dependencies import cythonize + + nthreads = getattr(self, "parallel", None) # -j option in Py3.5+ + nthreads = int(nthreads) if nthreads else None + self.distribution.ext_modules = cythonize( + self.distribution.ext_modules, + 
nthreads=nthreads, + force=self.force, + gdb_debug=False, + compiler_directives=dict( + profile=False, language_level=3, embedsignature=True + ), + ) + # Skip calling super() and jump straight to setuptools + setuptools.command.build_ext.build_ext.finalize_options(self) + + +cmdclass = dict() +cmdclass.update(versioneer.get_cmdclass()) +cmdclass["build_ext"] = build_ext_no_debug + + +############################################################################## +# - Python package generation ------------------------------------------------ + + +setup(name='pylibraft', + description="RAFT: Reusable Algorithms Functions and other Tools", + version=versioneer.get_version(), + classifiers=[ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7" + ], + author="NVIDIA Corporation", + setup_requires=['cython'], + ext_modules=extensions, + package_data=dict.fromkeys( + find_packages(include=["pylibraft.common", + "pylibraft.common.includes"]), + ["*.hpp", "*.pxd"], + ), + packages=find_packages(include=['pylibraft', 'pylibraft.*']), + install_requires=install_requires, + license="Apache", + cmdclass=cmdclass, + zip_safe=False + ) diff --git a/python/pylibraft/setuputils.py b/python/pylibraft/setuputils.py new file mode 100755 index 0000000000..d93e4b06a4 --- /dev/null +++ b/python/pylibraft/setuputils.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import glob +import os +import shutil +import sys + + +def get_environment_option(name): + ENV_VARIABLE = os.environ.get(name, False) + + if not ENV_VARIABLE: + print("-- " + name + " environment variable not set.") + + else: + print("-- " + name + " detected with value: " + str(ENV_VARIABLE)) + + return ENV_VARIABLE + + +def get_cli_option(name): + if name in sys.argv: + print("-- Detected " + str(name) + " build option.") + return True + + else: + return False + + +def clean_folder(path): + """ + Function to clean all Cython and Python artifacts and cache folders. It + clean the folder as well as its direct children recursively. + + Parameters + ---------- + path : String + Path to the folder to be cleaned. + """ + shutil.rmtree(path + '/__pycache__', ignore_errors=True) + + folders = glob.glob(path + '/*/') + for folder in folders: + shutil.rmtree(folder + '/__pycache__', ignore_errors=True) + + clean_folder(folder) + + cython_exts = glob.glob(folder + '/*.cpp') + cython_exts.extend(glob.glob(folder + '/*.cpython*')) + for file in cython_exts: + os.remove(file) diff --git a/python/pylibraft/versioneer.py b/python/pylibraft/versioneer.py new file mode 100644 index 0000000000..64fea1c892 --- /dev/null +++ b/python/pylibraft/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! 
+* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. 
"myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. 
+ +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. 
+ +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). 
There are
+two common reasons why `setup.py` might not be in the root:
+
+* Source trees which contain multiple subprojects, such as
+  [Buildbot](https://github.com/buildbot/buildbot), which contains both
+  "master" and "slave" subprojects, each with their own `setup.py`,
+  `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
+  distributions (and upload multiple independently-installable tarballs).
+* Source trees whose main purpose is to contain a C library, but which also
+  provide bindings to Python (and perhaps other languages) in subdirectories.
+
+Versioneer will look for `.git` in parent directories, and most operations
+should get the right version string. However `pip` and `setuptools` have bugs
+and implementation details which frequently cause `pip install .` from a
+subproject directory to fail to find a correct version string (so it usually
+defaults to `0+unknown`).
+
+`pip install --editable .` should work correctly. `setup.py install` might
+work too.
+
+Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in
+some later version.
+
+[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking
+this issue. The discussion in
+[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the
+issue from the Versioneer side in more detail.
+[pip PR#3176](https://github.com/pypa/pip/pull/3176) and
+[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve
+pip to let Versioneer work correctly.
+
+Versioneer-0.16 and earlier only looked for a `.git` directory next to the
+`setup.cfg`, so subprojects were completely unsupported with those releases.
+
+### Editable installs with setuptools <= 18.5
+
+`setup.py develop` and `pip install --editable .` allow you to install a
+project into a virtualenv once, then continue editing the source code (and
+test) without re-installing after every change.
+ +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. 
+* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . 
+ """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if 
sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = 
stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. 
However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse 
describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+ +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. 
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + 
print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. 
+ # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. 
You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + 
except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1) diff --git a/python/raft/pytest.ini b/python/raft/pytest.ini new file mode 100644 index 0000000000..e48e31a00a --- /dev/null +++ b/python/raft/pytest.ini @@ -0,0 +1,8 @@ +[pytest] +markers = + unit: marks unit tests + quality: marks quality tests + stress: marks stress tests + mg: marks a test as multi-GPU + memleak: marks a test as a memory leak test + diff --git a/python/raft/raft/__init__.py b/python/raft/raft/__init__.py new file mode 100644 index 0000000000..b2431b4f6c --- /dev/null +++ b/python/raft/raft/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .include_test import raft_include_test diff --git a/python/raft/raft/_version.py b/python/raft/raft/_version.py new file mode 100644 index 0000000000..454b0fe7aa --- /dev/null +++ b/python/raft/raft/_version.py @@ -0,0 +1,567 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "raft-" + cfg.versionfile_source = "raft/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % 
dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r"\d", r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command( + GITS, ["rev-list", "HEAD", "--count"], cwd=root + ) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. 
Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/python/raft/raft/common/__init__.pxd b/python/raft/raft/common/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/raft/raft/common/__init__.py b/python/raft/raft/common/__init__.py new file mode 100644 index 0000000000..b5ef2b3079 --- /dev/null +++ b/python/raft/raft/common/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .cuda import Stream +from .handle import Handle \ No newline at end of file diff --git a/python/raft/raft/common/cuda.pxd b/python/raft/raft/common/cuda.pxd new file mode 100644 index 0000000000..0459cb96af --- /dev/null +++ b/python/raft/raft/common/cuda.pxd @@ -0,0 +1,22 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from cuda.ccudart cimport cudaStream_t + +cdef class Stream: + cdef cudaStream_t s + + cdef cudaStream_t getStream(self) diff --git a/python/raft/raft/common/cuda.pyx b/python/raft/raft/common/cuda.pyx new file mode 100644 index 0000000000..c3c90936aa --- /dev/null +++ b/python/raft/raft/common/cuda.pyx @@ -0,0 +1,84 @@ +# +# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from cuda.ccudart cimport( + cudaStream_t, + cudaError_t, + cudaSuccess, + cudaStreamCreate, + cudaStreamDestroy, + cudaStreamSynchronize, + cudaGetLastError, + cudaGetErrorString, + cudaGetErrorName +) + + +class CudaRuntimeError(RuntimeError): + def __init__(self, extraMsg=None): + cdef cudaError_t e = cudaGetLastError() + cdef bytes errMsg = cudaGetErrorString(e) + cdef bytes errName = cudaGetErrorName(e) + msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode()) + if extraMsg is not None: + msg += " extraMsg='%s'" % extraMsg + super(CudaRuntimeError, self).__init__(msg) + + +cdef class Stream: + """ + Stream represents a thin-wrapper around cudaStream_t and its operations. + + Examples + -------- + + .. code-block:: python + + from raft.common.cuda import Stream + stream = Stream() + stream.sync() + del stream # optional! + """ + def __cinit__(self): + cdef cudaStream_t stream + cdef cudaError_t e = cudaStreamCreate(&stream) + if e != cudaSuccess: + raise CudaRuntimeError("Stream create") + self.s = stream + + def __dealloc__(self): + self.sync() + cdef cudaError_t e = cudaStreamDestroy(self.s) + if e != cudaSuccess: + raise CudaRuntimeError("Stream destroy") + + def sync(self): + """ + Synchronize on the cudastream owned by this object. Note that this + could raise exception due to issues with previous asynchronous + launches + """ + cdef cudaError_t e = cudaStreamSynchronize(self.s) + if e != cudaSuccess: + raise CudaRuntimeError("Stream sync") + + cdef cudaStream_t getStream(self): + return self.s diff --git a/python/raft/raft/common/handle.pxd b/python/raft/raft/common/handle.pxd new file mode 100644 index 0000000000..8415b7e3d7 --- /dev/null +++ b/python/raft/raft/common/handle.pxd @@ -0,0 +1,48 @@ +# +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + + +from libcpp.memory cimport shared_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.cuda_stream_pool cimport cuda_stream_pool +from libcpp.memory cimport shared_ptr +from libcpp.memory cimport unique_ptr + +cdef extern from "raft/mr/device/allocator.hpp" \ + namespace "raft::mr::device" nogil: + cdef cppclass allocator: + pass + +cdef extern from "raft/handle.hpp" namespace "raft" nogil: + cdef cppclass handle_t: + handle_t() except + + handle_t(cuda_stream_view stream_view) except + + handle_t(cuda_stream_view stream_view, + shared_ptr[cuda_stream_pool] stream_pool) except + + void set_device_allocator(shared_ptr[allocator] a) except + + shared_ptr[allocator] get_device_allocator() except + + cuda_stream_view get_stream() except + + void sync_stream() except + + +cdef class Handle: + cdef unique_ptr[handle_t] c_obj + cdef shared_ptr[cuda_stream_pool] stream_pool + cdef int n_streams diff --git a/python/raft/raft/common/handle.pyx b/python/raft/raft/common/handle.pyx new file mode 100644 index 0000000000..661c5b5f23 --- /dev/null +++ b/python/raft/raft/common/handle.pyx @@ -0,0 +1,90 @@ +# +# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +# import raft +from libcpp.memory cimport shared_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread +from rmm._lib.cuda_stream_view cimport cuda_stream_view + +from .cuda cimport Stream +from .cuda import CudaRuntimeError + + +cdef class Handle: + """ + Handle is a lightweight python wrapper around the corresponding C++ class + of handle_t exposed by RAFT's C++ interface. Refer to the header file + raft/handle.hpp for interface level details of this struct + + Examples + -------- + + .. code-block:: python + + from raft.common import Stream, Handle + stream = Stream() + handle = Handle(stream) + + # call algos here + + # final sync of all work launched in the stream of this handle + # this is same as `raft.cuda.Stream.sync()` call, but safer in case + # the default stream inside the `handle_t` is being used + handle.sync() + del handle # optional! 
+ """ + + def __cinit__(self, stream: Stream = None, n_streams=0): + self.n_streams = n_streams + if n_streams > 0: + self.stream_pool.reset(new cuda_stream_pool(n_streams)) + + cdef cuda_stream_view c_stream + if stream is None: + # this constructor will construct a "main" handle on + # per-thread default stream, which is non-blocking + self.c_obj.reset(new handle_t(cuda_stream_per_thread, + self.stream_pool)) + else: + # this constructor constructs a handle on user stream + c_stream = cuda_stream_view(stream.getStream()) + self.c_obj.reset(new handle_t(c_stream, + self.stream_pool)) + + def sync(self): + """ + Issues a sync on the stream set for this handle. + """ + self.c_obj.get()[0].sync_stream() + + def getHandle(self): + return self.c_obj.get() + + def __getstate__(self): + return self.n_streams + + def __setstate__(self, state): + self.n_streams = state + if self.n_streams > 0: + self.stream_pool.reset(new cuda_stream_pool(self.n_streams)) + + self.c_obj.reset(new handle_t(cuda_stream_per_thread, + self.stream_pool)) diff --git a/python/raft/raft/common/interruptible.pxd b/python/raft/raft/common/interruptible.pxd new file mode 100644 index 0000000000..a73e8c1ac7 --- /dev/null +++ b/python/raft/raft/common/interruptible.pxd @@ -0,0 +1,34 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libcpp.memory cimport shared_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_view + +cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: + cdef cppclass interruptible: + void cancel() + +cdef extern from "raft/interruptible.hpp" \ + namespace "raft::interruptible" nogil: + cdef void inter_synchronize \ + "raft::interruptible::synchronize"(cuda_stream_view stream) except+ + cdef void inter_yield "raft::interruptible::yield"() except+ + cdef shared_ptr[interruptible] get_token() except+ diff --git a/python/raft/raft/common/interruptible.pyx b/python/raft/raft/common/interruptible.pyx new file mode 100644 index 0000000000..dfc95490ed --- /dev/null +++ b/python/raft/raft/common/interruptible.pyx @@ -0,0 +1,84 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import contextlib +import signal +from cython.operator cimport dereference + +from rmm._lib.cuda_stream_view cimport cuda_stream_view +from cuda.ccudart cimport cudaStream_t +from .cuda cimport Stream + + +@contextlib.contextmanager +def cuda_interruptible(): + ''' + Temporarily install a keyboard interrupt handler (Ctrl+C) + that cancels the enclosed interruptible C++ thread. 
+ + Use this on a long-running C++ function imported via cython: + + .. code-block:: python + + with cuda_interruptible(): + my_long_running_function(...) + + It's also recommended to release the GIL during the call, to + make sure the handler has a chance to run: + + .. code-block:: python + + with cuda_interruptible(): + with nogil: + my_long_running_function(...) + + ''' + cdef shared_ptr[interruptible] token = get_token() + + def newhr(*args, **kwargs): + with nogil: + dereference(token).cancel() + + oldhr = signal.signal(signal.SIGINT, newhr) + try: + yield + finally: + signal.signal(signal.SIGINT, oldhr) + + +def synchronize(stream: Stream): + ''' + Same as cudaStreamSynchronize, but can be interrupted + if called within a `with cuda_interruptible()` block. + ''' + cdef cuda_stream_view c_stream = cuda_stream_view(stream.getStream()) + with nogil: + inter_synchronize(c_stream) + + +def cuda_yield(): + ''' + Check for an asynchronously received interrupted_exception. + Raises the exception if a user pressed Ctrl+C within a + `with cuda_interruptible()` block before. + ''' + with nogil: + inter_yield() diff --git a/python/raft/raft/dask/__init__.py b/python/raft/raft/dask/__init__.py new file mode 100644 index 0000000000..74231d256f --- /dev/null +++ b/python/raft/raft/dask/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from .common.comms import Comms \ No newline at end of file diff --git a/python/raft/raft/dask/common/__init__.py b/python/raft/raft/dask/common/__init__.py new file mode 100644 index 0000000000..c2265f6828 --- /dev/null +++ b/python/raft/raft/dask/common/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .comms import Comms +from .comms import local_handle + +from .comms_utils import inject_comms_on_handle +from .comms_utils import inject_comms_on_handle_coll_only +from .comms_utils import perform_test_comms_allreduce +from .comms_utils import perform_test_comms_send_recv +from .comms_utils import perform_test_comms_device_send_or_recv +from .comms_utils import perform_test_comms_device_sendrecv +from .comms_utils import perform_test_comms_device_multicast_sendrecv +from .comms_utils import perform_test_comms_allgather +from .comms_utils import perform_test_comms_gather +from .comms_utils import perform_test_comms_gatherv +from .comms_utils import perform_test_comms_bcast +from .comms_utils import perform_test_comms_reduce +from .comms_utils import perform_test_comms_reducescatter +from .comms_utils import perform_test_comm_split + +from .ucx import UCX diff --git a/python/raft/raft/dask/common/comms.py b/python/raft/raft/dask/common/comms.py new file mode 100644 index 0000000000..ee768b41ff --- /dev/null +++ b/python/raft/raft/dask/common/comms.py @@ -0,0 +1,648 @@ +# Copyright (c) 
2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .nccl import nccl +from .ucx import UCX + +from .comms_utils import inject_comms_on_handle +from .comms_utils import inject_comms_on_handle_coll_only + +from .utils import parse_host_port +from ...common.handle import Handle + +from dask.distributed import get_worker, default_client + +import warnings + +import logging +import time +import uuid +from collections import OrderedDict + +logger = logging.getLogger(__name__) + + +class Comms: + + """ + Initializes and manages underlying NCCL and UCX comms handles across + the workers of a Dask cluster. It is expected that `init()` will be + called explicitly. It is recommended to also call `destroy()` when + the comms are no longer needed so the underlying resources can be + cleaned up. This class is not meant to be thread-safe. + + Examples + -------- + .. code-block:: python + + # The following code block assumes we have wrapped a C++ + # function in a Python function called `run_algorithm`, + # which takes a `raft::handle_t` as a single argument. 
+ # Once the `Comms` instance is successfully initialized, + # the underlying `raft::handle_t` will contain an instance + # of `raft::comms::comms_t` + + from dask_cuda import LocalCUDACluster + from dask.distributed import Client + + from raft.dask.common import Comms, local_handle + + cluster = LocalCUDACluster() + client = Client(cluster) + + def _use_comms(sessionId): + return run_algorithm(local_handle(sessionId)) + + comms = Comms(client=client) + comms.init() + + futures = [client.submit(_use_comms, + comms.sessionId, + workers=[w], + pure=False) # Don't memoize + for w in cb.worker_addresses] + wait(dfs, timeout=5) + + comms.destroy() + client.close() + cluster.close() + """ + + valid_nccl_placements = ("client", "worker", "scheduler") + + def __init__( + self, + comms_p2p=False, + client=None, + verbose=False, + streams_per_handle=0, + nccl_root_location="scheduler", + ): + """ + Construct a new CommsContext instance + + Parameters + ---------- + comms_p2p : bool + Initialize UCX endpoints? + client : dask.distributed.Client [optional] + Dask client to use + verbose : bool + Print verbose logging + nccl_root_location : string + Indicates where the NCCL's root node should be located. 
+ ['client', 'worker', 'scheduler' (default)] + + """ + self.client = client if client is not None else default_client() + + self.comms_p2p = comms_p2p + + self.nccl_root_location = nccl_root_location.lower() + if self.nccl_root_location not in Comms.valid_nccl_placements: + raise ValueError( + f"nccl_root_location must be one of: " + f"{Comms.valid_nccl_placements}" + ) + + self.streams_per_handle = streams_per_handle + + self.sessionId = uuid.uuid4().bytes + + self.nccl_initialized = False + self.ucx_initialized = False + + self.verbose = verbose + + if verbose: + print("Initializing comms!") + + def __del__(self): + if self.nccl_initialized or self.ucx_initialized: + self.destroy() + + def create_nccl_uniqueid(self): + if self.nccl_root_location == "client": + self.uniqueId = nccl.get_unique_id() + elif self.nccl_root_location == "worker": + self.uniqueId = self.client.run( + _func_set_worker_as_nccl_root, + sessionId=self.sessionId, + verbose=self.verbose, + workers=[self.worker_addresses[0]], + wait=True, + )[self.worker_addresses[0]] + else: + self.uniqueId = self.client.run_on_scheduler( + _func_set_scheduler_as_nccl_root, + sessionId=self.sessionId, + verbose=self.verbose, + ) + + def worker_info(self, workers): + """ + Builds a dictionary of { (worker_address, worker_port) : + (worker_rank, worker_port ) } + """ + ranks = _func_worker_ranks(workers) + ports = ( + _func_ucp_ports(self.client, workers) if self.comms_p2p else None + ) + + output = {} + for k in ranks.keys(): + output[k] = {"rank": ranks[k]} + if self.comms_p2p: + output[k]["port"] = ports[k] + return output + + def init(self, workers=None): + """ + Initializes the underlying comms. NCCL is required but + UCX is only initialized if `comms_p2p == True` + + Parameters + ---------- + + workers : Sequence + Unique collection of workers for initializing comms. 
+ """ + + self.worker_addresses = list( + OrderedDict.fromkeys( + self.client.scheduler_info()["workers"].keys() + if workers is None + else workers + ) + ) + + if self.nccl_initialized or self.ucx_initialized: + warnings.warn("Comms have already been initialized.") + return + + worker_info = self.worker_info(self.worker_addresses) + worker_info = {w: worker_info[w] for w in self.worker_addresses} + + self.create_nccl_uniqueid() + + self.client.run( + _func_init_all, + self.sessionId, + self.uniqueId, + self.comms_p2p, + worker_info, + self.verbose, + self.streams_per_handle, + workers=self.worker_addresses, + wait=True, + ) + + self.nccl_initialized = True + + if self.comms_p2p: + self.ucx_initialized = True + + if self.verbose: + print("Initialization complete.") + + def destroy(self): + """ + Shuts down initialized comms and cleans up resources. This will + be called automatically by the Comms destructor, but may be called + earlier to save resources. + """ + self.client.run( + _func_destroy_all, + self.sessionId, + self.comms_p2p, + self.verbose, + wait=True, + workers=self.worker_addresses, + ) + + if self.nccl_root_location == "scheduler": + self.client.run_on_scheduler( + _func_destroy_scheduler_session, self.sessionId + ) + + if self.verbose: + print("Destroying comms.") + + self.nccl_initialized = False + self.ucx_initialized = False + + +def local_handle(sessionId): + """ + Simple helper function for retrieving the local handle_t instance + for a comms session on a worker. + + Parameters + ---------- + sessionId : str + session identifier from an initialized comms instance + + Returns + ------- + + handle : raft.Handle or None + """ + state = get_raft_comm_state(sessionId, get_worker()) + return state["handle"] if "handle" in state else None + + +def get_raft_comm_state(sessionId, state_object=None): + """ + Retrieves cuML comms state on the scheduler node, for the given sessionId, + creating a new session if it does not exist. 
If no session id is given, + returns the state dict for all sessions. + + Parameters + ---------- + sessionId : SessionId value to retrieve from the dask_scheduler instances + state_object : Object (either Worker, or Scheduler) on which the raft + comm state will retrieved (or created) + + Returns + ------- + + session state : str + session state associated with sessionId + """ + state_object = state_object if state_object is not None else get_worker() + + if not hasattr(state_object, "_raft_comm_state"): + state_object._raft_comm_state = {} + + if ( + sessionId is not None + and sessionId not in state_object._raft_comm_state + ): + state_object._raft_comm_state[sessionId] = {"ts": time.time()} + + if sessionId is not None: + return state_object._raft_comm_state[sessionId] + + return state_object._raft_comm_state + + +def set_nccl_root(sessionId, state_object): + if sessionId is None: + raise ValueError("sessionId cannot be None.") + + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=state_object + ) + + if "nccl_uid" not in raft_comm_state: + raft_comm_state["nccl_uid"] = nccl.get_unique_id() + + return raft_comm_state["nccl_uid"] + + +def get_ucx(): + """ + A simple convenience wrapper to make sure UCP listener and + endpoints are only ever assigned once per worker. + """ + raft_comm_state = get_raft_comm_state( + sessionId="ucp", state_object=get_worker() + ) + if "ucx" not in raft_comm_state: + raft_comm_state["ucx"] = UCX.get() + + return raft_comm_state["ucx"] + + +def _func_destroy_scheduler_session(sessionId, dask_scheduler): + """ + Remove session date from _raft_comm_state, associated with sessionId + + Parameters + ---------- + sessionId : session Id to be destroyed. 
+ dask_scheduler : dask_scheduler object + (Note: this is supplied by DASK, not the client) + """ + if sessionId is not None and sessionId in dask_scheduler._raft_comm_state: + del dask_scheduler._raft_comm_state[sessionId] + else: + return 1 + + return 0 + + +def _func_set_scheduler_as_nccl_root(sessionId, verbose, dask_scheduler): + """ + Creates a persistent nccl uniqueId on the scheduler node. + + + Parameters + ---------- + sessionId : Associated session to attach the unique ID to. + verbose : Indicates whether or not to emit additional information + dask_scheduler : dask scheduler object, + (Note: this is supplied by DASK, not the client) + + Return + ------ + uniqueId : byte str + NCCL uniqueId, associating the DASK scheduler as its root node. + """ + if verbose: + logger.info( + msg=f"Setting scheduler as NCCL " + f"root for sessionId, '{sessionId}'" + ) + + nccl_uid = set_nccl_root(sessionId=sessionId, state_object=dask_scheduler) + + if verbose: + logger.info("Done setting scheduler as NCCL root.") + + return nccl_uid + + +def _func_set_worker_as_nccl_root(sessionId, verbose): + """ + Creates a persistent nccl uniqueId on the scheduler node. + + + Parameters + ---------- + sessionId : Associated session to attach the unique ID to. + verbose : Indicates whether or not to emit additional information + + Return + ------ + uniqueId : byte str + NCCL uniqueId, associating this DASK worker as its root node. + """ + worker = get_worker() + if verbose: + worker.log_event( + topic="info", + msg=f"Setting worker as NCCL root for session, '{sessionId}'", + ) + + nccl_uid = set_nccl_root(sessionId=sessionId, state_object=worker) + + if verbose: + worker.log_event( + topic="info", msg="Done setting scheduler as NCCL root." 
+ ) + + return nccl_uid + + +def _func_ucp_listener_port(): + return get_ucx().listener_port() + + +async def _func_init_all( + sessionId, uniqueId, comms_p2p, worker_info, verbose, streams_per_handle +): + worker = get_worker() + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=worker + ) + raft_comm_state["nccl_uid"] = uniqueId + raft_comm_state["wid"] = worker_info[get_worker().address]["rank"] + raft_comm_state["nworkers"] = len(worker_info) + + if verbose: + worker.log_event(topic="info", msg="Initializing NCCL.") + start = time.time() + + _func_init_nccl(sessionId, uniqueId) + + if verbose: + elapsed = time.time() - start + worker.log_event( + topic="info", msg=f"NCCL Initialization took: {elapsed} seconds." + ) + + if comms_p2p: + if verbose: + worker.log_event(topic="info", msg="Initializing UCX Endpoints") + + if verbose: + start = time.time() + await _func_ucp_create_endpoints(sessionId, worker_info) + + if verbose: + elapsed = time.time() - start + msg = ( + f"Done initializing UCX endpoints." + f"Took: {elapsed} seconds.\nBuilding handle." + ) + worker.log_event(topic="info", msg=msg) + + _func_build_handle_p2p(sessionId, streams_per_handle, verbose) + + if verbose: + worker.log_event(topic="info", msg="Done building handle.") + + else: + _func_build_handle(sessionId, streams_per_handle, verbose) + + +def _func_init_nccl(sessionId, uniqueId): + """ + Initialize ncclComm_t on worker + + Parameters + ---------- + sessionId : str + session identifier from a comms instance + uniqueId : array[byte] + The NCCL unique Id generated from the + client. 
+ """ + + worker = get_worker() + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=get_worker() + ) + wid = raft_comm_state["wid"] + nWorkers = raft_comm_state["nworkers"] + + try: + n = nccl() + n.init(nWorkers, uniqueId, wid) + raft_comm_state["nccl"] = n + except Exception as e: + worker.log_event( + topic="error", msg=f"An error occurred initializing NCCL: {e}." + ) + raise + + +def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): + """ + Builds a handle_t on the current worker given the initialized comms + + Parameters + ---------- + sessionId : str id to reference state for current comms instance. + streams_per_handle : int number of internal streams to create + verbose : bool print verbose logging output + """ + worker = get_worker() + if verbose: + worker.log_event(topic="info", msg="Building p2p handle.") + + ucp_worker = get_ucx().get_worker() + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=worker + ) + + handle = Handle(n_streams=streams_per_handle) + nccl_comm = raft_comm_state["nccl"] + eps = raft_comm_state["ucp_eps"] + nWorkers = raft_comm_state["nworkers"] + workerId = raft_comm_state["wid"] + + if verbose: + worker.log_event(topic="info", msg="Injecting comms on handle.") + + inject_comms_on_handle( + handle, nccl_comm, ucp_worker, eps, nWorkers, workerId, verbose + ) + + if verbose: + worker.log_event( + topic="info", msg="Finished injecting comms on handle." + ) + + raft_comm_state["handle"] = handle + + +def _func_build_handle(sessionId, streams_per_handle, verbose): + """ + Builds a handle_t on the current worker given the initialized comms + + Parameters + ---------- + sessionId : str id to reference state for current comms instance. + streams_per_handle : int number of internal streams to create + verbose : bool print verbose logging output + """ + worker = get_worker() + if verbose: + worker.log_event( + topic="info", msg="Finished injecting comms on handle." 
+ ) + + handle = Handle(n_streams=streams_per_handle) + + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=worker + ) + + workerId = raft_comm_state["wid"] + nWorkers = raft_comm_state["nworkers"] + + nccl_comm = raft_comm_state["nccl"] + inject_comms_on_handle_coll_only( + handle, nccl_comm, nWorkers, workerId, verbose + ) + raft_comm_state["handle"] = handle + + +def _func_store_initial_state(nworkers, sessionId, uniqueId, wid): + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=get_worker() + ) + raft_comm_state["nccl_uid"] = uniqueId + raft_comm_state["wid"] = wid + raft_comm_state["nworkers"] = nworkers + + +async def _func_ucp_create_endpoints(sessionId, worker_info): + """ + Runs on each worker to create ucp endpoints to all other workers + + Parameters + ---------- + sessionId : str + uuid unique id for this instance + worker_info : dict + Maps worker addresses to NCCL ranks & UCX ports + """ + eps = [None] * len(worker_info) + count = 1 + + for k in worker_info: + ip, port = parse_host_port(k) + + ep = await get_ucx().get_endpoint(ip, worker_info[k]["port"]) + + eps[worker_info[k]["rank"]] = ep + count += 1 + + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=get_worker() + ) + raft_comm_state["ucp_eps"] = eps + + +async def _func_destroy_all(sessionId, comms_p2p, verbose=False): + worker = get_worker() + if verbose: + worker.log_event(topic="info", msg="Destroying NCCL session state.") + + raft_comm_state = get_raft_comm_state( + sessionId=sessionId, state_object=worker + ) + if "nccl" in raft_comm_state: + raft_comm_state["nccl"].destroy() + del raft_comm_state["nccl"] + if verbose: + worker.log_event(topic="info", msg="NCCL session state destroyed.") + else: + if verbose: + worker.log_event( + topic="warning", + msg=f"Session state for, '{sessionId}', " + f"does not contain expected 'nccl' element", + ) + + if verbose: + worker.log_event( + topic="info", + msg=f"Destroying 
CUDA handle for sessionId, '{sessionId}.'", + ) + + if "handle" in raft_comm_state: + del raft_comm_state["handle"] + else: + if verbose: + worker.log_event( + topic="warning", + msg=f"Session state for, '{sessionId}', " + f"does not contain expected 'handle' element", + ) + + +def _func_ucp_ports(client, workers): + return client.run(_func_ucp_listener_port, workers=workers) + + +def _func_worker_ranks(workers): + """ + Builds a dictionary of { (worker_address, worker_port) : worker_rank } + """ + return dict(list(zip(workers, range(len(workers))))) diff --git a/python/raft/raft/dask/common/comms_utils.pyx b/python/raft/raft/dask/common/comms_utils.pyx new file mode 100644 index 0000000000..38c5670372 --- /dev/null +++ b/python/raft/raft/dask/common/comms_utils.pyx @@ -0,0 +1,313 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libc.stdlib cimport malloc, free +from cython.operator cimport dereference as deref + +from cpython.long cimport PyLong_AsVoidPtr + +from libcpp cimport bool + +from libc.stdint cimport uintptr_t + +cdef extern from "nccl.h": + + cdef struct ncclComm + ctypedef ncclComm *ncclComm_t + +cdef extern from "raft/handle.hpp" namespace "raft": + cdef cppclass handle_t: + handle_t() except + + +cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": + + void build_comms_nccl_ucx(handle_t *handle, + ncclComm_t comm, + void *ucp_worker, + void *eps, + int size, + int rank) except + + + void build_comms_nccl_only(handle_t *handle, + ncclComm_t comm, + int size, + int rank) except + + +cdef extern from "raft/comms/comms_test.hpp" namespace "raft::comms": + + bool test_collective_allreduce(const handle_t &h, int root) except + + bool test_collective_broadcast(const handle_t &h, int root) except + + bool test_collective_reduce(const handle_t &h, int root) except + + bool test_collective_allgather(const handle_t &h, int root) except + + bool test_collective_gather(const handle_t &h, int root) except + + bool test_collective_gatherv(const handle_t &h, int root) except + + bool test_collective_reducescatter(const handle_t &h, int root) except + + bool test_pointToPoint_simple_send_recv(const handle_t &h, + int numTrials) except + + bool test_pointToPoint_device_send_or_recv(const handle_t &h, + int numTrials) except + + bool test_pointToPoint_device_sendrecv(const handle_t &h, + int numTrials) except + + bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, + int numTrials) except + + bool test_commsplit(const handle_t &h, int n_colors) except + + + +def perform_test_comms_allreduce(handle, root): + """ + Performs an allreduce on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing 
comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_allreduce(deref(h), root) + + +def perform_test_comms_reduce(handle, root): + """ + Performs an allreduce on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_reduce(deref(h), root) + + +def perform_test_comms_reducescatter(handle, root): + """ + Performs an allreduce on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_reducescatter(deref(h), root) + + +def perform_test_comms_bcast(handle, root): + """ + Performs an broadcast on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_broadcast(deref(h), root) + + +def perform_test_comms_allgather(handle, root): + """ + Performs an broadcast on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_allgather(deref(h), root) + + +def perform_test_comms_gather(handle, root): + """ + Performs a gather on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + root : int + Rank of the root worker + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_gather(deref(h), root) + + +def perform_test_comms_gatherv(handle, root): + """ + Performs a gatherv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + root : int + Rank of the root worker + """ + cdef const handle_t* h = handle.getHandle() + return test_collective_gatherv(deref(h), root) + + +def 
perform_test_comms_send_recv(handle, n_trials): + """ + Performs a p2p send/recv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + n_trilas : int + Number of test trials + """ + cdef const handle_t *h = handle.getHandle() + return test_pointToPoint_simple_send_recv(deref(h), n_trials) + + +def perform_test_comms_device_send_or_recv(handle, n_trials): + """ + Performs a p2p device send or recv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + n_trilas : int + Number of test trials + """ + cdef const handle_t *h = handle.getHandle() + return test_pointToPoint_device_send_or_recv(deref(h), n_trials) + + +def perform_test_comms_device_sendrecv(handle, n_trials): + """ + Performs a p2p device concurrent send&recv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + n_trilas : int + Number of test trials + """ + cdef const handle_t *h = handle.getHandle() + return test_pointToPoint_device_sendrecv(deref(h), n_trials) + + +def perform_test_comms_device_multicast_sendrecv(handle, n_trials): + """ + Performs a p2p device concurrent multicast send&recv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + n_trilas : int + Number of test trials + """ + cdef const handle_t *h = handle.getHandle() + return test_pointToPoint_device_multicast_sendrecv(deref(h), n_trials) + + +def perform_test_comm_split(handle, n_colors): + """ + Performs a p2p send/recv on the current worker + + Parameters + ---------- + handle : raft.common.Handle + handle containing comms_t to use + """ + cdef const handle_t * h = < handle_t * > < size_t > handle.getHandle() + return test_commsplit(deref(h), < int > n_colors) + + +def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose): + """ + Given a handle and initialized nccl 
def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size,
                           rank, verbose):
    """
    Given a handle and initialized comms, creates a comms_t instance
    and injects it into the handle.

    Parameters
    ----------
    handle : raft.common.Handle
        handle containing comms_t to use
    nccl_inst : raft.dask.common.nccl
        Initialized nccl comm to use
    ucp_worker : size_t pointer to initialized ucp_worker_h instance
    eps: size_t pointer to array of initialized ucp_ep_h instances
    size : int
        Number of workers in cluster
    rank : int
        Rank of current worker
    verbose : bool
        Unused here; kept for interface parity with
        inject_comms_on_handle_coll_only.
    """
    cdef size_t *ucp_eps
    cdef void* ucp_worker_st
    cdef size_t handle_size_t
    cdef size_t nccl_comm_size_t

    ucp_eps = <size_t*> malloc(len(eps) * sizeof(size_t))
    if ucp_eps == NULL:
        # Bug fix: malloc failure previously went unchecked.
        raise MemoryError("could not allocate UCP endpoint array")

    try:
        for i in range(len(eps)):
            if eps[i] is not None:
                ep_st = <size_t>eps[i].get_ucp_endpoint()
                ucp_eps[i] = ep_st
            else:
                ucp_eps[i] = 0

        ucp_worker_st = <void*><size_t>ucp_worker

        handle_size_t = <size_t>handle.getHandle()
        handle_ = <handle_t*>handle_size_t

        nccl_comm_size_t = <size_t>nccl_inst.get_comm()
        nccl_comm_ = <ncclComm_t*>nccl_comm_size_t

        build_comms_nccl_ucx(handle_,
                             deref(nccl_comm_),
                             ucp_worker_st,
                             ucp_eps,
                             size,
                             rank)
    finally:
        # Bug fix: the endpoint array leaked if any call above raised.
        free(ucp_eps)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from cython.operator cimport dereference as deref + +from libcpp cimport bool +from libc.stdlib cimport malloc, free + +cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": + void get_unique_id(char *uid, int size) except + + void nccl_unique_id_from_char(ncclUniqueId *id, + char *uniqueId, + int size) except + + +cdef extern from "nccl.h": + + cdef struct ncclComm + + ctypedef struct ncclUniqueId: + char *internal[128] + + ctypedef ncclComm *ncclComm_t + + ctypedef enum ncclResult_t: + ncclSuccess + ncclUnhandledCudaError + ncclSystemError + ncclInternalError + ncclInvalidArgument + ncclInvalidUsage + ncclNumResults + + ncclResult_t ncclCommInitRank(ncclComm_t *comm, + int nranks, + ncclUniqueId commId, + int rank) nogil + + ncclResult_t ncclGetUniqueId(ncclUniqueId *uniqueId) nogil + + ncclResult_t ncclCommUserRank(const ncclComm_t comm, int *rank) nogil + + ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int *count) nogil + + const char *ncclGetErrorString(ncclResult_t result) nogil + + ncclResult_t ncclCommAbort(ncclComm_t comm) nogil + + ncclResult_t ncclCommDestroy(ncclComm_t comm) nogil + +NCCL_UNIQUE_ID_BYTES = 128 + + +def unique_id(): + """ + Returns a new ncclUniqueId converted to a + character array that can be safely serialized + 
def unique_id():
    """
    Returns a new ncclUniqueId converted to a
    character array that can be safely serialized
    and shared to a remote worker.

    Returns
    -------
    128-byte unique id : str
    """
    cdef char *uid = <char *> malloc(NCCL_UNIQUE_ID_BYTES * sizeof(char))
    get_unique_id(uid, NCCL_UNIQUE_ID_BYTES)
    # NOTE(review): only NCCL_UNIQUE_ID_BYTES-1 bytes are copied out here,
    # despite the docstring saying 128 — confirm the consumer expects the
    # truncated id before changing it.
    c_str = uid[:NCCL_UNIQUE_ID_BYTES-1]
    free(uid)
    return c_str


cdef class nccl:
    """
    A NCCL wrapper for initializing and closing NCCL comms
    in Python.
    """
    # Heap-allocated ncclComm_t, owned by this object for its lifetime.
    cdef ncclComm_t *comm

    cdef int size
    cdef int rank

    def __cinit__(self):
        self.comm = <ncclComm_t*> malloc(sizeof(ncclComm_t))

    def __dealloc__(self):
        if self.comm != NULL:
            free(self.comm)
            self.comm = NULL

    @staticmethod
    def get_unique_id():
        """
        Returns a new nccl unique id

        Returns
        -------
        nccl unique id : str
        """
        return unique_id()

    def init(self, nranks, commId, rank):
        """
        Construct a nccl-py object

        Parameters
        ----------
        nranks : int size of clique
        commId : string unique id from client
        rank : int rank of current worker
        """
        self.size = nranks
        self.rank = rank

        cdef ncclUniqueId *ident = <ncclUniqueId*> malloc(sizeof(ncclUniqueId))
        nccl_unique_id_from_char(ident, commId, NCCL_UNIQUE_ID_BYTES)

        comm_ = self.comm

        cdef int nr = nranks
        cdef int r = rank
        cdef ncclResult_t result

        with nogil:
            result = ncclCommInitRank(comm_, nr,
                                      deref(ident), r)

        # Bug fix: `ident` was previously leaked.
        free(ident)

        if result != ncclSuccess:
            with nogil:
                err_str = ncclGetErrorString(result)

            raise RuntimeError("NCCL_ERROR: %s" % err_str)

    def destroy(self):
        """
        Call destroy on the underlying NCCL comm
        """
        comm_ = self.comm

        cdef ncclResult_t result
        if comm_ != NULL:
            with nogil:
                result = ncclCommDestroy(deref(comm_))

            free(self.comm)
            self.comm = NULL

            # Bug fix: `result` was previously inspected even when no comm
            # existed, reading an uninitialized value.
            if result != ncclSuccess:
                with nogil:
                    err_str = ncclGetErrorString(result)

                raise RuntimeError("NCCL_ERROR: %s" % err_str)
free(comm_) + self.comm = NULL + + if result != ncclSuccess: + with nogil: + err_str = ncclGetErrorString(result) + raise RuntimeError("NCCL_ERROR: %s" % err_str) + + def cu_device(self): + """ + Get the device backing the underlying comm + + Returns + ------- + device id : int + """ + cdef int *dev = malloc(sizeof(int)) + + comm_ = self.comm + cdef ncclResult_t result + with nogil: + result = ncclCommCuDevice(deref(comm_), dev) + + ret = dev[0] + free(dev) + + if result != ncclSuccess: + with nogil: + err_str = ncclGetErrorString(result) + + raise RuntimeError("NCCL_ERROR: %s" % err_str) + + return ret + + def user_rank(self): + """ + Get the rank id of the current comm + + Returns + ------- + rank : int + """ + + cdef int *rank = malloc(sizeof(int)) + + comm_ = self.comm + + cdef ncclResult_t result + with nogil: + result = ncclCommUserRank(deref(comm_), rank) + + ret = rank[0] + free(rank) + + if result != ncclSuccess: + with nogil: + err_str = ncclGetErrorString(result) + raise RuntimeError("NCCL_ERROR: %s" % err_str) + + return ret + + def get_comm(self): + """ + Returns the underlying nccl comm in a size_t (similar to void*). + This can be safely typecasted from size_t into ncclComm_t* + + Returns + ------- + ncclComm_t instance pointer : size_t + """ + return self.comm diff --git a/python/raft/raft/dask/common/ucx.py b/python/raft/raft/dask/common/ucx.py new file mode 100644 index 0000000000..f61479a0eb --- /dev/null +++ b/python/raft/raft/dask/common/ucx.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
import ucp


async def _connection_func(ep):
    """Listener callback: record each incoming server-side endpoint."""
    UCX.get().add_server_endpoint(ep)


class UCX:
    """
    Singleton UCX context to encapsulate all interactions with the
    UCX-py API and guarantee only a single listener & endpoints are
    created by RAFT Comms on a single process.
    """

    __instance = None

    def __init__(self, listener_callback):

        self.listener_callback = listener_callback

        self._create_listener()
        self._endpoints = {}         # (ip, port) -> client-side endpoint
        self._server_endpoints = []  # endpoints accepted by the listener

        # Enforce the singleton: only one UCX context per process.
        assert UCX.__instance is None

        UCX.__instance = self

    @staticmethod
    def get(listener_callback=_connection_func):
        """Return the process-wide UCX instance, creating it on first use."""
        if UCX.__instance is None:
            UCX(listener_callback)
        return UCX.__instance

    def get_worker(self):
        """Return the underlying UCP worker handle."""
        return ucp.get_ucp_worker()

    def _create_listener(self):
        self._listener = ucp.create_listener(self.listener_callback)

    def listener_port(self):
        """Port the UCX listener is bound to."""
        return self._listener.port

    async def _create_endpoint(self, ip, port):
        # Create and cache a new client endpoint for (ip, port).
        endpoint = await ucp.create_endpoint(ip, port)
        self._endpoints[(ip, port)] = endpoint
        return endpoint

    def add_server_endpoint(self, ep):
        """Track an endpoint accepted by the listener."""
        self._server_endpoints.append(ep)

    async def get_endpoint(self, ip, port):
        """Return the cached endpoint for (ip, port), creating it if needed."""
        key = (ip, port)
        if key in self._endpoints:
            return self._endpoints[key]
        return await self._create_endpoint(ip, port)

    async def close_endpoints(self):
        """Close all client endpoints, then all server endpoints."""
        for endpoint in self._endpoints.values():
            await endpoint.close()

        for endpoint in self._server_endpoints:
            # NOTE(review): unlike the client endpoints above, this close is
            # not awaited — confirm server endpoints close synchronously.
            endpoint.close()

    def __del__(self):
        for endpoint in self._endpoints.values():
            if not endpoint.closed():
                endpoint.abort()
            del endpoint

        for endpoint in self._server_endpoints:
            if not endpoint.closed():
                endpoint.abort()
            del endpoint

        self._listener.close()
def get_client(client=None):
    """Return the given Dask client, or the process default when None."""
    if client is None:
        return default_client()
    return client


def parse_host_port(address):
    """
    Given a string address with host/port, build a tuple(host, port)

    Parameters
    ----------
    address: string address to parse

    Returns
    -------
    tuple with host and port info : tuple(host, port)
    """
    # Strip any protocol prefix such as "tcp://" before splitting.
    if '://' in address:
        address = address.rsplit('://', 1)[-1]
    host, port_text = address.split(':')
    return host, int(port_text)
def raft_include_test():
    """
    Smoke test that the RAFT headers were found and the Cython
    extension compiled and imported.

    Returns
    -------
    True, after printing a confirmation message.
    """
    # Typo fix in the user-facing message ("succesfully" -> "successfully").
    print("RAFT Setup successfully")
    return True
import pytest

from dask.distributed import Client

from dask_cuda import initialize
from dask_cuda import LocalCUDACluster

import os
os.environ["UCX_LOG_LEVEL"] = "error"


enable_tcp_over_ucx = True
enable_nvlink = False
enable_infiniband = False


@pytest.fixture(scope="session")
def cluster():
    """Session-wide TCP LocalCUDACluster."""
    cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
    yield cluster
    cluster.close()


@pytest.fixture(scope="session")
def ucx_cluster():
    """Session-wide UCX-enabled LocalCUDACluster."""
    initialize.initialize(create_cuda_context=True,
                          enable_tcp_over_ucx=enable_tcp_over_ucx,
                          enable_nvlink=enable_nvlink,
                          enable_infiniband=enable_infiniband)
    cluster = LocalCUDACluster(protocol="ucx",
                               enable_tcp_over_ucx=enable_tcp_over_ucx,
                               enable_nvlink=enable_nvlink,
                               enable_infiniband=enable_infiniband)
    yield cluster
    cluster.close()


@pytest.fixture(scope="session")
def client(cluster):
    """Client connected to the session TCP cluster."""
    client = Client(cluster)
    yield client
    client.close()


@pytest.fixture()
def ucx_client(ucx_cluster):
    """Client connected to the session UCX cluster."""
    # Bug fix: this previously did `Client(cluster)`, which passed the
    # module-level `cluster` fixture *function* object instead of the
    # `ucx_cluster` fixture value this fixture actually requests.
    client = Client(ucx_cluster)
    yield client
    client.close()
import pytest

from collections import OrderedDict

from dask.distributed import Client
from dask.distributed import wait

try:
    from raft.dask import Comms
    from raft.dask.common import local_handle
    from raft.dask.common import perform_test_comms_send_recv
    from raft.dask.common import perform_test_comms_device_send_or_recv
    from raft.dask.common import perform_test_comms_device_sendrecv
    from raft.dask.common import perform_test_comms_device_multicast_sendrecv
    from raft.dask.common import perform_test_comms_allreduce
    from raft.dask.common import perform_test_comms_bcast
    from raft.dask.common import perform_test_comms_reduce
    from raft.dask.common import perform_test_comms_allgather
    from raft.dask.common import perform_test_comms_gather
    from raft.dask.common import perform_test_comms_gatherv
    from raft.dask.common import perform_test_comms_reducescatter
    from raft.dask.common import perform_test_comm_split

    pytestmark = pytest.mark.mg
except ImportError:
    pytestmark = pytest.mark.skip


def test_comms_init_no_p2p(cluster):

    client = Client(cluster)

    # Bug fix: bind `cb` before the try block; previously a failure in the
    # Comms constructor raised NameError in the finally clause, masking
    # the real error.
    cb = None
    try:
        cb = Comms(verbose=True)
        cb.init()

        assert cb.nccl_initialized is True
        assert cb.ucx_initialized is False

    finally:
        if cb is not None:
            cb.destroy()
        client.close()


def func_test_collective(func, sessionId, root):
    """Run collective test `func` against the worker-local handle."""
    handle = local_handle(sessionId)
    return func(handle, root)


def func_test_send_recv(sessionId, n_trials):
    handle = local_handle(sessionId)
    return perform_test_comms_send_recv(handle, n_trials)


def func_test_device_send_or_recv(sessionId, n_trials):
    handle = local_handle(sessionId)
    return perform_test_comms_device_send_or_recv(handle, n_trials)


def func_test_device_sendrecv(sessionId, n_trials):
    handle = local_handle(sessionId)
    return perform_test_comms_device_sendrecv(handle, n_trials)
func_test_device_multicast_sendrecv(sessionId, n_trials): + handle = local_handle(sessionId) + return perform_test_comms_device_multicast_sendrecv(handle, n_trials) + + +def func_test_comm_split(sessionId, n_trials): + handle = local_handle(sessionId) + return perform_test_comm_split(handle, n_trials) + + +def func_check_uid(sessionId, uniqueId, state_object): + if not hasattr(state_object, "_raft_comm_state"): + return 1 + + state = state_object._raft_comm_state + if sessionId not in state: + return 2 + + session_state = state[sessionId] + if "nccl_uid" not in session_state: + return 3 + + nccl_uid = session_state["nccl_uid"] + if nccl_uid != uniqueId: + return 4 + + return 0 + + +def func_check_uid_on_scheduler(sessionId, uniqueId, dask_scheduler): + return func_check_uid( + sessionId=sessionId, uniqueId=uniqueId, state_object=dask_scheduler + ) + + +def func_check_uid_on_worker(sessionId, uniqueId): + from dask.distributed import get_worker + + return func_check_uid( + sessionId=sessionId, uniqueId=uniqueId, state_object=get_worker() + ) + + +def test_handles(cluster): + + client = Client(cluster) + + def _has_handle(sessionId): + return local_handle(sessionId) is not None + + try: + cb = Comms(verbose=True) + cb.init() + + dfs = [ + client.submit(_has_handle, cb.sessionId, pure=False, workers=[w]) + for w in cb.worker_addresses + ] + wait(dfs, timeout=5) + + assert all(client.compute(dfs, sync=True)) + + finally: + cb.destroy() + client.close() + + +if pytestmark.markname != "skip": + functions = [ + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + ] +else: + functions = [None] + + +@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"]) +def test_nccl_root_placement(client, root_location): + + cb = None + try: + cb = Comms( + verbose=True, client=client, 
@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
def test_nccl_root_placement(client, root_location):

    cb = None
    try:
        cb = Comms(
            verbose=True, client=client, nccl_root_location=root_location
        )
        cb.init()

        worker_addresses = list(
            OrderedDict.fromkeys(client.scheduler_info()["workers"].keys())
        )

        if root_location in ("worker",):
            result = client.run(
                func_check_uid_on_worker,
                cb.sessionId,
                cb.uniqueId,
                workers=[worker_addresses[0]],
            )[worker_addresses[0]]
        elif root_location in ("scheduler",):
            result = client.run_on_scheduler(
                func_check_uid_on_scheduler, cb.sessionId, cb.uniqueId
            )
        else:
            result = int(cb.uniqueId is None)

        assert result == 0

    finally:
        if cb:
            cb.destroy()


@pytest.mark.parametrize("func", functions)
@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
@pytest.mark.nccl
def test_collectives(client, func, root_location):

    # Bug fix: bind `cb` before the try block (consistent with
    # test_nccl_root_placement); previously a failure in the Comms
    # constructor raised NameError in the finally clause.
    cb = None
    try:
        cb = Comms(
            verbose=True, client=client, nccl_root_location=root_location
        )
        cb.init()

        for k, v in cb.worker_info(cb.worker_addresses).items():

            dfs = [
                client.submit(
                    func_test_collective,
                    func,
                    cb.sessionId,
                    v["rank"],
                    pure=False,
                    workers=[w],
                )
                for w in cb.worker_addresses
            ]
            wait(dfs, timeout=5)

            assert all([x.result() for x in dfs])
    finally:
        if cb:
            cb.destroy()


@pytest.mark.nccl
def test_comm_split(client):

    cb = Comms(comms_p2p=True, verbose=True)
    cb.init()

    dfs = [
        client.submit(
            func_test_comm_split, cb.sessionId, 3, pure=False, workers=[w]
        )
        for w in cb.worker_addresses
    ]

    wait(dfs, timeout=5)

    assert all([x.result() for x in dfs])


@pytest.mark.ucx
@pytest.mark.parametrize("n_trials", [1, 5])
def test_send_recv(n_trials, client):

    cb = Comms(comms_p2p=True, verbose=True)
    cb.init()

    dfs = [
        client.submit(
            func_test_send_recv,
            cb.sessionId,
            n_trials,
            pure=False,
            workers=[w],
        )
        for w in cb.worker_addresses
    ]

    wait(dfs, timeout=5)

    # Bug fix: `assert list(map(...))` passes for any non-empty list of
    # futures; assert that every worker actually reported success.
    assert all(x.result() for x in dfs)
@pytest.mark.nccl
@pytest.mark.parametrize("n_trials", [1, 5])
def test_device_send_or_recv(n_trials, client):

    cb = Comms(comms_p2p=True, verbose=True)
    cb.init()

    dfs = [
        client.submit(
            func_test_device_send_or_recv,
            cb.sessionId,
            n_trials,
            pure=False,
            workers=[w],
        )
        for w in cb.worker_addresses
    ]

    wait(dfs, timeout=5)

    # Bug fix: `assert list(map(...))` passes for any non-empty list of
    # futures; assert that every worker actually reported success.
    assert all(x.result() for x in dfs)


@pytest.mark.nccl
@pytest.mark.parametrize("n_trials", [1, 5])
def test_device_sendrecv(n_trials, client):

    cb = Comms(comms_p2p=True, verbose=True)
    cb.init()

    dfs = [
        client.submit(
            func_test_device_sendrecv,
            cb.sessionId,
            n_trials,
            pure=False,
            workers=[w],
        )
        for w in cb.worker_addresses
    ]

    wait(dfs, timeout=5)

    # Bug fix: same weak assertion as above.
    assert all(x.result() for x in dfs)


@pytest.mark.nccl
@pytest.mark.parametrize("n_trials", [1, 5])
def test_device_multicast_sendrecv(n_trials, client):

    cb = Comms(comms_p2p=True, verbose=True)
    cb.init()

    dfs = [
        client.submit(
            func_test_device_multicast_sendrecv,
            cb.sessionId,
            n_trials,
            pure=False,
            workers=[w],
        )
        for w in cb.worker_addresses
    ]

    wait(dfs, timeout=5)

    # Bug fix: same weak assertion as above.
    assert all(x.result() for x in dfs)
def test_should_cancel_via_python():
    """A plain KeyboardInterrupt must surface promptly when no
    cuda_interruptible context is active."""
    begin = time.monotonic()
    with pytest.raises(KeyboardInterrupt):
        send_ctrl_c()
        cuda_yield()
        time.sleep(1.0)
    finish = time.monotonic()
    assert finish < begin + 0.5, \
        "The process seems to have waited, while it shouldn't have."


def test_should_wait_no_interrupt():
    """Without a SIGINT, the guarded section must run to completion."""
    begin = time.monotonic()
    with cuda_interruptible():
        cuda_yield()
        time.sleep(1.0)
    finish = time.monotonic()
    assert finish > begin + 0.5, \
        "The process seems to be cancelled, while it shouldn't be."


def test_should_wait_no_yield():
    """Without a cuda_yield(), the SIGINT must not cancel the section."""
    begin = time.monotonic()
    with cuda_interruptible():
        send_ctrl_c()
        time.sleep(1.0)
    finish = time.monotonic()
    assert finish > begin + 0.5, \
        "The process seems to be cancelled, while it shouldn't be."
import pytest
import sys

try:
    import raft
except ImportError:
    # Bug fix: this previously assigned `pytestmart` (typo), so the skip
    # marker was never applied — and it was then unconditionally
    # overwritten below anyway. Keep the ImportError skip effective.
    print("Skipping RAFT tests")
    pytestmark = pytest.mark.skip
else:
    # Integration marker: only run when 'raft' is explicitly requested.
    pytestmark = pytest.mark.skipif(
        'raft' not in sys.argv, reason="marker to allow integration of RAFT"
    )


def test_raft():
    assert raft.raft_include_test()
+/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/__init__.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/cuda.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/interruptible.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/__pycache__/_version.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/test_comms.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/conftest.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/test_raft.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/test_interruptible.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/include_test/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/comms.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/utils.cpython-39.pyc 
+/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/ucx.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/common/cuda.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/common/handle.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/common/interruptible.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/dask/common/comms_utils.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/dask/common/nccl.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/include_test/raft_include_test.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/top_level.txt +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/PKG-INFO +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/SOURCES.txt +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/requires.txt +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/dependency_links.txt +/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/not-zip-safe diff --git a/python/raft/setup.cfg b/python/raft/setup.cfg new file mode 100644 index 0000000000..f6c096818b --- /dev/null +++ 
b/python/raft/setup.cfg @@ -0,0 +1,58 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + +[flake8] +exclude = __init__.py,versioneer.py +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +VCS = git +style = pep440 +versionfile_source = raft/_version.py +versionfile_build = raft/_version.py +tag_prefix = v +parentdir_prefix = raft- + +[isort] +line_length=79 +multi_line_output=3 +include_trailing_comma=True +force_grid_wrap=0 +combine_as_imports=True +order_by_type=True +known_dask= + dask + distributed + dask_cuda +known_rapids= + nvtext + cudf + cuml + cugraph + dask_cudf + rmm +known_first_party= + raft +default_section=THIRDPARTY +sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER +skip= + thirdparty + .eggs + .git + .hg + .mypy_cache + .tox + .venv + _build + buck-out + build + dist + __init__.py + +[options] +packages = find: +install_requires = + numpy + numba>=0.49 +python_requires = >=3.7,<3.9 diff --git a/python/raft/setup.py b/python/raft/setup.py new file mode 100644 index 0000000000..10beca1eb4 --- /dev/null +++ b/python/raft/setup.py @@ -0,0 +1,202 @@ +# +# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""setup.py for the raft Python package.

Builds the Cython extensions against the local RAFT C++ headers and the
CUDA toolkit, wiring versioneer in for version management.  Supports a
``clean`` command-line option that removes all build artifacts and exits.
"""

import numpy
import os
import shutil
import sys
import sysconfig

# Must import in this order:
# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext
# Otherwise, setuptools.command.build_ext ends up inheriting from
# Cython.Distutils.old_build_ext which we do not want
import setuptools

try:
    from Cython.Distutils.build_ext import new_build_ext as _build_ext
except ImportError:
    from setuptools.command.build_ext import build_ext as _build_ext

from distutils.sysconfig import get_python_lib

import setuptools.command.build_ext
from setuptools import find_packages, setup
from setuptools.extension import Extension

from setuputils import clean_folder
from setuputils import get_environment_option
from setuputils import get_cli_option

from pathlib import Path

import versioneer


##############################################################################
# - Dependencies include and lib folder setup --------------------------------

install_requires = [
    'cython'
]

cuda_home = get_environment_option("CUDA_HOME")

clean_artifacts = get_cli_option('clean')
# NOTE(review): currently unused in this file — kept for CLI compatibility.
single_gpu_build = get_cli_option('--singlegpu')


if not cuda_home:
    # Fall back to locating nvcc on PATH; CUDA_HOME is two directory levels
    # above the nvcc binary.  shutil.which avoids spawning a shell the way
    # the previous `os.popen('echo "$(dirname ...)"')` pipeline did, and
    # yields an empty string (rather than ".") when nvcc is not found.
    nvcc_path = shutil.which('nvcc')
    cuda_home = str(Path(nvcc_path).parent.parent) if nvcc_path else ''
    print("-- Using nvcc to detect CUDA, found at " + str(cuda_home))
cuda_include_dir = os.path.join(cuda_home, "include")
cuda_lib_dir = os.path.join(cuda_home, "lib64")

##############################################################################
# - Clean target -------------------------------------------------------------

if clean_artifacts:
    print("-- Cleaning all Python and Cython build artifacts...")

    try:
        setup_file_path = str(Path(__file__).parent.absolute())
        shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True)
        shutil.rmtree(setup_file_path + '/_external_repositories',
                      ignore_errors=True)
        shutil.rmtree(setup_file_path + '/raft.egg-info', ignore_errors=True)
        shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True)

        clean_folder(setup_file_path + '/raft')
        shutil.rmtree(setup_file_path + '/build')

    except IOError:
        pass

    # need to terminate script so cythonizing doesn't get triggered after
    # cleanup unintendedly
    sys.argv.remove("clean")

    if "--all" in sys.argv:
        sys.argv.remove("--all")

    if len(sys.argv) == 1:
        sys.exit(0)


##############################################################################
# - Cython extensions build and parameters -----------------------------------


libs = ['cudart', "nccl", "cusolver", "cusparse", "cublas"]

include_dirs = [cuda_include_dir,
                numpy.get_include(),
                "../cpp/include/",
                os.path.dirname(sysconfig.get_path("include"))]

extensions = [
    Extension("*",
              sources=["raft/**/*.pyx"],
              include_dirs=include_dirs,
              library_dirs=[get_python_lib()],
              runtime_library_dirs=[cuda_lib_dir,
                                    os.path.join(sys.prefix, "lib")],
              libraries=libs,
              language='c++',
              extra_compile_args=['-std=c++17'])
]


class build_ext_no_debug(_build_ext):
    """Custom build_ext that strips debug flags and forces -O3.

    Also cythonizes the .pyx sources itself in ``finalize_options`` so the
    compiler directives (language_level=3, embedsignature) are applied
    consistently.
    """

    def build_extensions(self):
        """Adjust compiler flags before delegating to the normal build."""
        def remove_flags(compiler, *flags):
            # Strip each flag from compiler.compiler_so, ignoring failures
            # (the flag simply may not be present).
            for flag in flags:
                try:
                    compiler.compiler_so = list(
                        filter((flag).__ne__, compiler.compiler_so)
                    )
                except Exception:
                    pass

        # Full optimization
        self.compiler.compiler_so.append("-O3")

        # Ignore deprecation declaration warnings
        self.compiler.compiler_so.append("-Wno-deprecated-declarations")

        # No debug symbols, full optimization, no '-Wstrict-prototypes' warning
        remove_flags(
            self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes"
        )
        super().build_extensions()

    def finalize_options(self):
        """Cythonize the extension modules, then finish setuptools setup."""
        if self.distribution.ext_modules:
            # Delay import this to allow for Cython-less installs
            from Cython.Build.Dependencies import cythonize

            nthreads = getattr(self, "parallel", None)  # -j option in Py3.5+
            nthreads = int(nthreads) if nthreads else None
            self.distribution.ext_modules = cythonize(
                self.distribution.ext_modules,
                nthreads=nthreads,
                force=self.force,
                gdb_debug=False,
                compiler_directives=dict(
                    profile=False, language_level=3, embedsignature=True
                ),
            )
        # Skip calling super() and jump straight to setuptools
        setuptools.command.build_ext.build_ext.finalize_options(self)


# Versioneer's version-aware commands plus our optimized build_ext.
cmdclass = dict()
cmdclass.update(versioneer.get_cmdclass())
cmdclass["build_ext"] = build_ext_no_debug


##############################################################################
# - Python package generation ------------------------------------------------


setup(name='raft',
      description="RAPIDS Analytics Frameworks Toolset",
      version=versioneer.get_version(),
      classifiers=[
          "Intended Audience :: Developers",
          "Programming Language :: Python",
          # Keep in sync with python_requires (>=3.7,<3.9) in setup.cfg.
          "Programming Language :: Python :: 3.7",
          "Programming Language :: Python :: 3.8"
      ],
      author="NVIDIA Corporation",
      setup_requires=['cython'],
      ext_modules=extensions,
      package_data=dict.fromkeys(
          find_packages(include=["raft.dask.common",
                                 "raft.dask.common.includes",
                                 "raft.common",
                                 "raft.common.includes"]),
          ["*.hpp", "*.pxd"],
      ),
      packages=find_packages(include=['raft', 'raft.*']),
      install_requires=install_requires,
      license="Apache",
      cmdclass=cmdclass,
      zip_safe=False
      )
import glob
import os
import shutil
import sys


def get_environment_option(name):
    """Look up environment variable *name* and report the result.

    Returns the variable's value when it is set (and truthy), otherwise
    False.  Prints a status line either way so build logs show what was
    detected.
    """
    value = os.environ.get(name, False)

    if value:
        print("-- " + name + " detected with value: " + str(value))
    else:
        print("-- " + name + " environment variable not set.")

    return value


def get_cli_option(name):
    """Return True (announcing it) when *name* appears in sys.argv."""
    if name not in sys.argv:
        return False

    print("-- Detected " + str(name) + " build option.")
    return True


def clean_folder(path):
    """
    Function to clean all Cython and Python artifacts and cache folders. It
    clean the folder as well as its direct children recursively.

    Parameters
    ----------
    path : String
        Path to the folder to be cleaned.
    """
    # Drop this folder's own bytecode cache first.
    shutil.rmtree(os.path.join(path, '__pycache__'), ignore_errors=True)

    # Then clean every direct child directory: its cache, its own subtree,
    # and any Cython-generated .cpp / compiled .cpython* artifacts.
    for child in glob.glob(path + '/*/'):
        shutil.rmtree(child + '/__pycache__', ignore_errors=True)

        clean_folder(child)

        artifacts = glob.glob(child + '/*.cpp')
        artifacts += glob.glob(child + '/*.cpython*')
        for artifact in artifacts:
            os.remove(artifact)
+* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. 
"myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. 
+ +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. 
+ +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). 
There are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as +  [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change.
+ +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. 
+* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . 
+ """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if 
sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = 
stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. 
However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse 
describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+ +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. 
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + 
print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. 
+ # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. 
You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + 
except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1) From f1b502aee9ff159a17dee6aee22f696875e18a9a Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Mar 2022 20:15:28 -0500 Subject: [PATCH 003/167] More infrastructure stuff --- build.sh | 35 +++++++++++++++------ ci/cpu/build.sh | 6 ++++ ci/cpu/upload.sh | 7 +++++ conda/recipes/pylibraft/build.sh | 4 +++ conda/recipes/pylibraft/meta.yml | 53 ++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 conda/recipes/pylibraft/build.sh create mode 100644 conda/recipes/pylibraft/meta.yml diff --git a/build.sh b/build.sh index 9d3a796c65..b43b683180 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -# cuml build script +# raft build script # This script is used to build the component(s) in this repo from # source, and can be called with various options to customize the @@ -18,13 +18,14 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft docs -v -g --noinstall --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --nogtest --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs -v -g --noinstall --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --nogtest --buildfaiss" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) libraft - build the raft C++ code only. Also builds the C-wrapper library around the C++ code. - pyraft - build the cuml Python package + pyraft - build the pyraft Python package + pylibraft - build the pylibraft Python package docs - build the documentation and is: @@ -45,9 +46,9 @@ HELP="$0 [ ...] [ ...] 
" LIBRAFT_BUILD_DIR=${LIBRAFT_BUILD_DIR:=${REPODIR}/cpp/build} SPHINX_BUILD_DIR=${REPODIR}/docs -PY_RAFT_BUILD_DIR=${REPODIR}/python/build -PYTHON_DEPS_CLONE=${REPODIR}/python/external_repositories -BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PYTHON_DEPS_CLONE}" +PY_RAFT_BUILD_DIR=${REPODIR}/python/raft/build +PY_LIBRAFT_BUILD_DIR=${REPODIR}/python/pylibraft/build +BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PY_LIBRAFT_BUILD_DIR}" # Set defaults for vars modified by flags to this script CMAKE_LOG_LEVEL="" @@ -157,7 +158,11 @@ if (( ${CLEAN} == 1 )); then done - cd ${REPODIR}/python + cd ${REPODIR}/python/raft + python setup.py clean --all + cd ${REPODIR} + + cd ${REPODIR}/python/pylibraft python setup.py clean --all cd ${REPODIR} fi @@ -200,10 +205,21 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs; then fi fi -# Build and (optionally) install the cuml Python package +# Build and (optionally) install the pyraft Python package if (( ${NUMARGS} == 0 )) || hasArg pyraft || hasArg docs; then - cd ${REPODIR}/python + cd ${REPODIR}/python/raft + if [[ ${INSTALL_TARGET} != "" ]]; then + python setup.py build_ext -j${PARALLEL_LEVEL:-1} --inplace --library-dir=${LIBRAFT_BUILD_DIR} install --single-version-externally-managed --record=record.txt + else + python setup.py build_ext -j${PARALLEL_LEVEL:-1} --inplace --library-dir=${LIBRAFT_BUILD_DIR} + fi +fi + +# Build and (optionally) install the pylibraft Python package +if (( ${NUMARGS} == 0 )) || hasArg pylibraft || hasArg docs; then + + cd ${REPODIR}/python/pylibraft if [[ ${INSTALL_TARGET} != "" ]]; then python setup.py build_ext -j${PARALLEL_LEVEL:-1} --inplace --library-dir=${LIBRAFT_BUILD_DIR} install --single-version-externally-managed --record=record.txt else @@ -211,6 +227,7 @@ if (( ${NUMARGS} == 0 )) || hasArg pyraft || hasArg docs; then fi fi + if hasArg docs; then cmake --build ${LIBRAFT_BUILD_DIR} --target docs_raft cd ${SPHINX_BUILD_DIR} diff --git a/ci/cpu/build.sh 
b/ci/cpu/build.sh index 64d46a68c7..4ca22c4277 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -114,10 +114,16 @@ if [ "$BUILD_RAFT" == "1" ]; then gpuci_logger "Building conda packages for pyraft" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft --python=$PYTHON else gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON mkdir -p ${CONDA_BLD_DIR}/pyraft mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft/work + + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON + mkdir -p ${CONDA_BLD_DIR}/pylibraft + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pylibraft/work + fi else gpuci_logger "SKIPPING build of conda packages for pyraft" diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index fe1d651c31..0b217f115b 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -34,6 +34,7 @@ export LIBRAFT_HEADERS_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/ export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` +export PYLIBRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft --python=$PYTHON --output` ################################################################################ # UPLOAD - Conda packages @@ -64,4 +65,10 @@ if [[ "$BUILD_RAFT" == "1" ]]; then echo "Upload pyraft" echo ${PYRAFT_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload 
-u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${PYRAFT_FILE} --no-progress + + test -e ${PYLIBRAFT_FILE} + echo "Upload pylibraft" + echo ${PYLIBRAFT_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${PYLIBRAFT_FILE} --no-progress + fi diff --git a/conda/recipes/pylibraft/build.sh b/conda/recipes/pylibraft/build.sh new file mode 100644 index 0000000000..9119644d3f --- /dev/null +++ b/conda/recipes/pylibraft/build.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# This assumes the script is executed from the root of the repo directory +./build.sh pylibraft diff --git a/conda/recipes/pylibraft/meta.yml b/conda/recipes/pylibraft/meta.yml new file mode 100644 index 0000000000..bde8ab7b8d --- /dev/null +++ b/conda/recipes/pylibraft/meta.yml @@ -0,0 +1,53 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +# Usage: + # conda build . -c conda-forge -c numba -c rapidsai -c pytorch + {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} + {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} + {% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %} + {% set cuda_major=cuda_version.split('.')[0] %} + {% set py_version=environ.get('CONDA_PY', 36) %} + +package: + name: pylibraft + version: {{ version }} + +source: + git_url: ../../.. 
+ +build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + script_env: + - CC + - CXX + - VERSION_SUFFIX + +requirements: + build: + - python x.x + - setuptools + - cython>=0.29,<0.30 + - rmm {{ minor_version }} + - libraft-headers {{ version }} + - cudatoolkit {{ cuda_version }}.* + - cuda-python >=11.5,<12.0 + run: + - python x.x + - libraft-headers {{ version }} + - rmm {{ minor_version }} + - cuda-python >=11.5,<12.0 + - joblib >=0.11 + - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + +tests: # [linux64] + requirements: # [linux64] + - cudatoolkit {{ cuda_version }}.* # [linux64] + imports: # [linux64] + - pylibraft # [linux64] + +about: + home: http://rapids.ai/ + license: Apache-2.0 + # license_file: LICENSE + summary: pylibraft library From 555433bdd277847b1b2dbbd242b15219546b0c51 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Mar 2022 20:18:55 -0500 Subject: [PATCH 004/167] Updating year --- python/raft/raft/__init__.py | 2 +- python/raft/raft/common/__init__.py | 2 +- python/raft/raft/dask/__init__.py | 2 +- python/raft/raft/dask/common/__init__.py | 2 +- python/raft/raft/dask/common/comms.py | 2 +- python/raft/raft/dask/common/ucx.py | 2 +- python/raft/raft/dask/common/utils.py | 2 +- python/raft/raft/include_test/__init__.py | 2 +- python/raft/raft/include_test/raft_include_test.pyx | 2 +- python/raft/raft/test/__init__.py | 2 +- python/raft/raft/test/test_comms.py | 2 +- python/raft/raft/test/test_raft.py | 2 +- python/raft/setuputils.py | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/raft/raft/__init__.py b/python/raft/raft/__init__.py index b2431b4f6c..5face05ef3 100644 --- a/python/raft/raft/__init__.py +++ b/python/raft/raft/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/common/__init__.py b/python/raft/raft/common/__init__.py index b5ef2b3079..62db7d5831 100644 --- a/python/raft/raft/common/__init__.py +++ b/python/raft/raft/common/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/dask/__init__.py b/python/raft/raft/dask/__init__.py index 74231d256f..f6a1c28ea8 100644 --- a/python/raft/raft/dask/__init__.py +++ b/python/raft/raft/dask/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/dask/common/__init__.py b/python/raft/raft/dask/common/__init__.py index c2265f6828..8c25cdde90 100644 --- a/python/raft/raft/dask/common/__init__.py +++ b/python/raft/raft/dask/common/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/dask/common/comms.py b/python/raft/raft/dask/common/comms.py index ee768b41ff..549ac7fccb 100644 --- a/python/raft/raft/dask/common/comms.py +++ b/python/raft/raft/dask/common/comms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/raft/raft/dask/common/ucx.py b/python/raft/raft/dask/common/ucx.py index f61479a0eb..eb246853f4 100644 --- a/python/raft/raft/dask/common/ucx.py +++ b/python/raft/raft/dask/common/ucx.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/dask/common/utils.py b/python/raft/raft/dask/common/utils.py index fdb5acfb5d..daf51530be 100644 --- a/python/raft/raft/dask/common/utils.py +++ b/python/raft/raft/dask/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/include_test/__init__.py b/python/raft/raft/include_test/__init__.py index 2b81c05b26..ea3511ea64 100644 --- a/python/raft/raft/include_test/__init__.py +++ b/python/raft/raft/include_test/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/include_test/raft_include_test.pyx b/python/raft/raft/include_test/raft_include_test.pyx index 6ebcb79256..7d860b4c35 100644 --- a/python/raft/raft/include_test/raft_include_test.pyx +++ b/python/raft/raft/include_test/raft_include_test.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/raft/raft/test/__init__.py b/python/raft/raft/test/__init__.py index df8a4ae3b9..99e0b7fac2 100644 --- a/python/raft/raft/test/__init__.py +++ b/python/raft/raft/test/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/test/test_comms.py b/python/raft/raft/test/test_comms.py index a540e8db10..345cdbf037 100644 --- a/python/raft/raft/test/test_comms.py +++ b/python/raft/raft/test/test_comms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/raft/test/test_raft.py b/python/raft/raft/test/test_raft.py index 9f0524e198..796a4fface 100644 --- a/python/raft/raft/test/test_raft.py +++ b/python/raft/raft/test/test_raft.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft/setuputils.py b/python/raft/setuputils.py index 61cb2da273..8893e09fd3 100755 --- a/python/raft/setuputils.py +++ b/python/raft/setuputils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 4b448f590b92b56ebcbba10886a49456c45dd265 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Mar 2022 20:44:23 -0500 Subject: [PATCH 005/167] Adding missing copyright headers --- conda/recipes/pylibraft/build.sh | 1 + conda/recipes/pyraft/build.sh | 1 + python/pylibraft/versioneer.py | 1 + python/raft/raft/test/conftest.py | 2 ++ python/raft/raft/test/test_interruptible.py | 1 + python/raft/versioneer.py | 1 + 6 files changed, 7 insertions(+) diff --git a/conda/recipes/pylibraft/build.sh b/conda/recipes/pylibraft/build.sh index 9119644d3f..5ac2f5e33c 100644 --- a/conda/recipes/pylibraft/build.sh +++ b/conda/recipes/pylibraft/build.sh @@ -1,3 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. #!/usr/bin/env bash # This assumes the script is executed from the root of the repo directory diff --git a/conda/recipes/pyraft/build.sh b/conda/recipes/pyraft/build.sh index 044a34f906..1981b943cb 100644 --- a/conda/recipes/pyraft/build.sh +++ b/conda/recipes/pyraft/build.sh @@ -1,3 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. #!/usr/bin/env bash # This assumes the script is executed from the root of the repo directory diff --git a/python/pylibraft/versioneer.py b/python/pylibraft/versioneer.py index 64fea1c892..b8c4bc423b 100644 --- a/python/pylibraft/versioneer.py +++ b/python/pylibraft/versioneer.py @@ -1,3 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. # Version: 0.18 diff --git a/python/raft/raft/test/conftest.py b/python/raft/raft/test/conftest.py index 7ba0e36b0e..f5cdc49700 100644 --- a/python/raft/raft/test/conftest.py +++ b/python/raft/raft/test/conftest.py @@ -1,3 +1,5 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + import pytest from dask.distributed import Client diff --git a/python/raft/raft/test/test_interruptible.py b/python/raft/raft/test/test_interruptible.py index 81f4f99ed8..a3559f6476 100644 --- a/python/raft/raft/test/test_interruptible.py +++ b/python/raft/raft/test/test_interruptible.py @@ -1,3 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
import os import pytest diff --git a/python/raft/versioneer.py b/python/raft/versioneer.py index 64fea1c892..b8c4bc423b 100644 --- a/python/raft/versioneer.py +++ b/python/raft/versioneer.py @@ -1,3 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. # Version: 0.18 From f9aa8bc7055c50b1afde06000b529dbdafcaaf19 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Mar 2022 20:45:56 -0500 Subject: [PATCH 006/167] More style --- python/pylibraft/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py index 5103ed1f83..4d33c0ea30 100644 --- a/python/pylibraft/setup.py +++ b/python/pylibraft/setup.py @@ -76,7 +76,8 @@ try: setup_file_path = str(Path(__file__).parent.absolute()) shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) - shutil.rmtree(setup_file_path + '/pylibraft.egg-info', ignore_errors=True) + shutil.rmtree(setup_file_path + '/pylibraft.egg-info', + ignore_errors=True) shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True) clean_folder(setup_file_path + '/pylibraft') From 9c72b7a133210440beeb2fc10f6429c2533d8b71 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Mar 2022 21:41:23 -0500 Subject: [PATCH 007/167] Proper formatting --- conda/recipes/pylibraft/meta.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/conda/recipes/pylibraft/meta.yml b/conda/recipes/pylibraft/meta.yml index bde8ab7b8d..116a8ebebd 100644 --- a/conda/recipes/pylibraft/meta.yml +++ b/conda/recipes/pylibraft/meta.yml @@ -1,12 +1,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. # Usage: - # conda build . -c conda-forge -c numba -c rapidsai -c pytorch - {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} - {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} - {% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %} - {% set cuda_major=cuda_version.split('.')[0] %} - {% set py_version=environ.get('CONDA_PY', 36) %} +# conda build . -c conda-forge -c numba -c rapidsai -c pytorch +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %} +{% set cuda_major=cuda_version.split('.')[0] %} +{% set py_version=environ.get('CONDA_PY', 36) %} package: name: pylibraft @@ -28,14 +28,13 @@ requirements: - python x.x - setuptools - cython>=0.29,<0.30 - - rmm {{ minor_version }} + - librmm {{ minor_version }} - libraft-headers {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 run: - python x.x - libraft-headers {{ version }} - - rmm {{ minor_version }} - cuda-python >=11.5,<12.0 - joblib >=0.11 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} From 2c278ab8db34d2239fb5d48a4ac50bb87f9d138f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Mar 2022 22:08:10 -0500 Subject: [PATCH 008/167] Updating setup --- python/pylibraft/setup.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg index 163304670a..478822b727 100644 --- a/python/pylibraft/setup.cfg +++ b/python/pylibraft/setup.cfg @@ -9,10 +9,10 @@ exclude = __init__.py,versioneer.py [versioneer] VCS = git style = pep440 -versionfile_source = raft/_version.py -versionfile_build = raft/_version.py +versionfile_source = pylibraft/_version.py +versionfile_build = pylibraft/_version.py tag_prefix = v -parentdir_prefix = raft- +parentdir_prefix = pylibraft- [isort] line_length=79 From c0214450ef8f7a397831f33af3d26ae5e1efee6c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Mar 2022 22:37:28 -0500 Subject: [PATCH 009/167] Rename --- conda/recipes/pylibraft/{meta.yml => meta.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename conda/recipes/pylibraft/{meta.yml => meta.yaml} (100%) diff --git a/conda/recipes/pylibraft/meta.yml b/conda/recipes/pylibraft/meta.yaml similarity index 100% rename from conda/recipes/pylibraft/meta.yml rename to conda/recipes/pylibraft/meta.yaml From 8de62fef9fe6836dfa072cf661baf556f95470e2 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Mar 2022 23:01:12 -0500 Subject: [PATCH 010/167] Adding rmm dependency --- conda/recipes/pylibraft/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 116a8ebebd..6275564270 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -28,7 +28,7 @@ requirements: - python x.x - setuptools - cython>=0.29,<0.30 - - librmm {{ minor_version }} + - rmm {{ minor_version }} - libraft-headers {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 From 6e5499fe8ff9cec7aea84c432701eb1865380409 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Mar 2022 23:50:08 -0500 Subject: [PATCH 011/167] Updating gitignore --- .gitignore | 4 ++-- python/raft/record.txt | 44 ------------------------------------------ 2 files changed, 2 insertions(+), 46 deletions(-) delete mode 100644 python/raft/record.txt diff --git a/.gitignore b/.gitignore index 742a37aa35..972d491b86 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,8 @@ build/ build_prims/ dist/ python/**/**/*.cpp -raft/external_repositories -raft/record.txt +python/raft/record.txt +python/pylibraft/record.txt log .ipynb_checkpoints .DS_Store diff --git a/python/raft/record.txt b/python/raft/record.txt deleted file mode 100644 index ecc39a48bf..0000000000 --- a/python/raft/record.txt +++ /dev/null @@ -1,44 +0,0 @@ -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/_version.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/test_comms.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/conftest.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/test_raft.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/test_interruptible.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/include_test/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/comms.py 
-/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/utils.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/ucx.py -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/handle.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/__init__.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/cuda.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/interruptible.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/__pycache__/_version.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/test_comms.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/conftest.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/test_raft.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/test/__pycache__/test_interruptible.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/include_test/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/common/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/__init__.cpython-39.pyc 
-/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/comms.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/utils.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft/dask/common/__pycache__/ucx.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/common/cuda.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/common/handle.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/common/interruptible.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/dask/common/comms_utils.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/dask/common/nccl.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/jects/raft/python/raft/include_test/raft_include_test.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/top_level.txt -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/PKG-INFO -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/SOURCES.txt -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/requires.txt -/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/dependency_links.txt 
-/home/cjnolet/miniconda3/envs/cuml_2204_021922/lib/python3.9/site-packages/raft-22.4.0a0+64.g706df5de8.dirty-py3.9.egg-info/not-zip-safe From 7fc834eade33c384438a27d4083d8c8cd7ea5462 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 2 Mar 2022 12:38:36 -0500 Subject: [PATCH 012/167] Updates --- BUILD.md | 5 +- README.md | 50 +++++++++++-------- ci/local/README.md | 2 +- conda/recipes/pylibraft/meta.yaml | 1 + python/pylibraft/pylibraft/common/mdarray.pxd | 27 ++++++++++ .../distance/{distance.pxd => distance.pyx} | 10 ++++ .../pylibraft/distance/distance_type.pxd | 40 +++++++++++++++ python/pylibraft/setup.py | 7 +-- 8 files changed, 116 insertions(+), 26 deletions(-) create mode 100644 python/pylibraft/pylibraft/common/mdarray.pxd rename python/pylibraft/pylibraft/distance/{distance.pxd => distance.pyx} (66%) create mode 100644 python/pylibraft/pylibraft/distance/distance_type.pxd diff --git a/BUILD.md b/BUILD.md index 1bf3783fae..3b860d2ec5 100644 --- a/BUILD.md +++ b/BUILD.md @@ -69,10 +69,9 @@ cd cpp mkdir build cd build cmake -D BUILD_TESTS=ON -DRAFT_COMPILE_LIBRARIES=ON -DRAFT_ENABLE_NN_DEPENDENCIES=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX ../ -make install +make -j install ``` - RAFT's cmake has the following configurable flags available:. | Flag | Possible Values | Default Value | Behavior | @@ -83,6 +82,8 @@ RAFT's cmake has the following configurable flags available:. | RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. 
| | RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | +| RAFT_STATIC_LINK_LIBRARIES | ON, OFF | ON | Build static link libraries instead of shared libraries | +| | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | | NVTX | ON, OFF | OFF | Enable NVTX Markers | | CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | diff --git a/README.md b/README.md index a79679c579..7fc5b6b3de 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #
 RAFT: RAPIDS Analytics Framework Toolkit
-RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics in the [RAPIDS](https://rapids.ai) ecosystem. +RAFT (Reusable Algorithms, Functions, and other Tools) contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics in the [RAPIDS](https://rapids.ai) ecosystem. By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time @@ -23,12 +23,9 @@ The algorithms in RAFT span the following general categories: RAFT provides a header-only C++ library and pre-compiled shared libraries that can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers. -RAFT also provides a Python library that is currently limited to -1. a python wrapper around the `raft::handle_t` for managing cuda library resources -2. definitions for using `raft::handle_t` directly in cython -3. tools for building multi-node multi-GPU algorithms that leverage [Dask](https://dask.org/) - -The Python API is being improved to wrap the algorithms and primitives from the categories above. +RAFT also provides 2 Python libraries: +- `pyraft` - reusable infrastructure for building analytics, such as tools for building multi-node multi-GPU algorithms that leverage [Dask](https://dask.org/). +- `pylibraft` - cython wrappers around RAFT algorithms and primitives. ## Getting started @@ -68,28 +65,31 @@ raft::distance::pairwise_distance(handle, input.data(), input.data(), ## Installing -RAFT can be installed through conda, cmake-package-manager (cpm), or by building the repository from source. +RAFT can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. 
### Conda -The easiest way to install RAFT is through conda and several packages are provided. +The easiest way to install RAFT is through conda - `libraft-headers` contains all the CUDA/C++ headers - `libraft-nn` (optional) contains precompiled shared libraries for the nearest neighbors algorithms. If FAISS is not already installed in your environment, this will need to be installed to use the nearest neighbors headers. - `libraft-distance` (optional) contains shared libraries for distance algorithms. -- `pyraft` (optional) contains the Python library +- `pyraft` (optional) Python library with reusable infrastructure and tools +- `pylibraft` (optional) Cython wrappers around RAFT algorithms and primitives -To install RAFT with conda (change to `rapidsai-nightly` for more up-to-date but less stable nightly packages) +Use the following command to install RAFT with conda (use `-c rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft +conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft ``` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. ### CPM -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS cmake provides a convenience layer around CPM. 
+ +After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids-cmake in your project, you can begin using RAFT by placing the code snippet below in a file named `get_raft.cmake` and including it in your cmake build with `include(get_raft.cmake)`. -After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids-cmake in your project, you can begin using RAFT by placing the code snippet below in a file named `get_raft.cmake` and including it in your cmake build with `include(get_raft.cmake)`. This will create the `raft::raft` target to add to configure the link libraries for your artifacts. +With the default settings below, the target `raft::raft` will be available for linking. If `RAFT_COMPILE_LIBRARIES` is enabled, the additional targets `raft::nn` and `raft::distance` will also be available. ```cmake @@ -135,13 +135,23 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 ) ``` +You can find a more comprehensive example of the above cmake snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) guide. + ### Source The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository, -1. create an environment with the RAFT dependencies: `conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml` -2. run the build script from the repository root: `./build.sh pyraft libraft --compile-libs` +1. create an environment with the RAFT dependencies: +``` +conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml +conda activate raft_dev +``` + +2. run the build script from the repository root: +``` +./build.sh pyraft pylibraft libraft --compile-libs +``` -The [Build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) guide. 
+The [Build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. ## Folder Structure and Contents @@ -149,12 +159,12 @@ The folder structure mirrors other RAPIDS repos (cuDF, cuML, cuGraph...), with t - `ci`: Scripts for running CI in PRs - `conda`: Conda recipes and development conda environments -- `cpp`: Source code for all C++ code. +- `cpp`: Source code for C++ libraries. - `docs`: Doxygen configuration - - `include`: The C++ API is fully-contained here + - `include`: The C++ API is fully-contained here - `src`: Compiled template specializations for the shared libraries - `docs`: Source code and scripts for building library documentation (doxygen + pydocs) -- `python`: Source code for all Python source code. +- `python`: Source code for Python libraries. ## Contributing diff --git a/ci/local/README.md b/ci/local/README.md index 3b47ef3b53..7126a3973d 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -23,7 +23,7 @@ where: ``` Example Usage: -`bash build.sh -r ~/rapids/raft -i gpuci/rapidsai-base:cuda9.2-ubuntu16.04-gcc5-py3.6` +`bash build.sh -r ~/rapids/raft -i gpuci/rapidsai-base:cuda11.5-ubuntu21.04-py3.8` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page. 
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 6275564270..2565966401 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -35,6 +35,7 @@ requirements: run: - python x.x - libraft-headers {{ version }} + - libraft-distance {{ version }} - cuda-python >=11.5,<12.0 - joblib >=0.11 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} diff --git a/python/pylibraft/pylibraft/common/mdarray.pxd b/python/pylibraft/pylibraft/common/mdarray.pxd new file mode 100644 index 0000000000..1f5e275b82 --- /dev/null +++ b/python/pylibraft/pylibraft/common/mdarray.pxd @@ -0,0 +1,27 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from .handle cimport handle_t +from rmm._lib.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "raft/mdarray.hpp" namespace "raft": + cdef cppclass device_matrix[T]: + pass + + cdef device_matrix[T] make_device_matrix[T](size_t n_rows, + size_t n_cols, + cuda_stream_view stream) diff --git a/python/pylibraft/pylibraft/distance/distance.pxd b/python/pylibraft/pylibraft/distance/distance.pyx similarity index 66% rename from python/pylibraft/pylibraft/distance/distance.pxd rename to python/pylibraft/pylibraft/distance/distance.pyx index 273b4497cc..89232600f6 100644 --- a/python/pylibraft/pylibraft/distance/distance.pxd +++ b/python/pylibraft/pylibraft/distance/distance.pyx @@ -1,3 +1,4 @@ +# # Copyright (c) 2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,3 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # + +from pylibraft.common.handle cimport handle_t +from .distance_type import DistanceType +from pylibraft.common.mdarray cimport make_device_matrix, device_matrix + +cdef pairwise_distance(): + + cdef handle_t handle + cdef device_matrix[int] hellp = make_device_matrix[int](5, 10, handle.get_stream()) diff --git a/python/pylibraft/pylibraft/distance/distance_type.pxd b/python/pylibraft/pylibraft/distance/distance_type.pxd new file mode 100644 index 0000000000..2c01e42e53 --- /dev/null +++ b/python/pylibraft/pylibraft/distance/distance_type.pxd @@ -0,0 +1,40 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance": + + ctypedef enum DistanceType: + L2Expanded "raft::distance::DistanceType::L2Expanded" + L2SqrtExpanded "raft::distance::DistanceType::L2SqrtExpanded" + CosineExpanded "raft::distance::DistanceType::CosineExpanded" + L1 "raft::distance::DistanceType::L1" + L2Unexpanded "raft::distance::DistanceType::L2Unexpanded" + L2SqrtUnexpanded "raft::distance::DistanceType::L2SqrtUnexpanded" + InnerProduct "raft::distance::DistanceType::InnerProduct" + Linf "raft::distance::DistanceType::Linf" + Canberra "raft::distance::DistanceType::Canberra" + LpUnexpanded "raft::distance::DistanceType::LpUnexpanded" + CorrelationExpanded "raft::distance::DistanceType::CorrelationExpanded" + JaccardExpanded "raft::distance::DistanceType::JaccardExpanded" + HellingerExpanded "raft::distance::DistanceType::HellingerExpanded" + Haversine "raft::distance::DistanceType::Haversine" + BrayCurtis "raft::distance::DistanceType::BrayCurtis" + JensenShannon "raft::distance::DistanceType::JensenShannon" + HammingUnexpanded "raft::distance::DistanceType::HammingUnexpanded" + KLDivergence "raft::distance::DistanceType::KLDivergence" + RusselRaoExpanded "raft::distance::DistanceType::RusselRaoExpanded" + DiceExpanded "raft::distance::DistanceType::DiceExpanded" + Precomputed "raft::distance::DistanceType::Precomputed" diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py index 4d33c0ea30..ec7e11ba85 100644 --- a/python/pylibraft/setup.py +++ b/python/pylibraft/setup.py @@ -100,8 +100,7 @@ 
############################################################################## # - Cython extensions build and parameters ----------------------------------- - -libs = ['cudart', "cusolver", "cusparse", "cublas"] +libs = ['raft_distance', 'cudart', "cusolver", "cusparse", "cublas"] include_dirs = [cuda_include_dir, numpy.get_include(), @@ -187,7 +186,9 @@ def finalize_options(self): setup_requires=['cython'], ext_modules=extensions, package_data=dict.fromkeys( - find_packages(include=["pylibraft.common", + find_packages(include=["pylibraft.distance", + "pylibraft.distance.includes", + "pylibraft.common", "pylibraft.common.includes"]), ["*.hpp", "*.pxd"], ), From 401c167a5c6aabecfc49a0ed4d93e8f427cdc08e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 7 Mar 2022 20:18:39 -0500 Subject: [PATCH 013/167] adding raft_runtime public includes --- cpp/include/raft.hpp | 4 + cpp/include/raft/comms/comms.hpp | 1148 +++++++++--------- cpp/include/raft/cudart_utils.h | 461 +++---- cpp/include/raft/error.hpp | 81 +- cpp/include/raft/handle.hpp | 566 ++++----- cpp/include/raft/interruptible.hpp | 389 +++--- cpp/include/raft/linalg/cublas_macros.h | 64 +- cpp/include/raft/linalg/cusolver_macros.h | 61 +- cpp/include/raft_runtime/comms.hpp | 637 ++++++++++ cpp/include/raft_runtime/cublas_macros.hpp | 121 ++ cpp/include/raft_runtime/cudart_utils.hpp | 438 +++++++ cpp/include/raft_runtime/cusolver_macros.hpp | 117 ++ cpp/include/raft_runtime/cusparse_macros.hpp | 128 ++ cpp/include/raft_runtime/error.hpp | 177 +++ cpp/include/raft_runtime/handle.hpp | 339 ++++++ cpp/include/raft_runtime/interruptible.hpp | 271 +++++ cpp/include/raft_runtime/raft.hpp | 20 + 17 files changed, 3673 insertions(+), 1349 deletions(-) create mode 100644 cpp/include/raft_runtime/comms.hpp create mode 100644 cpp/include/raft_runtime/cublas_macros.hpp create mode 100644 cpp/include/raft_runtime/cudart_utils.hpp create mode 100644 cpp/include/raft_runtime/cusolver_macros.hpp create mode 100644 
cpp/include/raft_runtime/cusparse_macros.hpp create mode 100644 cpp/include/raft_runtime/error.hpp create mode 100644 cpp/include/raft_runtime/handle.hpp create mode 100644 cpp/include/raft_runtime/interruptible.hpp create mode 100644 cpp/include/raft_runtime/raft.hpp diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index 08f836d3a8..42370d9e4f 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -14,6 +14,10 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + */ + #include namespace raft { diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 05678a7e49..e8855192ba 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -14,621 +14,629 @@ * limitations under the License. */ -#pragma once +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/comms.hpp instead. + */ + +#ifndef __RAFT_RT_COMMS_H +#define __RAFT_RT_COMMS_H -#include +#pragma once #include #include namespace raft { -namespace comms { + namespace comms { -typedef unsigned int request_t; -enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; -enum class op_t { SUM, PROD, MIN, MAX }; + typedef unsigned int request_t; + enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; + enum class op_t { SUM, PROD, MIN, MAX }; /** * The resulting status of distributed stream synchronization */ -enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted -}; - -template -constexpr datatype_t + enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted + }; -get_type(); + template + constexpr datatype_t -template <> 
-constexpr datatype_t + get_type(); -get_type() -{ - return datatype_t::CHAR; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::CHAR; + } -get_type() -{ - return datatype_t::UINT8; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::UINT8; + } -get_type() -{ - return datatype_t::INT32; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::INT32; + } -get_type() -{ - return datatype_t::UINT32; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::UINT32; + } -get_type() -{ - return datatype_t::INT64; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::INT64; + } -get_type() -{ - return datatype_t::UINT64; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::UINT64; + } -get_type() -{ - return datatype_t::FLOAT32; -} + template <> + constexpr datatype_t -template <> -constexpr datatype_t + get_type() + { + return datatype_t::FLOAT32; + } -get_type() -{ - return datatype_t::FLOAT64; -} + template <> + constexpr datatype_t -class comms_iface { - public: - virtual ~comms_iface() {} + get_type() + { + return datatype_t::FLOAT64; + } - virtual int get_size() const = 0; + class comms_iface { + public: + virtual ~comms_iface() {} - virtual int get_rank() const = 0; + virtual int get_size() const = 0; - virtual std::unique_ptr comm_split(int color, int key) const = 0; + virtual int get_rank() const = 0; - virtual void barrier() const = 0; + virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual status_t sync_stream(cudaStream_t stream) const = 0; + virtual void barrier() const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + virtual 
status_t sync_stream(cudaStream_t stream) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void waitall(int count, request_t array_of_requests[]) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; + virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const 
= 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a 
thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, + size_t recvsize, + int source, cudaStream_t stream) const = 0; -}; - -class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. 
- * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place 
the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, 
- get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, - value_t* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - 
std::vector const& dests, + + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; + }; + + class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. 
+ * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
+ */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place 
the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, 
+ get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, + size_t recvsize, + int source, cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; -}; - -} // namespace comms + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + 
source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; + auto sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; + }; + + } // namespace comms } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 936065afba..8fa1c114e3 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -14,6 +14,15 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. 
+ * Please use raft_runtime/cudart_utils.hpp instead. + */ + + +#ifndef __RAFT_RT_CUDART_UTILS_H +#define __RAFT_RT_CUDART_UTILS_H + #pragma once #include @@ -38,10 +47,10 @@ namespace raft { /** * @brief Exception thrown when a CUDA error is encountered. */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} -}; + struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} + }; } // namespace raft @@ -133,99 +142,99 @@ struct cuda_error : public raft::exception { namespace raft { /** Helper method to get to know warp size in device code */ -__host__ __device__ constexpr inline int warp_size() { return 32; } + __host__ __device__ constexpr inline int warp_size() { return 32; } -__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } + __host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to threads. 
*/ -class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } -}; + class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t 
max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } + }; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to warps. */ -class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; + class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern 
etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to blocks. */ -class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; + class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't 
be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; /** * @brief Generic copy method for all kinds of transfers @@ -235,11 +244,11 @@ class grid_1d_block_t { * @param len lenth of the src/dst buffers in terms of number of elements * @param stream cuda stream */ -template -void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); -} + template + void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) + { + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + } /** * @defgroup Copy Copy methods @@ -248,152 +257,152 @@ void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) * @{ */ /** performs a host to device copy */ -template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(d_ptr, h_ptr, len, stream); -} + template + void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) + { + copy(d_ptr, h_ptr, len, stream); + } /** performs a device to host copy */ -template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(h_ptr, d_ptr, len, stream); -} - -template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); -} + template + void update_host(Type* h_ptr, const 
Type* d_ptr, size_t len, rmm::cuda_stream_view stream) + { + copy(h_ptr, d_ptr, len, stream); + } + + template + void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) + { + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + } /** @} */ /** * @defgroup Debug Utils for debugging host/device buffers * @{ */ -template -void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) -{ - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; -} - -template -void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) -{ - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; -} + template + void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) + { + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; + } + + template + void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) + { + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; + } /** @} */ -static std::mutex mutex_; -static std::unordered_map allocations; - -template -void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) -{ - size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) 
CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); - - std::lock_guard _(mutex_); - allocations[ptr] = size; -} - -template -void deallocate(Type*& ptr, rmm::cuda_stream_view stream) -{ - std::lock_guard _(mutex_); - size_t size = allocations[ptr]; - allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); -} - -inline void deallocate_all(rmm::cuda_stream_view stream) -{ - std::lock_guard _(mutex_); - for (auto& alloc : allocations) { - void* ptr = alloc.first; - size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); - } - allocations.clear(); -} + static std::mutex mutex_; + static std::unordered_map allocations; + + template + void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) + { + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; + } + + template + void deallocate(Type*& ptr, rmm::cuda_stream_view stream) + { + std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); + } + + inline void deallocate_all(rmm::cuda_stream_view stream) + { + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); + } /** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; -} + inline int getSharedMemPerBlock() + { + int devId; + 
RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; + } /** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; -} + inline int getMultiProcessorCount() + { + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; + } /** helper method to convert an array on device to a string on host */ -template -std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) -{ - std::stringstream ss; + template + std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) + { + std::stringstream ss; - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; - free(arr_h); + free(arr_h); - return ss.str(); -} + return ss.str(); + } /** this seems to be unused, but may be useful in the future */ -template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) -{ - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered 
in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; -} - -inline uint32_t curTimeMillis() -{ - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); -} + template + void ASSERT_DEVICE_MEM(T* ptr, std::string name) + { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; + } + + inline uint32_t curTimeMillis() + { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); + } /** Helper function to calculate need memory for allocate to store dense matrix. * @param rows number of rows in matrix @@ -401,33 +410,35 @@ inline uint32_t curTimeMillis() * @return need number of items to allocate via allocate() * @sa allocate() */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } + inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. 
* @param ptr the pointer to check * @param alignment to be checked for * @return true if address in bytes is a multiple of alignment */ -template -bool is_aligned(Type* ptr, size_t alignment) -{ - return reinterpret_cast(ptr) % alignment == 0; -} + template + bool is_aligned(Type* ptr, size_t alignment) + { + return reinterpret_cast(ptr) % alignment == 0; + } /** calculate greatest common divisor of two numbers * @a integer * @b integer * @ return gcd of a and b */ -template -IntType gcd(IntType a, IntType b) -{ - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; -} + template + IntType gcd(IntType a, IntType b) + { + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; + } } // namespace raft + +#endif diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 0eba4326e6..1dc6e3d755 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -14,6 +14,15 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the include/raft_runtime/error.hpp instead. 
+ */ + + +#ifndef __RAFT_RT_ERROR +#define __RAFT_RT_ERROR + #pragma once #include @@ -27,36 +36,36 @@ namespace raft { /** base exception class for the whole of raft */ -class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { + class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { + collect_call_stack(); + } + + /** get the message associated with this exception */ + char const* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept + { #ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; + constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT auto 
depth = backtrace(stack, kMaxStackDepth); std::ostringstream oss; @@ -74,8 +83,8 @@ class exception : public std::exception { free(strings); msg_ += oss.str(); #endif // __GNUC__ - } -}; + } + }; /** * @brief Exception thrown when logical precondition is violated. @@ -84,10 +93,10 @@ class exception : public std::exception { * RAFT_EXPECTS and RAFT_FAIL macros. * */ -struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} -}; + struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} + }; } // namespace raft @@ -169,3 +178,5 @@ struct logic_error : public raft::exception { SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ throw raft::logic_error(msg); \ } while (0) + +#endif \ No newline at end of file diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 7d6a5bfafd..13a3fc26d9 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -14,6 +14,14 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the include/raft_runtime/handle.hpp instead. 
+ */ + +#ifndef __RAFT_RT_HANDLE +#define __RAFT_RT_HANDLE + #pragma once #include @@ -48,287 +56,289 @@ namespace raft { * @brief Main handle object that stores all necessary context used for calling * necessary cuda kernels and/or libraries */ -class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - - /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() 
const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? 
stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = 
true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } -}; // class handle_t + class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) = delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool 
used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void 
sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = 
true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } + }; // class handle_t /** * @brief RAII approach to synchronizing across all streams in the handle */ -class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; -}; // class stream_syncer + class stream_syncer { + public: + explicit 
stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; + }; // class stream_syncer } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/interruptible.hpp b/cpp/include/raft/interruptible.hpp index 7ff5ca0c88..959d1063fb 100644 --- a/cpp/include/raft/interruptible.hpp +++ b/cpp/include/raft/interruptible.hpp @@ -14,6 +14,15 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the include/raft_runtime/interruptible.hpp instead. + */ + + +#ifndef __RAFT_RT_INTERRUPTIBLE_H +#define __RAFT_RT_INTERRUPTIBLE_H + #pragma once #include @@ -31,9 +40,9 @@ namespace raft { * @brief Exception thrown during `interruptible::synchronize` call when it detects a request * to cancel the work performed in this CPU thread. */ -struct interrupted_exception : public raft::exception { - using raft::exception::exception; -}; + struct interrupted_exception : public raft::exception { + using raft::exception::exception; + }; /** * @brief Cooperative-style interruptible execution. @@ -60,207 +69,209 @@ struct interrupted_exception : public raft::exception { * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and * resuming of the GPU stream work. */ -class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. 
- * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } + class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. 
It's called in the internals of - * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. 
+ * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. - * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. - */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. In that case, using its + * `cancel` method has no effect. + * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. 
+ */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. 
+ */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. - * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. 
- auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. + auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. 
The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } + } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; - } - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. */ - bool claimed_ = false; + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; - interruptible() noexcept { yield_no_throw_impl(); } + interruptible() noexcept { yield_no_throw_impl(); } - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } -}; + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } + }; } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h index 1cb5cfc81a..5a96444e45 100644 --- a/cpp/include/raft/linalg/cublas_macros.h +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -14,6 +14,14 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cublas_macros.hpp instead. + */ + +#ifndef __RAFT_RT_CUBLAS_MACROS_H +#define __RAFT_RT_CUBLAS_MACROS_H + #pragma once #include @@ -32,33 +40,33 @@ namespace raft { /** * @brief Exception thrown when a cuBLAS error is encountered. 
*/ -struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cublas_error_to_string(cublasStatus_t err) -{ - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg + struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} + }; + + namespace linalg { + namespace detail { + + inline const char* cublas_error_to_string(cublasStatus_t err) + { + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; + } + + } // namespace detail + } // namespace linalg } // namespace raft #undef _CUBLAS_ERR_TO_STR @@ -114,3 +122,5 @@ inline const char* 
cublas_error_to_string(cublasStatus_t err) #ifndef CUBLAS_CHECK_NO_THROW #define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) #endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h index 6db0577509..a97c4d2332 100644 --- a/cpp/include/raft/linalg/cusolver_macros.h +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -14,6 +14,15 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cusolver_macros.hpp instead. + */ + + +#ifndef __RAFT_RT_CUSOLVER_MACROS_H +#define __RAFT_RT_CUSOLVER_MACROS_H + #pragma once #include @@ -31,31 +40,31 @@ namespace raft { /** * @brief Exception thrown when a cuSOLVER error is encountered. */ -struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { - -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; -} - -} // namespace linalg + struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : 
raft::exception(message) {} + }; + + namespace linalg { + + inline const char* cusolver_error_to_string(cusolverStatus_t err) + { + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; + } + + } // namespace linalg } // namespace raft #undef _CUSOLVER_ERR_TO_STR @@ -110,3 +119,5 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) #ifndef CUSOLVER_CHECK_NO_THROW #define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) #endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/comms.hpp b/cpp/include/raft_runtime/comms.hpp new file mode 100644 index 0000000000..cf3ffe350c --- /dev/null +++ b/cpp/include/raft_runtime/comms.hpp @@ -0,0 +1,637 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RAFT_RT_COMMS_H +#define __RAFT_RT_COMMS_H + +#pragma once + +#include +#include + +namespace raft { + namespace comms { + + typedef unsigned int request_t; + enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; + enum class op_t { SUM, PROD, MIN, MAX }; + +/** + * The resulting status of distributed stream synchronization + */ + enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted + }; + + template + constexpr datatype_t + + get_type(); + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::CHAR; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::UINT8; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::INT32; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::UINT32; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::INT64; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::UINT64; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::FLOAT32; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::FLOAT64; + } + + class comms_iface { + public: + virtual ~comms_iface() {} + + virtual int get_size() const = 0; + + virtual int get_rank() const = 0; + + virtual std::unique_ptr comm_split(int color, int key) const = 0; + + virtual void barrier() const = 0; + + virtual status_t sync_stream(cudaStream_t stream) const = 0; + + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + + virtual void waitall(int count, request_t array_of_requests[]) const = 0; + + virtual void allreduce(const void* sendbuff, + void* 
recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const = 0; + + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& 
sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; + }; + + class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. + * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. 
+ * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send 
+ * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + 
template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * 
@param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, + get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes 
numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; + auto sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; + }; + + } // namespace comms +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/cublas_macros.hpp b/cpp/include/raft_runtime/cublas_macros.hpp new file mode 100644 index 0000000000..6344333b3d --- /dev/null +++ b/cpp/include/raft_runtime/cublas_macros.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_RT_CUBLAS_MACROS_H +#define __RAFT_RT_CUBLAS_MACROS_H + +#pragma once + +#include +#include + +///@todo: enable this once we have logger enabled +//#include + +#include + +#define _CUBLAS_ERR_TO_STR(err) \ + case err: return #err + +namespace raft { + +/** + * @brief Exception thrown when a cuBLAS error is encountered. + */ + struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} + }; + + namespace linalg { + namespace detail { + + inline const char* cublas_error_to_string(cublasStatus_t err) + { + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; + } + + } // namespace detail + } // namespace linalg +} // namespace raft + +#undef _CUBLAS_ERR_TO_STR + +/** + * @brief Error checking macro for cuBLAS runtime API functions. 
+ * + * Invokes a cuBLAS runtime API function call, if the call does not return + * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred + */ +#define RAFT_CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUBLAS_TRY +#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUBLAS_TRY_NO_THROW(call) \ + do { \ + cublasStatus_t const status = call; \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + } \ + } while (0) + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK +#define CUBLAS_CHECK(call) CUBLAS_TRY(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK_NO_THROW +#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/cudart_utils.hpp b/cpp/include/raft_runtime/cudart_utils.hpp new file mode 100644 index 0000000000..d4d3d86a3d --- /dev/null +++ b/cpp/include/raft_runtime/cudart_utils.hpp @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_RT_CUDART_UTILS_H +#define __RAFT_RT_CUDART_UTILS_H + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +///@todo: enable once logging has been enabled in raft +//#include "logger.hpp" + +namespace raft { + +/** + * @brief Exception thrown when a CUDA error is encountered. + */ + struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} + }; + +} // namespace raft + +/** + * @brief Error checking macro for CUDA runtime API functions. + * + * Invokes a CUDA runtime API function call, if the call does not return + * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an + * exception detailing the CUDA error that occurred + * + */ +#define RAFT_CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUDA_TRY +#define CUDA_TRY(call) RAFT_CUDA_TRY(call) +#endif + +/** + * @brief Debug macro to check for CUDA errors + * + * In a non-release build, this macro will synchronize the specified stream + * before error checking. 
In both release and non-release builds, this macro + * checks for any pending CUDA errors from previous calls. If an error is + * reported, an exception is thrown detailing the CUDA error that occurred. + * + * The intent of this macro is to provide a mechanism for synchronous and + * deterministic execution for debugging asynchronous CUDA execution. It should + * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an + * asynchronous kernel launch. + */ +#ifndef NDEBUG +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#else +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); +#endif + +// FIXME: Remove after consumers rename +#ifndef CHECK_CUDA +#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUDA_CHECK +#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUDA_TRY_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ + } while (0) + +// FIXME: Remove after cuml rename +#ifndef CUDA_CHECK_NO_THROW +#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) +#endif + +/** + * Alias to raft scope for now. + * TODO: Rename original implementations in 22.04 to fix + * https://github.com/rapidsai/raft/issues/128 + */ + +namespace raft { + +/** Helper method to get to know warp size in device code */ + __host__ __device__ constexpr inline int warp_size() { return 32; } + + __host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to threads. 
+ */ + class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } + }; + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to warps. 
+ */ + class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to blocks. 
+ */ + class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; + +/** + * @brief Generic copy method for all kinds of transfers + * @tparam Type data type + * @param dst destination pointer + * @param src source pointer + * @param len lenth of the src/dst buffers in terms of number of elements + * @param stream cuda stream + */ + template + void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) + { + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + } + +/** + * @defgroup Copy Copy methods + * These are here along with the generic 'copy' method in order to improve + * code readability using explicitly specified function names + * @{ + */ +/** performs a host to device copy */ + template + void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) + { + copy(d_ptr, h_ptr, len, stream); + } + +/** performs a device to host copy */ + template + void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) + { + copy(h_ptr, d_ptr, len, stream); + } + + template + void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, 
rmm::cuda_stream_view stream) + { + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + } +/** @} */ + +/** + * @defgroup Debug Utils for debugging host/device buffers + * @{ + */ + template + void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) + { + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; + } + + template + void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) + { + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; + } +/** @} */ + + static std::mutex mutex_; + static std::unordered_map allocations; + + template + void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) + { + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; + } + + template + void deallocate(Type*& ptr, rmm::cuda_stream_view stream) + { + std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); + } + + inline void deallocate_all(rmm::cuda_stream_view stream) + { + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); + } + +/** helper method to get max usable shared mem per block parameter */ + inline int getSharedMemPerBlock() + { + int devId; + 
RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; + } + +/** helper method to get multi-processor count parameter */ + inline int getMultiProcessorCount() + { + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; + } + +/** helper method to convert an array on device to a string on host */ + template + std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) + { + std::stringstream ss; + + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; + + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; + + free(arr_h); + + return ss.str(); + } + +/** this seems to be unused, but may be useful in the future */ + template + void ASSERT_DEVICE_MEM(T* ptr, std::string name) + { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; + } + + inline uint32_t curTimeMillis() + { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); + } + +/** Helper function to calculate need memory for allocate to store dense matrix. 
+ * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ + inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } + +/** Helper function to check alignment of pointer. + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ + template + bool is_aligned(Type* ptr, size_t alignment) + { + return reinterpret_cast(ptr) % alignment == 0; + } + +/** calculate greatest common divisor of two numbers + * @a integer + * @b integer + * @ return gcd of a and b + */ + template + IntType gcd(IntType a, IntType b) + { + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; + } + +} // namespace raft + +#endif diff --git a/cpp/include/raft_runtime/cusolver_macros.hpp b/cpp/include/raft_runtime/cusolver_macros.hpp new file mode 100644 index 0000000000..93ad422051 --- /dev/null +++ b/cpp/include/raft_runtime/cusolver_macros.hpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RAFT_RT_CUSOLVER_MACROS_H +#define __RAFT_RT_CUSOLVER_MACROS_H + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include +#include +#include + +#define _CUSOLVER_ERR_TO_STR(err) \ + case err: return #err; + +namespace raft { + +/** + * @brief Exception thrown when a cuSOLVER error is encountered. + */ + struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} + }; + + namespace linalg { + + inline const char* cusolver_error_to_string(cusolverStatus_t err) + { + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; + } + + } // namespace linalg +} // namespace raft + +#undef _CUSOLVER_ERR_TO_STR + +/** + * @brief Error checking macro for cuSOLVER runtime API functions. 
+ * + * Invokes a cuSOLVER runtime API function call, if the call does not return + * CUSOLVER_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred + */ +#define RAFT_CUSOLVER_TRY(call) \ + do { \ + cusolverStatus_t const status = (call); \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::cusolver_error_to_string(status)); \ + throw raft::cusolver_error(msg); \ + } \ + } while (0) + +// FIXME: remove after consumer rename +#ifndef CUSOLVER_TRY +#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ + do { \ + cusolverStatus_t const status = call; \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::cusolver_error_to_string(status)); \ + } \ + } while (0) + +// FIXME: remove after cuml rename +#ifndef CUSOLVER_CHECK +#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) +#endif + +#ifndef CUSOLVER_CHECK_NO_THROW +#define CUSOLVER_CHECK_NO_THROW(call) RAFT_CUSOLVER_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/cusparse_macros.hpp b/cpp/include/raft_runtime/cusparse_macros.hpp new file mode 100644 index 0000000000..2584513e0a --- /dev/null +++ b/cpp/include/raft_runtime/cusparse_macros.hpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_RT_CUSPARSE_MACROS_H +#define __RAFT_RT_CUSPARSE_MACROS_H + +#pragma once + +#include +#include "error.hpp" +///@todo: enable this once logging is enabled +//#include + +#define _CUSPARSE_ERR_TO_STR(err) \ + case err: return #err; + +// Notes: +//(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; +//(2.) to enforce a lower version, +// +//`#define CUDA_ENFORCE_LOWER +// #include ` +// +// (i.e., before including this header) +// +#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) + +namespace raft { + +/** + * @brief Exception thrown when a cuSparse error is encountered. 
+ */ + struct cusparse_error : public raft::exception { + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} + }; + + namespace sparse { + namespace detail { + + inline const char* cusparse_error_to_string(cusparseStatus_t err) + { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 + return cusparseGetErrorString(err); +#else // CUDART_VERSION + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: return "CUSPARSE_STATUS_UNKNOWN"; + }; +#endif // CUDART_VERSION + } + + } // namespace detail + } // namespace sparse +} // namespace raft + +#undef _CUSPARSE_ERR_TO_STR + +/** + * @brief Error checking macro for cuSparse runtime API functions. 
+ * + * Invokes a cuSparse runtime API function call, if the call does not return + * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + */ +#define RAFT_CUSPARSE_TRY(call) \ + do { \ + cusparseStatus_t const status = (call); \ + if (CUSPARSE_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::sparse::detail::cusparse_error_to_string(status)); \ + throw raft::cusparse_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_TRY +#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) +#endif + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK +#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) +#endif + +//@todo: use logger here once logging is enabled +/** check for cusparse runtime API errors but do not assert */ +#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ + } while (0) + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK_NO_THROW +#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/error.hpp b/cpp/include/raft_runtime/error.hpp new file mode 100644 index 0000000000..b3ed70c5d2 --- /dev/null +++ b/cpp/include/raft_runtime/error.hpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef __RAFT_RT_ERROR +#define __RAFT_RT_ERROR + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** base exception class for the whole of raft */ + class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { + collect_call_stack(); + } + + /** get the message associated with this exception */ + char const* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept + { +#ifdef __GNUC__ + constexpr int kMaxStackDepth = 64; + void* stack[kMaxStackDepth]; // NOLINT + auto depth = backtrace(stack, kMaxStackDepth); + std::ostringstream oss; + oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; + char** strings = backtrace_symbols(stack, depth); + if (strings == nullptr) { + oss << "But no stack trace could be found!" 
<< std::endl; + msg_ += oss.str(); + return; + } + ///@todo: support for demangling of C++ symbol names + for (int i = 0; i < depth; ++i) { + oss << "#" << i << " in " << strings[i] << std::endl; + } + free(strings); + msg_ += oss.str(); +#endif // __GNUC__ + } + }; + +/** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * RAFT_EXPECTS and RAFT_FAIL macros. + * + */ + struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} + }; + +} // namespace raft + +// FIXME: Need to be replaced with RAFT_FAIL +/** macro to throw a runtime error */ +#define THROW(fmt, ...) \ + do { \ + int size1 = \ + std::snprintf(nullptr, 0, "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ + if (size1 < 0 || size2 < 0) \ + throw raft::exception("Error in snprintf, cannot handle raft exception."); \ + auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ + auto buf = std::make_unique(size_t(size)); \ + std::snprintf(buf.get(), \ + size1 + 1 /* +1 for '\0' */, \ + "exception occured! file=%s line=%d: ", \ + __FILE__, \ + __LINE__); \ + std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ + std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ + throw raft::exception(msg); \ + } while (0) + +// FIXME: Need to be replaced with RAFT_EXPECTS +/** macro to check for a conditional and assert on failure */ +#define ASSERT(check, fmt, ...) \ + do { \ + if (!(check)) THROW(fmt, ##__VA_ARGS__); \ + } while (0) + +/** + * Macro to append error message to first argument. + * This should only be called in contexts where it is OK to throw exceptions! + */ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ + do { \ + int size1 = std::snprintf(nullptr, 0, "%s", location_prefix); \ + int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__); \ + int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ + if (size1 < 0 || size2 < 0 || size3 < 0) \ + throw raft::exception("Error in snprintf, cannot handle raft exception."); \ + auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */ \ + auto buf = std::make_unique(size_t(size)); \ + std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix); \ + std::snprintf( \ + buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__, __LINE__); \ + std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ + msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ + } while (0) + +/** + * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false + * + * @param[in] cond Expression that evaluates to true or false + * @param[in] fmt String literal description of the reason that cond is expected to be true with + * optinal format tagas + * @throw raft::logic_error if the condition evaluates to false. + */ +#define RAFT_EXPECTS(cond, fmt, ...) \ + do { \ + if (!(cond)) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ + throw raft::logic_error(msg); \ + } \ + } while (0) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * @param[in] fmt String literal description of the reason that this code path is erroneous with + * optinal format tagas + * @throw always throws raft::logic_error + */ +#define RAFT_FAIL(fmt, ...) 
\ + do { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ + throw raft::logic_error(msg); \ + } while (0) + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/handle.hpp b/cpp/include/raft_runtime/handle.hpp new file mode 100644 index 0000000000..b8bafedc69 --- /dev/null +++ b/cpp/include/raft_runtime/handle.hpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RAFT_RT_HANDLE +#define __RAFT_RT_HANDLE + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +///@todo: enable once we have migrated cuml-comms layer too +//#include + +#include "cudart_utils.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Main handle object that stores all necessary context used for calling + * necessary cuda kernels and/or libraries + */ + class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) = delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { 
+ RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? 
stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = 
true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } + }; // class handle_t + +/** + * @brief RAII approach to synchronizing across all streams in the handle + */ + class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; + }; // class stream_syncer + +} // namespace raft + +#endif \ No 
newline at end of file diff --git a/cpp/include/raft_runtime/interruptible.hpp b/cpp/include/raft_runtime/interruptible.hpp new file mode 100644 index 0000000000..a6bb80533c --- /dev/null +++ b/cpp/include/raft_runtime/interruptible.hpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_RT_INTERRUPTIBLE_H +#define __RAFT_RT_INTERRUPTIBLE_H + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Exception thrown during `interruptible::synchronize` call when it detects a request + * to cancel the work performed in this CPU thread. + */ + struct interrupted_exception : public raft::exception { + using raft::exception::exception; + }; + +/** + * @brief Cooperative-style interruptible execution. + * + * This class provides facilities for interrupting execution of a C++ thread at designated points + * in code from outside of the thread. In particular, it provides an interruptible version of the + * blocking CUDA synchronization function, that allows dropping a long-running GPU work. + * + * + * **Important:** Although CUDA synchronize calls serve as cancellation points, the interruptible + * machinery has nothing to do with CUDA streams or events. In other words, when you call `cancel`, + * it’s the CPU waiting function what is interrupted, not the GPU stream work. 
This means, when the + * `interrupted_exception` is raised, any unfinished GPU stream work continues to run. It’s the + * responsibility of the developer then to make sure the unfinished stream work does not affect the + * program in an undesirable way. + * + * + * What can happen to CUDA stream when the `synchronize` is cancelled? If you catch the + * `interrupted_exception` immediately, you can safely wait on the stream again. + * Otherwise, some of the allocated resources may be released before the active kernel finishes + * using them, which will result in writing into deallocated or reallocated memory and undefined + * behavior in general. A dead-locked kernel may never finish (or may crash if you’re lucky). In + * practice, the outcome is usually acceptable for the use case of emergency program interruption + * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and + * resuming of the GPU stream work. + */ + class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } + + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. 
+ */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } + + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } + + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } + + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. In that case, using its + * `cancel` method has no effect. 
+ * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. + */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } + + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. + */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; + + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; + + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. 
get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. + auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } + } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; + } + + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; + + interruptible() noexcept { yield_no_throw_impl(); } + + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } + + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } + + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } + }; + +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/raft.hpp b/cpp/include/raft_runtime/raft.hpp new file mode 100644 index 0000000000..00879bad1a --- /dev/null +++ b/cpp/include/raft_runtime/raft.hpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "comms.hpp" +#include "error.hpp" +#include "handle.hpp" +#include "interruptible.hpp" From cb40df964f36170f24b5c6149a6bb9add2d5d009 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 8 Mar 2022 14:14:16 -0500 Subject: [PATCH 014/167] Toy example --- cpp/CMakeLists.txt | 88 ++++++++++++++------ cpp/cmake/thirdparty/get_cuco.cmake | 1 + cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 +- cpp/cmake/thirdparty/get_mdspan.cmake | 1 + cpp/cmake/thirdparty/get_rmm.cmake | 5 +- cpp/include/raft/comms/comms.hpp | 1 + cpp/include/raft_runtime/comms.hpp | 1 + cpp/include/raft_runtime/cublas_macros.hpp | 2 +- cpp/include/raft_runtime/cudart_utils.hpp | 2 +- cpp/include/raft_runtime/cusolver_macros.hpp | 2 +- cpp/include/raft_runtime/cusparse_macros.hpp | 2 +- cpp/include/raft_runtime/handle.hpp | 12 +-- cpp/include/raft_runtime/interruptible.hpp | 4 +- 13 files changed, 83 insertions(+), 42 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c13ee03a33..73762fc521 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -49,6 +49,7 @@ option(DISABLE_DEPRECATION_WARNINGS "Disable depreaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(NVTX "Enable nvtx markers" OFF) option(RAFT_STATIC_LINK_LIBRARIES "Statically link compiled libraft libraries") +option(RAFT_RUNTIME_ONLY "Install only RAFT runtime" OFF) option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ON) option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" OFF) @@ -97,8 +98,8 @@ endif() # * enable the CMake CUDA language # * set other CUDA compilation flags rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-runtime-exports + INSTALL_EXPORT_SET raft-runtime-exports ) include(cmake/modules/ConfigureCUDA.cmake) @@ -122,10 +123,32 @@ if(BUILD_TESTS) include(cmake/thirdparty/get_ucx.cmake) endif() +############################################################################## +# - raft_runtime ------------------------------------------------------------- + +add_library(raft_runtime INTERFACE) 
+ +if(TARGET raft_runtime AND (NOT TARGET raft::runtime)) + add_library(raft::runtime ALIAS raft_runtime) +endif() + +set_target_properties(raft_runtime PROPERTIES EXPORT_NAME runtime) + +target_link_libraries(raft_runtime INTERFACE + CUDA::cublas + CUDA::curand + CUDA::cusolver + CUDA::cudart + CUDA::cusparse + rmm::rmm) + +target_compile_definitions(raft_runtime INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft_runtime INTERFACE cxx_std_17 $) + ############################################################################## # - raft --------------------------------------------------------------------- -add_library(raft INTERFACE) +add_library(raft INTERFACE EXCLUDE_FROM_ALL) add_library(raft::raft ALIAS raft) target_include_directories(raft INTERFACE @@ -134,13 +157,8 @@ target_include_directories(raft INTERFACE target_link_libraries(raft INTERFACE raft::Thrust - CUDA::cublas - CUDA::curand - CUDA::cusolver - CUDA::cudart - CUDA::cusparse + raft::runtime $<$:CUDA::nvToolsExt> - rmm::rmm cuco::cuco std::mdspan) @@ -213,15 +231,15 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" ) - target_compile_definitions(raft_distance_lib - INTERFACE "RAFT_DISTANCE_COMPILED") + target_compile_definitions(raft_distance_lib INTERFACE "RAFT_DISTANCE_COMPILED") # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_distance_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") endif() -target_link_libraries(raft_distance INTERFACE raft::raft +# TODO: Create public header(s) for exposed distance functions +target_link_libraries(raft_distance INTERFACE $ $ ) @@ -259,12 +277,12 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_nn_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") - target_compile_definitions(raft_nn_lib - INTERFACE 
"RAFT_NN_COMPILED") + target_compile_definitions(raft_nn_lib INTERFACE "RAFT_NN_COMPILED") endif() -target_link_libraries(raft_nn INTERFACE raft::raft faiss::faiss +# TODO: Create public header(s) for exposed nn functions +target_link_libraries(raft_nn INTERFACE faiss::faiss $ $) @@ -274,6 +292,9 @@ rapids_cmake_install_lib_dir( lib_dir ) include(GNUInstallDirs) include(CPack) +install(TARGETS raft_runtime + DESTINATION ${lib_dir} + EXPORT raft-runtime-exports) install(TARGETS raft DESTINATION ${lib_dir} EXPORT raft-exports) @@ -296,14 +317,12 @@ if(TARGET raft_nn_lib) EXPORT raft-nn-lib-exports) endif() - -install(DIRECTORY include/raft/ - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft - ) +install(DIRECTORY include/raft_runtime + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft) +install(FILES include/raft_runtime/raft.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) ############################################################################## # - install export ----------------------------------------------------------- @@ -311,9 +330,8 @@ set(doc_string [=[ Provide targets for the RAFT: RAPIDS Analytics Framework Toolkit. -RAPIDS Analytics Framework Toolkit contains shared representations, -mathematical computational primitives, and utilities that accelerate -building analytics and data science algorithms in the RAPIDS ecosystem. +RAFT (Reusable Analytics Functions and other Tools) contains fundamental +widely-used algorithms and primitives for data science, graph, and ml. 
Optional Components: - nn @@ -321,6 +339,7 @@ Optional Components: Imported Targets: - raft::raft + - raft::runtime - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component @@ -329,7 +348,7 @@ Imported Targets: set(code_string [=[ if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) + thrust_create_target(raft::Thrust FROM_OPTIONS EXCLUDE_FROM_ALL TRUE) endif() if(distance IN_LIST raft_FIND_COMPONENTS) @@ -350,10 +369,18 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) +raft_export(INSTALL runtime + EXPORT_SET raft-runtime-exports + GLOBAL_TARGETS runtime raft + NAMESPACE raft:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string + ) + raft_export(INSTALL raft EXPORT_SET raft-exports COMPONENTS nn distance - GLOBAL_TARGETS raft nn distance + GLOBAL_TARGETS runtime raft nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string @@ -361,11 +388,18 @@ raft_export(INSTALL raft ############################################################################## # - build export ------------------------------------------------------------- +raft_export(BUILD runtime + EXPORT_SET raft-runtime-exports + GLOBAL_TARGETS runtime raft + DOCUMENTATION doc_string + NAMESPACE raft:: + FINAL_CODE_BLOCK code_string + ) raft_export(BUILD raft EXPORT_SET raft-exports COMPONENTS nn distance - GLOBAL_TARGETS raft raft_distance raft_nn + GLOBAL_TARGETS runtime raft raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 381addb03c..da733d0ef1 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -21,6 +21,7 @@ function(find_and_configure_cuco VERSION) BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS + EXCLUDE_FROM_ALL TRUE 
GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 OPTIONS "BUILD_TESTS OFF" diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 5343250dca..7d0f2d7d86 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,7 +16,9 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + EXCLUDE_FROM_ALL TRUE) endfunction() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index c88d4e6857..820bb0a4d1 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -5,6 +5,7 @@ function(find_and_configure_mdspan VERSION) BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS + EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/rapidsai/mdspan.git GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 OPTIONS "MDSPAN_ENABLE_CUDA ON" diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 7c155d446f..d1e078ea1a 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -19,8 +19,9 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-runtime-exports + INSTALL_EXPORT_SET raft-runtime-exports + EXCLUDE_FROM_ALL TRUE ) endfunction() diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index e8855192ba..623c32299b 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -24,6 +24,7 @@ #pragma once +#include #include #include diff --git 
a/cpp/include/raft_runtime/comms.hpp b/cpp/include/raft_runtime/comms.hpp index cf3ffe350c..664b05acaa 100644 --- a/cpp/include/raft_runtime/comms.hpp +++ b/cpp/include/raft_runtime/comms.hpp @@ -19,6 +19,7 @@ #pragma once +#include #include #include diff --git a/cpp/include/raft_runtime/cublas_macros.hpp b/cpp/include/raft_runtime/cublas_macros.hpp index 6344333b3d..c58bda3ff5 100644 --- a/cpp/include/raft_runtime/cublas_macros.hpp +++ b/cpp/include/raft_runtime/cublas_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once we have logger enabled //#include diff --git a/cpp/include/raft_runtime/cudart_utils.hpp b/cpp/include/raft_runtime/cudart_utils.hpp index d4d3d86a3d..b0cc6eaaf4 100644 --- a/cpp/include/raft_runtime/cudart_utils.hpp +++ b/cpp/include/raft_runtime/cudart_utils.hpp @@ -19,7 +19,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft_runtime/cusolver_macros.hpp b/cpp/include/raft_runtime/cusolver_macros.hpp index 93ad422051..6f56e2b9a6 100644 --- a/cpp/include/raft_runtime/cusolver_macros.hpp +++ b/cpp/include/raft_runtime/cusolver_macros.hpp @@ -23,7 +23,7 @@ #include ///@todo: enable this once logging is enabled //#include -#include +#include #include #define _CUSOLVER_ERR_TO_STR(err) \ diff --git a/cpp/include/raft_runtime/cusparse_macros.hpp b/cpp/include/raft_runtime/cusparse_macros.hpp index 2584513e0a..c4187722c1 100644 --- a/cpp/include/raft_runtime/cusparse_macros.hpp +++ b/cpp/include/raft_runtime/cusparse_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include "error.hpp" +#include ///@todo: enable this once logging is enabled //#include diff --git a/cpp/include/raft_runtime/handle.hpp b/cpp/include/raft_runtime/handle.hpp index b8bafedc69..1ceb704992 100644 --- a/cpp/include/raft_runtime/handle.hpp +++ b/cpp/include/raft_runtime/handle.hpp @@ -35,13 +35,13 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include "cudart_utils.h" 
+#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include diff --git a/cpp/include/raft_runtime/interruptible.hpp b/cpp/include/raft_runtime/interruptible.hpp index a6bb80533c..bc1faf3bf8 100644 --- a/cpp/include/raft_runtime/interruptible.hpp +++ b/cpp/include/raft_runtime/interruptible.hpp @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include From a5cea8e3e0acdabf8659a550e5cd79f74c3e262e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 8 Mar 2022 17:03:04 -0500 Subject: [PATCH 015/167] Making raft::runtime the core dependency and having all other targets link to it. Using raft::raft only as internal (build) target --- cpp/CMakeLists.txt | 60 ++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 73762fc521..2318465825 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -49,7 +49,6 @@ option(DISABLE_DEPRECATION_WARNINGS "Disable depreaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(NVTX "Enable nvtx markers" OFF) option(RAFT_STATIC_LINK_LIBRARIES "Statically link compiled libraft libraries") -option(RAFT_RUNTIME_ONLY "Install only RAFT runtime" OFF) option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ON) option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" OFF) @@ -127,11 +126,7 @@ endif() # - raft_runtime ------------------------------------------------------------- add_library(raft_runtime INTERFACE) - -if(TARGET raft_runtime AND (NOT TARGET raft::runtime)) - add_library(raft::runtime ALIAS raft_runtime) -endif() - +add_library(raft::runtime ALIAS raft_runtime) set_target_properties(raft_runtime PROPERTIES EXPORT_NAME runtime) target_link_libraries(raft_runtime INTERFACE @@ -231,7 +226,8 @@ if(RAFT_COMPILE_LIBRARIES 
OR RAFT_COMPILE_DIST_LIBRARY) PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" ) - target_compile_definitions(raft_distance_lib INTERFACE "RAFT_DISTANCE_COMPILED") + target_compile_definitions(raft_distance_lib + INTERFACE "RAFT_DISTANCE_COMPILED") # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_distance_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") @@ -239,7 +235,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() # TODO: Create public header(s) for exposed distance functions -target_link_libraries(raft_distance INTERFACE +target_link_libraries(raft_distance INTERFACE raft::runtime $ $ ) @@ -277,12 +273,13 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_nn_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") - target_compile_definitions(raft_nn_lib INTERFACE "RAFT_NN_COMPILED") + target_compile_definitions(raft_nn_lib + INTERFACE "RAFT_NN_COMPILED") endif() # TODO: Create public header(s) for exposed nn functions -target_link_libraries(raft_nn INTERFACE faiss::faiss +target_link_libraries(raft_nn INTERFACE raft::runtime faiss::faiss $ $) @@ -295,9 +292,9 @@ include(CPack) install(TARGETS raft_runtime DESTINATION ${lib_dir} EXPORT raft-runtime-exports) -install(TARGETS raft - DESTINATION ${lib_dir} - EXPORT raft-exports) +#install(TARGETS raft +# DESTINATION ${lib_dir} +# EXPORT raft-exports) install(TARGETS raft_distance DESTINATION ${lib_dir} EXPORT raft-distance-exports) @@ -370,40 +367,41 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export(INSTALL runtime + COMPONENTS nn distance EXPORT_SET raft-runtime-exports - GLOBAL_TARGETS runtime raft + GLOBAL_TARGETS runtime raft nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string ) -raft_export(INSTALL raft - 
EXPORT_SET raft-exports - COMPONENTS nn distance - GLOBAL_TARGETS runtime raft nn distance - NAMESPACE raft:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string - ) +#raft_export(INSTALL raft +# EXPORT_SET raft-exports +# GLOBAL_TARGETS raft +# NAMESPACE raft:: +# DOCUMENTATION doc_string +# FINAL_CODE_BLOCK code_string +# ) ############################################################################## # - build export ------------------------------------------------------------- raft_export(BUILD runtime EXPORT_SET raft-runtime-exports - GLOBAL_TARGETS runtime raft + COMPONENTS nn distance + GLOBAL_TARGETS runtime raft raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string ) -raft_export(BUILD raft - EXPORT_SET raft-exports - COMPONENTS nn distance - GLOBAL_TARGETS runtime raft raft_distance raft_nn - DOCUMENTATION doc_string - NAMESPACE raft:: - FINAL_CODE_BLOCK code_string - ) +#raft_export(BUILD raft +# EXPORT_SET raft-exports +## COMPONENTS nn distance +# GLOBAL_TARGETS raft +# DOCUMENTATION doc_string +# NAMESPACE raft:: +# FINAL_CODE_BLOCK code_string +# ) ############################################################################## # - export/install optional components -------------------------------------- From 4322d0b7e50477055ee24f9db7dc7a0b8472fab2 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 8 Mar 2022 17:04:46 -0500 Subject: [PATCH 016/167] fixing style --- cpp/include/raft/comms/comms.hpp | 1138 +++++++++--------- cpp/include/raft/cudart_utils.h | 451 ++++--- cpp/include/raft/error.hpp | 71 +- cpp/include/raft/handle.hpp | 556 ++++----- cpp/include/raft/interruptible.hpp | 379 +++--- cpp/include/raft/linalg/cublas_macros.h | 54 +- cpp/include/raft/linalg/cusolver_macros.h | 51 +- cpp/include/raft_runtime/comms.hpp | 1138 +++++++++--------- cpp/include/raft_runtime/cublas_macros.hpp | 54 +- cpp/include/raft_runtime/cudart_utils.hpp | 450 +++---- cpp/include/raft_runtime/cusolver_macros.hpp | 50 +- cpp/include/raft_runtime/cusparse_macros.hpp | 46 +- cpp/include/raft_runtime/error.hpp | 71 +- cpp/include/raft_runtime/handle.hpp | 558 ++++----- cpp/include/raft_runtime/interruptible.hpp | 378 +++--- 15 files changed, 2720 insertions(+), 2725 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 623c32299b..c8a77a00bb 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -24,620 +24,620 @@ #pragma once -#include #include +#include #include namespace raft { - namespace comms { +namespace comms { - typedef unsigned int request_t; - enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; - enum class op_t { SUM, PROD, MIN, MAX }; +typedef unsigned int request_t; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class op_t { SUM, PROD, MIN, MAX }; /** * The resulting status of distributed stream synchronization */ - enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted - }; +enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted +}; + +template 
+constexpr datatype_t - template - constexpr datatype_t +get_type(); - get_type(); +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::CHAR; +} - get_type() - { - return datatype_t::CHAR; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::UINT8; +} - get_type() - { - return datatype_t::UINT8; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::INT32; +} - get_type() - { - return datatype_t::INT32; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::UINT32; +} - get_type() - { - return datatype_t::UINT32; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::INT64; +} - get_type() - { - return datatype_t::INT64; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::UINT64; +} - get_type() - { - return datatype_t::UINT64; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::FLOAT32; +} - get_type() - { - return datatype_t::FLOAT32; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::FLOAT64; +} - get_type() - { - return datatype_t::FLOAT64; - } +class comms_iface { + public: + virtual ~comms_iface() {} - class comms_iface { - public: - virtual ~comms_iface() {} + virtual int get_size() const = 0; - virtual int get_size() const = 0; + virtual int get_rank() const = 0; - virtual int get_rank() const = 0; + virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual std::unique_ptr comm_split(int color, int key) const = 0; + virtual void barrier() const = 0; - virtual void barrier() const = 0; + virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual status_t 
sync_stream(cudaStream_t stream) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void waitall(int count, request_t array_of_requests[]) const = 0; + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const 
= 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv 
to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, void* recvbuf, - size_t recvsize, - int source, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, cudaStream_t stream) const = 0; - - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; - }; - - class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. 
NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. - * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place 
the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, 
- get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, +}; + +class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. 
NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. + * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
+ */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place 
the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, 
+ get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + 
std::vector const& dests, value_t* recvbuf, - size_t recvsize, - int source, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; - }; - - } // namespace comms + { + auto sendbytesizes = sendsizes; + auto 
sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; +}; + +} // namespace comms } // namespace raft #endif \ No newline at end of file diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 8fa1c114e3..b84fabbe6a 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -19,7 +19,6 @@ * Please use raft_runtime/cudart_utils.hpp instead. */ - #ifndef __RAFT_RT_CUDART_UTILS_H #define __RAFT_RT_CUDART_UTILS_H @@ -47,10 +46,10 @@ namespace raft { /** * @brief Exception thrown when a CUDA error is encountered. 
*/ - struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} - }; +struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} +}; } // namespace raft @@ -142,99 +141,99 @@ namespace raft { namespace raft { /** Helper method to get to know warp size in device code */ - __host__ __device__ constexpr inline int warp_size() { return 32; } +__host__ __device__ constexpr inline int warp_size() { return 32; } - __host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to threads. */ - class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { 
- RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } - }; +class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } +}; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to warps. 
*/ - class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } - }; +class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; /** * @brief A kernel grid configuration construction 
gadget for simple one-dimensional mapping * elements to blocks. */ - class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } - }; +class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; /** * @brief Generic copy method for all kinds of transfers @@ -244,11 +243,11 @@ namespace raft { * @param len lenth of the src/dst buffers in terms of number of 
elements * @param stream cuda stream */ - template - void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) - { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); - } +template +void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +} /** * @defgroup Copy Copy methods @@ -257,152 +256,152 @@ namespace raft { * @{ */ /** performs a host to device copy */ - template - void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) - { - copy(d_ptr, h_ptr, len, stream); - } +template +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(d_ptr, h_ptr, len, stream); +} /** performs a device to host copy */ - template - void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) - { - copy(h_ptr, d_ptr, len, stream); - } - - template - void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) - { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); - } +template +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(h_ptr, d_ptr, len, stream); +} + +template +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); +} /** @} */ /** * @defgroup Debug Utils for debugging host/device buffers * @{ */ - template - void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) - { - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; - } - - template - void print_device_vector(const 
char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) - { - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; - } +template +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; +} + +template +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; +} /** @} */ - static std::mutex mutex_; - static std::unordered_map allocations; - - template - void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) - { - size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); - - std::lock_guard _(mutex_); - allocations[ptr] = size; - } - - template - void deallocate(Type*& ptr, rmm::cuda_stream_view stream) - { - std::lock_guard _(mutex_); - size_t size = allocations[ptr]; - allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); - } - - inline void deallocate_all(rmm::cuda_stream_view stream) - { - std::lock_guard _(mutex_); - for (auto& alloc : allocations) { - void* ptr = alloc.first; - size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); - } - allocations.clear(); - } +static std::mutex mutex_; +static std::unordered_map 
allocations; + +template +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) +{ + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; +} + +template +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) +{ + std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); +} + +inline void deallocate_all(rmm::cuda_stream_view stream) +{ + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); +} /** helper method to get max usable shared mem per block parameter */ - inline int getSharedMemPerBlock() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; - } +inline int getSharedMemPerBlock() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; +} /** helper method to get multi-processor count parameter */ - inline int getMultiProcessorCount() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; - } +inline int getMultiProcessorCount() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; +} /** helper method to convert an array on device to a string on 
host */ - template - std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) - { - std::stringstream ss; +template +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ + std::stringstream ss; - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; - free(arr_h); + free(arr_h); - return ss.str(); - } + return ss.str(); +} /** this seems to be unused, but may be useful in the future */ - template - void ASSERT_DEVICE_MEM(T* ptr, std::string name) - { - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; - } - - inline uint32_t curTimeMillis() - { - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); - } +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". 
device=" << s_att.device + << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); +} /** Helper function to calculate need memory for allocate to store dense matrix. * @param rows number of rows in matrix @@ -410,34 +409,34 @@ namespace raft { * @return need number of items to allocate via allocate() * @sa allocate() */ - inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. * @param ptr the pointer to check * @param alignment to be checked for * @return true if address in bytes is a multiple of alignment */ - template - bool is_aligned(Type* ptr, size_t alignment) - { - return reinterpret_cast(ptr) % alignment == 0; - } +template +bool is_aligned(Type* ptr, size_t alignment) +{ + return reinterpret_cast(ptr) % alignment == 0; +} /** calculate greatest common divisor of two numbers * @a integer * @b integer * @ return gcd of a and b */ - template - IntType gcd(IntType a, IntType b) - { - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; - } +template +IntType gcd(IntType a, IntType b) +{ + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} } // namespace raft diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 1dc6e3d755..04806e6ca2 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -19,7 +19,6 @@ * Please use the include/raft_runtime/error.hpp instead. 
*/ - #ifndef __RAFT_RT_ERROR #define __RAFT_RT_ERROR @@ -36,36 +35,36 @@ namespace raft { /** base exception class for the whole of raft */ - class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { +class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { + collect_call_stack(); + } + + /** get the message associated with this exception */ + char const* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept + { #ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; + constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT auto depth = backtrace(stack, 
kMaxStackDepth); std::ostringstream oss; @@ -83,8 +82,8 @@ namespace raft { free(strings); msg_ += oss.str(); #endif // __GNUC__ - } - }; + } +}; /** * @brief Exception thrown when logical precondition is violated. @@ -93,10 +92,10 @@ namespace raft { * RAFT_EXPECTS and RAFT_FAIL macros. * */ - struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} - }; +struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} +}; } // namespace raft diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 13a3fc26d9..158816f762 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -56,288 +56,288 @@ namespace raft { * @brief Main handle object that stores all necessary context used for calling * necessary cuda kernels and/or libraries */ - class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - - /** 
Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was 
initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = 
true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } - }; // class handle_t +class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) = delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool 
used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void 
sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = 
true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } +}; // class handle_t /** * @brief RAII approach to synchronizing across all streams in the handle */ - class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; - }; // class stream_syncer +class stream_syncer { + public: + explicit 
stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; +}; // class stream_syncer } // namespace raft diff --git a/cpp/include/raft/interruptible.hpp b/cpp/include/raft/interruptible.hpp index 959d1063fb..6764065363 100644 --- a/cpp/include/raft/interruptible.hpp +++ b/cpp/include/raft/interruptible.hpp @@ -19,7 +19,6 @@ * Please use the include/raft_runtime/interruptible.hpp instead. */ - #ifndef __RAFT_RT_INTERRUPTIBLE_H #define __RAFT_RT_INTERRUPTIBLE_H @@ -40,9 +39,9 @@ namespace raft { * @brief Exception thrown during `interruptible::synchronize` call when it detects a request * to cancel the work performed in this CPU thread. */ - struct interrupted_exception : public raft::exception { - using raft::exception::exception; - }; +struct interrupted_exception : public raft::exception { + using raft::exception::exception; +}; /** * @brief Cooperative-style interruptible execution. @@ -69,208 +68,208 @@ namespace raft { * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and * resuming of the GPU stream work. */ - class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. 
- */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } +class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. 
It's called in the internals of - * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. 
+ * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. - * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. - */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. In that case, using its + * `cancel` method has no effect. + * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. 
+ */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. 
+ */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. - * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. 
- auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } - } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. 
+ auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; + } - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. */ - bool claimed_ = false; + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; - interruptible() noexcept { yield_no_throw_impl(); } + interruptible() noexcept { yield_no_throw_impl(); } - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } - }; + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } +}; } // namespace raft diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h index 5a96444e45..0281c5c667 100644 --- a/cpp/include/raft/linalg/cublas_macros.h +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -40,33 +40,33 @@ namespace raft { /** * @brief Exception thrown when a cuBLAS error is encountered. 
*/ - struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} - }; - - namespace linalg { - namespace detail { - - inline const char* cublas_error_to_string(cublasStatus_t err) - { - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; - } - - } // namespace detail - } // namespace linalg +struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { +namespace detail { + +inline const char* cublas_error_to_string(cublasStatus_t err) +{ + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; +} + +} // namespace detail +} // namespace linalg } // namespace raft #undef _CUBLAS_ERR_TO_STR diff --git 
a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h index a97c4d2332..df27f7ce26 100644 --- a/cpp/include/raft/linalg/cusolver_macros.h +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -19,7 +19,6 @@ * Please use raft_runtime/cusolver_macros.hpp instead. */ - #ifndef __RAFT_RT_CUSOLVER_MACROS_H #define __RAFT_RT_CUSOLVER_MACROS_H @@ -40,31 +39,31 @@ namespace raft { /** * @brief Exception thrown when a cuSOLVER error is encountered. */ - struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} - }; - - namespace linalg { - - inline const char* cusolver_error_to_string(cusolverStatus_t err) - { - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; - } - - } // namespace linalg +struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { + +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); 
+ _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; +} + +} // namespace linalg } // namespace raft #undef _CUSOLVER_ERR_TO_STR diff --git a/cpp/include/raft_runtime/comms.hpp b/cpp/include/raft_runtime/comms.hpp index 664b05acaa..9e2aa9fa84 100644 --- a/cpp/include/raft_runtime/comms.hpp +++ b/cpp/include/raft_runtime/comms.hpp @@ -19,620 +19,620 @@ #pragma once -#include #include +#include #include namespace raft { - namespace comms { +namespace comms { - typedef unsigned int request_t; - enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; - enum class op_t { SUM, PROD, MIN, MAX }; +typedef unsigned int request_t; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class op_t { SUM, PROD, MIN, MAX }; /** * The resulting status of distributed stream synchronization */ - enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted - }; +enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted +}; + +template +constexpr datatype_t - template - constexpr datatype_t +get_type(); - get_type(); +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::CHAR; +} - get_type() - { - return datatype_t::CHAR; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::UINT8; +} - get_type() - { - return datatype_t::UINT8; - } 
+template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::INT32; +} - get_type() - { - return datatype_t::INT32; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::UINT32; +} - get_type() - { - return datatype_t::UINT32; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::INT64; +} - get_type() - { - return datatype_t::INT64; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::UINT64; +} - get_type() - { - return datatype_t::UINT64; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::FLOAT32; +} - get_type() - { - return datatype_t::FLOAT32; - } +template <> +constexpr datatype_t - template <> - constexpr datatype_t +get_type() +{ + return datatype_t::FLOAT64; +} - get_type() - { - return datatype_t::FLOAT64; - } +class comms_iface { + public: + virtual ~comms_iface() {} - class comms_iface { - public: - virtual ~comms_iface() {} + virtual int get_size() const = 0; - virtual int get_size() const = 0; + virtual int get_rank() const = 0; - virtual int get_rank() const = 0; + virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual std::unique_ptr comm_split(int color, int key) const = 0; + virtual void barrier() const = 0; - virtual void barrier() const = 0; + virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual status_t sync_stream(cudaStream_t stream) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) 
const = 0; + virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void waitall(int count, request_t array_of_requests[]) const = 0; + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, 
use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, + virtual void 
device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, void* recvbuf, - size_t recvsize, - int source, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, cudaStream_t stream) const = 0; - - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; - }; - - class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. 
- * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place 
the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, 
- get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, +}; + +class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. 
NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. + * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
+ */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place 
the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, 
+ get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + 
std::vector const& dests, value_t* recvbuf, - size_t recvsize, - int source, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; - }; - - } // namespace comms + { + auto sendbytesizes = sendsizes; + auto 
sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; +}; + +} // namespace comms } // namespace raft #endif \ No newline at end of file diff --git a/cpp/include/raft_runtime/cublas_macros.hpp b/cpp/include/raft_runtime/cublas_macros.hpp index c58bda3ff5..eb944e282f 100644 --- a/cpp/include/raft_runtime/cublas_macros.hpp +++ b/cpp/include/raft_runtime/cublas_macros.hpp @@ -35,33 +35,33 @@ namespace raft { /** * @brief Exception thrown when a cuBLAS error is encountered. 
*/ - struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} - }; - - namespace linalg { - namespace detail { - - inline const char* cublas_error_to_string(cublasStatus_t err) - { - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; - } - - } // namespace detail - } // namespace linalg +struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { +namespace detail { + +inline const char* cublas_error_to_string(cublasStatus_t err) +{ + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; +} + +} // namespace detail +} // namespace linalg } // namespace raft #undef _CUBLAS_ERR_TO_STR diff --git 
a/cpp/include/raft_runtime/cudart_utils.hpp b/cpp/include/raft_runtime/cudart_utils.hpp index b0cc6eaaf4..e0f941d777 100644 --- a/cpp/include/raft_runtime/cudart_utils.hpp +++ b/cpp/include/raft_runtime/cudart_utils.hpp @@ -41,10 +41,10 @@ namespace raft { /** * @brief Exception thrown when a CUDA error is encountered. */ - struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} - }; +struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} +}; } // namespace raft @@ -136,99 +136,99 @@ namespace raft { namespace raft { /** Helper method to get to know warp size in device code */ - __host__ __device__ constexpr inline int warp_size() { return 32; } +__host__ __device__ constexpr inline int warp_size() { return 32; } - __host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to threads. 
*/ - class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } - }; +class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t 
max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } +}; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to warps. */ - class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } - }; +class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern 
etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to blocks. */ - class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } - }; +class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't 
be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; /** * @brief Generic copy method for all kinds of transfers @@ -238,11 +238,11 @@ namespace raft { * @param len lenth of the src/dst buffers in terms of number of elements * @param stream cuda stream */ - template - void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) - { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); - } +template +void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +} /** * @defgroup Copy Copy methods @@ -251,152 +251,152 @@ namespace raft { * @{ */ /** performs a host to device copy */ - template - void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) - { - copy(d_ptr, h_ptr, len, stream); - } +template +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(d_ptr, h_ptr, len, stream); +} /** performs a device to host copy */ - template - void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) - { - copy(h_ptr, d_ptr, len, stream); - } - - template - void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) - { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); - } +template +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) +{ + 
copy(h_ptr, d_ptr, len, stream); +} + +template +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); +} /** @} */ /** * @defgroup Debug Utils for debugging host/device buffers * @{ */ - template - void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) - { - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; - } - - template - void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) - { - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; - } +template +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; +} + +template +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; +} /** @} */ - static std::mutex mutex_; - static std::unordered_map allocations; - - template - void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) - { - size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); - - 
std::lock_guard _(mutex_); - allocations[ptr] = size; - } - - template - void deallocate(Type*& ptr, rmm::cuda_stream_view stream) - { - std::lock_guard _(mutex_); - size_t size = allocations[ptr]; - allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); - } - - inline void deallocate_all(rmm::cuda_stream_view stream) - { - std::lock_guard _(mutex_); - for (auto& alloc : allocations) { - void* ptr = alloc.first; - size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); - } - allocations.clear(); - } +static std::mutex mutex_; +static std::unordered_map allocations; + +template +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) +{ + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; +} + +template +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) +{ + std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); +} + +inline void deallocate_all(rmm::cuda_stream_view stream) +{ + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); +} /** helper method to get max usable shared mem per block parameter */ - inline int getSharedMemPerBlock() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; - } +inline int getSharedMemPerBlock() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + 
RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; +} /** helper method to get multi-processor count parameter */ - inline int getMultiProcessorCount() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; - } +inline int getMultiProcessorCount() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; +} /** helper method to convert an array on device to a string on host */ - template - std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) - { - std::stringstream ss; +template +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ + std::stringstream ss; - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; - free(arr_h); + free(arr_h); - return ss.str(); - } + return ss.str(); +} /** this seems to be unused, but may be useful in the future */ - template - void ASSERT_DEVICE_MEM(T* ptr, std::string name) - { - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". 
device=" << s_att.device - << ", err=" << s_err << std::endl; - } - - inline uint32_t curTimeMillis() - { - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); - } +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); +} /** Helper function to calculate need memory for allocate to store dense matrix. * @param rows number of rows in matrix @@ -404,34 +404,34 @@ namespace raft { * @return need number of items to allocate via allocate() * @sa allocate() */ - inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. 
* @param ptr the pointer to check * @param alignment to be checked for * @return true if address in bytes is a multiple of alignment */ - template - bool is_aligned(Type* ptr, size_t alignment) - { - return reinterpret_cast(ptr) % alignment == 0; - } +template +bool is_aligned(Type* ptr, size_t alignment) +{ + return reinterpret_cast(ptr) % alignment == 0; +} /** calculate greatest common divisor of two numbers * @a integer * @b integer * @ return gcd of a and b */ - template - IntType gcd(IntType a, IntType b) - { - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; - } +template +IntType gcd(IntType a, IntType b) +{ + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} } // namespace raft diff --git a/cpp/include/raft_runtime/cusolver_macros.hpp b/cpp/include/raft_runtime/cusolver_macros.hpp index 6f56e2b9a6..c1ac54a7cb 100644 --- a/cpp/include/raft_runtime/cusolver_macros.hpp +++ b/cpp/include/raft_runtime/cusolver_macros.hpp @@ -34,31 +34,31 @@ namespace raft { /** * @brief Exception thrown when a cuSOLVER error is encountered. 
*/ - struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} - }; - - namespace linalg { - - inline const char* cusolver_error_to_string(cusolverStatus_t err) - { - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; - } - - } // namespace linalg +struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { + +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; +} + +} // namespace linalg } // namespace raft #undef _CUSOLVER_ERR_TO_STR diff 
--git a/cpp/include/raft_runtime/cusparse_macros.hpp b/cpp/include/raft_runtime/cusparse_macros.hpp index c4187722c1..9cb69f9551 100644 --- a/cpp/include/raft_runtime/cusparse_macros.hpp +++ b/cpp/include/raft_runtime/cusparse_macros.hpp @@ -43,35 +43,35 @@ namespace raft { /** * @brief Exception thrown when a cuSparse error is encountered. */ - struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) : raft::exception(message) {} - explicit cusparse_error(std::string const& message) : raft::exception(message) {} - }; +struct cusparse_error : public raft::exception { + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} +}; - namespace sparse { - namespace detail { +namespace sparse { +namespace detail { - inline const char* cusparse_error_to_string(cusparseStatus_t err) - { +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 - return cusparseGetErrorString(err); + return cusparseGetErrorString(err); #else // CUDART_VERSION - switch (err) { - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: return "CUSPARSE_STATUS_UNKNOWN"; - }; + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + 
_CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: return "CUSPARSE_STATUS_UNKNOWN"; + }; #endif // CUDART_VERSION - } +} - } // namespace detail - } // namespace sparse +} // namespace detail +} // namespace sparse } // namespace raft #undef _CUSPARSE_ERR_TO_STR diff --git a/cpp/include/raft_runtime/error.hpp b/cpp/include/raft_runtime/error.hpp index b3ed70c5d2..33a1861b22 100644 --- a/cpp/include/raft_runtime/error.hpp +++ b/cpp/include/raft_runtime/error.hpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #ifndef __RAFT_RT_ERROR #define __RAFT_RT_ERROR @@ -31,36 +30,36 @@ namespace raft { /** base exception class for the whole of raft */ - class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { +class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(std::string const msg) 
noexcept : std::exception(), msg_(std::move(msg)) + { + collect_call_stack(); + } + + /** get the message associated with this exception */ + char const* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept + { #ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; + constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT auto depth = backtrace(stack, kMaxStackDepth); std::ostringstream oss; @@ -78,8 +77,8 @@ namespace raft { free(strings); msg_ += oss.str(); #endif // __GNUC__ - } - }; + } +}; /** * @brief Exception thrown when logical precondition is violated. @@ -88,10 +87,10 @@ namespace raft { * RAFT_EXPECTS and RAFT_FAIL macros. * */ - struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} - }; +struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} +}; } // namespace raft diff --git a/cpp/include/raft_runtime/handle.hpp b/cpp/include/raft_runtime/handle.hpp index 1ceb704992..d99cc08601 100644 --- a/cpp/include/raft_runtime/handle.hpp +++ b/cpp/include/raft_runtime/handle.hpp @@ -38,10 +38,10 @@ #include #include -#include #include #include #include +#include #include #include @@ -51,288 +51,288 @@ namespace raft { * @brief Main handle object that stores all necessary context used for calling * necessary cuda kernels and/or libraries */ - class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying 
resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - - /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - 
std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: 
Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } - }; // class handle_t +class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) 
= delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + 
RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: 
Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } +}; // class handle_t /** * @brief RAII approach to synchronizing across all streams in the handle */ - class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : 
handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; - }; // class stream_syncer +class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; +}; // class stream_syncer } // namespace raft diff --git a/cpp/include/raft_runtime/interruptible.hpp b/cpp/include/raft_runtime/interruptible.hpp index bc1faf3bf8..8b9c15a4e1 100644 --- a/cpp/include/raft_runtime/interruptible.hpp +++ b/cpp/include/raft_runtime/interruptible.hpp @@ -34,9 +34,9 @@ namespace raft { * @brief Exception thrown during `interruptible::synchronize` call when it detects a request * to cancel the work performed in this CPU thread. */ - struct interrupted_exception : public raft::exception { - using raft::exception::exception; - }; +struct interrupted_exception : public raft::exception { + using raft::exception::exception; +}; /** * @brief Cooperative-style interruptible execution. @@ -63,208 +63,208 @@ namespace raft { * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and * resuming of the GPU stream work. */ - class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. 
- * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } +class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. 
It's called in the internals of - * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. 
+ * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. - * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. - */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. In that case, using its + * `cancel` method has no effect. + * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. 
+ */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. 
+ */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. - * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. 
- auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } - } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. 
+ auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; + } - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. */ - bool claimed_ = false; + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; - interruptible() noexcept { yield_no_throw_impl(); } + interruptible() noexcept { yield_no_throw_impl(); } - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } - }; + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } +}; } // namespace raft From 8ae80ad35a9b7527a78ae44d9db04be44382bf35 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 8 Mar 2022 17:58:39 -0500 Subject: [PATCH 017/167] Almost there... 
--- cpp/CMakeLists.txt | 44 ++++++++++++----------- cpp/cmake/modules/raft_export.cmake | 13 ++++--- cpp/cmake/thirdparty/get_libcudacxx.cmake | 2 +- cpp/cmake/thirdparty/get_rmm.cmake | 2 +- cpp/include/raft.hpp | 2 +- cpp/include/raft/cudart_utils.h | 2 +- cpp/include/raft/error.hpp | 2 +- cpp/include/raft_runtime/cudart_utils.hpp | 2 +- cpp/include/raft_runtime/error.hpp | 2 +- 9 files changed, 39 insertions(+), 32 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2318465825..21e29420f1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -143,7 +143,7 @@ target_compile_features(raft_runtime INTERFACE cxx_std_17 $ ] [ FINAL_CODE_BLOCK ] [ LANGUAGES ] + [ INSTALL_FILES ON|OFF ] ) The :cmake:command:`raft_export` function allow projects to easily generate a fully @@ -122,7 +123,7 @@ function(raft_export type project_name) string(TOLOWER ${type} type) set(options "") - set(one_value EXPORT_SET VERSION NAMESPACE DOCUMENTATION FINAL_CODE_BLOCK) + set(one_value EXPORT_SET VERSION NAMESPACE DOCUMENTATION FINAL_CODE_BLOCK INSTALL_FILES) set(multi_value COMPONENTS GLOBAL_TARGETS LANGUAGES) cmake_parse_arguments(RAPIDS "${options}" "${one_value}" "${multi_value}" ${ARGN}) @@ -174,7 +175,11 @@ function(raft_export type project_name) set(scratch_dir "${PROJECT_BINARY_DIR}/rapids-cmake/${project_name}/export") - configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" + if(NOT DEFINED RAPIDS_INSTALL_FILES) + set(install_location "${scratch_dir}") + endif() + + configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" "${scratch_dir}/${project_name}-config.cmake" INSTALL_DESTINATION "${install_location}") @@ -184,8 +189,8 @@ function(raft_export type project_name) COMPATIBILITY ${rapids_project_version_compat}) endif() - install(EXPORT ${RAPIDS_EXPORT_SET} FILE ${project_name}-targets.cmake - NAMESPACE ${RAPIDS_PROJECT_VERSION} DESTINATION "${install_location}") + install(EXPORT 
${RAPIDS_EXPORT_SET} FILE ${project_name}-targets.cmake + NAMESPACE ${RAPIDS_PROJECT_VERSION} DESTINATION "${install_location}") if(TARGET rapids_export_install_${RAPIDS_EXPORT_SET}) include("${rapids-cmake-dir}/export/write_dependencies.cmake") diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 7d0f2d7d86..4333ba3fcd 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index d1e078ea1a..8717eaad8c 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index 42370d9e4f..c49e51932a 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index b84fabbe6a..21b0c8387a 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 04806e6ca2..5e1aa3af28 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft_runtime/cudart_utils.hpp b/cpp/include/raft_runtime/cudart_utils.hpp index e0f941d777..a6c385a034 100644 --- a/cpp/include/raft_runtime/cudart_utils.hpp +++ b/cpp/include/raft_runtime/cudart_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft_runtime/error.hpp b/cpp/include/raft_runtime/error.hpp index 33a1861b22..a65b9a8469 100644 --- a/cpp/include/raft_runtime/error.hpp +++ b/cpp/include/raft_runtime/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 4bfde362cd96865fb2d63ec7b8b2bbac25714d40 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 8 Mar 2022 18:00:47 -0500 Subject: [PATCH 018/167] COpyright year --- cpp/cmake/modules/raft_export.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index 0917f47000..fbcd7fd684 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6306b7b0f838dbe3f8392765a5e9264d524179aa Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 8 Mar 2022 18:05:47 -0500 Subject: [PATCH 019/167] Updating mdspan copyright --- cpp/cmake/thirdparty/get_mdspan.cmake | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 820bb0a4d1..03fafd4577 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -1,3 +1,17 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + function(find_and_configure_mdspan VERSION) rapids_cpm_find( mdspan ${VERSION} From b06f8e68828926d73f6efd045245cf8849d38375 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 8 Mar 2022 20:22:43 -0500 Subject: [PATCH 020/167] Fixing doxygen --- cpp/include/raft_runtime/cudart_utils.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft_runtime/cudart_utils.hpp b/cpp/include/raft_runtime/cudart_utils.hpp index a6c385a034..153f426c09 100644 --- a/cpp/include/raft_runtime/cudart_utils.hpp +++ b/cpp/include/raft_runtime/cudart_utils.hpp @@ -156,6 +156,7 @@ class grid_1d_thread_t { * this can't be determined generically/automatically (as opposed to the number of blocks) * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain + * @param max_num_blocks_1d maximum number of 1d blocks */ grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, @@ -188,6 +189,7 @@ class grid_1d_warp_t { * @param num_threads_per_block The grid block size, determined according to the kernel's * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of 1d blocks */ grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, @@ -217,6 +219,7 @@ class grid_1d_block_t { * @param num_threads_per_block The grid block size, determined according to the kernel's * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximium number of 1d blocks */ grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, From 
f4387b6294b39532ecfd0e912e8f9cd5c85de3a0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 06:31:54 -0500 Subject: [PATCH 021/167] CHanging conda package to libraft-runtime so libraft-headers won't be overwritten (this allows downstream projects to continue to use it until they update --- ci/cpu/build.sh | 12 ++++++------ conda/recipes/libraft_distance/meta.yaml | 4 ++-- conda/recipes/libraft_nn/meta.yaml | 4 ++-- .../{libraft_headers => libraft_runtime}/build.sh | 0 .../{libraft_headers => libraft_runtime}/meta.yaml | 4 ++-- conda/recipes/pyraft/meta.yaml | 4 ++-- python/raft/common/handle.pxd | 2 +- python/raft/common/interruptible.pxd | 4 ++-- python/raft/dask/common/comms_utils.pyx | 2 +- 9 files changed, 18 insertions(+), 18 deletions(-) rename conda/recipes/{libraft_headers => libraft_runtime}/build.sh (100%) rename conda/recipes/{libraft_headers => libraft_runtime}/meta.yaml (96%) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 64d46a68c7..f68b9c8a3e 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,16 +85,16 @@ gpuci_mamba_retry install -c conda-forge boa ############################################################################### if [ "$BUILD_LIBRAFT" == '1' ]; then - gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-headers" + gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-runtime" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_runtime gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance else - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir 
conda/recipes/libraft_headers + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_runtime gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" - mkdir -p ${CONDA_BLD_DIR}/libraft_headers/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_headers/work + mkdir -p ${CONDA_BLD_DIR}/libraft_runtime/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_runtime/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" @@ -107,7 +107,7 @@ if [ "$BUILD_LIBRAFT" == '1' ]; then mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance/work fi else - gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-headers" + gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-runtime" fi if [ "$BUILD_RAFT" == "1" ]; then diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index 4474629df4..c85c5a8ac3 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - nccl>=2.9.9 - cudatoolkit {{ cuda_version }}.* - ucx-py {{ ucx_py_version }} @@ -47,7 +47,7 @@ requirements: - gmock - librmm {{ minor_version }} run: - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - nccl>=2.9.9 - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 9d6732d56b..ffa3a26bb5 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - cudatoolkit {{ cuda_version }}.* - lapack - faiss-proc=*=cuda 
@@ -48,7 +48,7 @@ requirements: - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - libcusolver>=11.2.1 diff --git a/conda/recipes/libraft_headers/build.sh b/conda/recipes/libraft_runtime/build.sh similarity index 100% rename from conda/recipes/libraft_headers/build.sh rename to conda/recipes/libraft_runtime/build.sh diff --git a/conda/recipes/libraft_headers/meta.yaml b/conda/recipes/libraft_runtime/meta.yaml similarity index 96% rename from conda/recipes/libraft_headers/meta.yaml rename to conda/recipes/libraft_runtime/meta.yaml index aec6fa4351..b98d26a308 100644 --- a/conda/recipes/libraft_headers/meta.yaml +++ b/conda/recipes/libraft_runtime/meta.yaml @@ -8,7 +8,7 @@ {% set cuda_major=cuda_version.split('.')[0] %} {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: libraft-headers + name: libraft-runtime version: {{ version }} source: @@ -58,4 +58,4 @@ about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: libraft-headers library + summary: libraft-runtime library diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml index eae9963204..2408701377 100644 --- a/conda/recipes/pyraft/meta.yaml +++ b/conda/recipes/pyraft/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 - nccl>=2.9.9 @@ -39,7 +39,7 @@ requirements: run: - python x.x - dask-cuda {{ minor_version }} - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - nccl>=2.9.9 - rmm {{ minor_version }} - ucx-py {{ ucx_py_version }} diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index 8415b7e3d7..ce9bac434c 100644 --- a/python/raft/common/handle.pxd +++ 
b/python/raft/common/handle.pxd @@ -31,7 +31,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_runtime/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/raft/common/interruptible.pxd b/python/raft/common/interruptible.pxd index a73e8c1ac7..d858f88beb 100644 --- a/python/raft/common/interruptible.pxd +++ b/python/raft/common/interruptible.pxd @@ -22,11 +22,11 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_runtime/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() -cdef extern from "raft/interruptible.hpp" \ +cdef extern from "raft_runtime/interruptible.hpp" \ namespace "raft::interruptible" nogil: cdef void inter_synchronize \ "raft::interruptible::synchronize"(cuda_stream_view stream) except+ diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 38c5670372..8928d2f295 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -31,7 +31,7 @@ cdef extern from "nccl.h": cdef struct ncclComm ctypedef ncclComm *ncclComm_t -cdef extern from "raft/handle.hpp" namespace "raft": +cdef extern from "raft_runtime/handle.hpp" namespace "raft": cdef cppclass handle_t: handle_t() except + From 6a92f19b03f08a10dd8502745a32eda2b96c5e23 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Mar 2022 06:35:04 -0500 Subject: [PATCH 022/167] Update copyrightys --- conda/recipes/libraft_distance/build.sh | 1 + conda/recipes/libraft_nn/build.sh | 1 + conda/recipes/libraft_runtime/build.sh | 1 + conda/recipes/pyraft/build.sh | 1 + 4 files changed, 4 insertions(+) diff --git a/conda/recipes/libraft_distance/build.sh b/conda/recipes/libraft_distance/build.sh index 7523263f01..0057d85b1a 100644 --- a/conda/recipes/libraft_distance/build.sh +++ b/conda/recipes/libraft_distance/build.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash +# Copyright (c) 2022, NVIDIA CORPORATION. ./build.sh libraft -v --allgpuarch --compile-dist --nogtest diff --git a/conda/recipes/libraft_nn/build.sh b/conda/recipes/libraft_nn/build.sh index 5c60cd2fa1..3a26f8782b 100644 --- a/conda/recipes/libraft_nn/build.sh +++ b/conda/recipes/libraft_nn/build.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash +# Copyright (c) 2022, NVIDIA CORPORATION. ./build.sh libraft -v --allgpuarch --compile-nn --nogtest diff --git a/conda/recipes/libraft_runtime/build.sh b/conda/recipes/libraft_runtime/build.sh index ca6d9b4960..9abf1b49bc 100644 --- a/conda/recipes/libraft_runtime/build.sh +++ b/conda/recipes/libraft_runtime/build.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash +# Copyright (c) 2022, NVIDIA CORPORATION. ./build.sh libraft -v --allgpuarch --nogtest diff --git a/conda/recipes/pyraft/build.sh b/conda/recipes/pyraft/build.sh index 044a34f906..51ac9d7cf1 100644 --- a/conda/recipes/pyraft/build.sh +++ b/conda/recipes/pyraft/build.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Copyright (c) 2022, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory ./build.sh pyraft From 831fe8154a37cbf594637438b71926802b906965 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Mar 2022 06:43:22 -0500 Subject: [PATCH 023/167] MOving runtime headers into corresponding directories --- cpp/include/raft_runtime/{ => comms}/comms.hpp | 0 cpp/include/raft_runtime/handle.hpp | 8 ++++---- cpp/include/raft_runtime/{ => linalg}/cublas_macros.hpp | 6 ++---- cpp/include/raft_runtime/{ => linalg}/cusolver_macros.hpp | 4 ++-- cpp/include/raft_runtime/raft.hpp | 2 +- cpp/include/raft_runtime/{ => sparse}/cusparse_macros.hpp | 0 6 files changed, 9 insertions(+), 11 deletions(-) rename cpp/include/raft_runtime/{ => comms}/comms.hpp (100%) rename cpp/include/raft_runtime/{ => linalg}/cublas_macros.hpp (95%) rename cpp/include/raft_runtime/{ => linalg}/cusolver_macros.hpp (96%) rename cpp/include/raft_runtime/{ => sparse}/cusparse_macros.hpp (100%) diff --git a/cpp/include/raft_runtime/comms.hpp b/cpp/include/raft_runtime/comms/comms.hpp similarity index 100% rename from cpp/include/raft_runtime/comms.hpp rename to cpp/include/raft_runtime/comms/comms.hpp diff --git a/cpp/include/raft_runtime/handle.hpp b/cpp/include/raft_runtime/handle.hpp index d99cc08601..3ecd257b21 100644 --- a/cpp/include/raft_runtime/handle.hpp +++ b/cpp/include/raft_runtime/handle.hpp @@ -37,10 +37,10 @@ #include -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include diff --git a/cpp/include/raft_runtime/cublas_macros.hpp b/cpp/include/raft_runtime/linalg/cublas_macros.hpp similarity index 95% rename from cpp/include/raft_runtime/cublas_macros.hpp rename to cpp/include/raft_runtime/linalg/cublas_macros.hpp index eb944e282f..3a3093ec05 100644 --- a/cpp/include/raft_runtime/cublas_macros.hpp +++ b/cpp/include/raft_runtime/linalg/cublas_macros.hpp @@ -41,7 +41,6 @@ struct cublas_error : public raft::exception { }; namespace linalg { -namespace detail { inline const char* cublas_error_to_string(cublasStatus_t err) { @@ -60,7 +59,6 @@ inline const char* cublas_error_to_string(cublasStatus_t err) }; } -} // namespace 
detail } // namespace linalg } // namespace raft @@ -82,7 +80,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) "call='%s', Reason=%d:%s", \ #call, \ status, \ - raft::linalg::detail::cublas_error_to_string(status)); \ + raft::linalg::cublas_error_to_string(status)); \ throw raft::cublas_error(msg); \ } \ } while (0) @@ -104,7 +102,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) #call, \ __FILE__, \ __LINE__, \ - raft::linalg::detail::cublas_error_to_string(status)); \ + raft::linalg::cublas_error_to_string(status)); \ } \ } while (0) diff --git a/cpp/include/raft_runtime/cusolver_macros.hpp b/cpp/include/raft_runtime/linalg/cusolver_macros.hpp similarity index 96% rename from cpp/include/raft_runtime/cusolver_macros.hpp rename to cpp/include/raft_runtime/linalg/cusolver_macros.hpp index c1ac54a7cb..02c1de02e0 100644 --- a/cpp/include/raft_runtime/cusolver_macros.hpp +++ b/cpp/include/raft_runtime/linalg/cusolver_macros.hpp @@ -79,7 +79,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) "call='%s', Reason=%d:%s", \ #call, \ status, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ + raft::linalg::cusolver_error_to_string(status)); \ throw raft::cusolver_error(msg); \ } \ } while (0) @@ -101,7 +101,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) #call, \ __FILE__, \ __LINE__, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ + raft::linalg::cusolver_error_to_string(status)); \ } \ } while (0) diff --git a/cpp/include/raft_runtime/raft.hpp b/cpp/include/raft_runtime/raft.hpp index 00879bad1a..82e8514ddb 100644 --- a/cpp/include/raft_runtime/raft.hpp +++ b/cpp/include/raft_runtime/raft.hpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "comms.hpp" +#include "comms/comms.hpp" #include "error.hpp" #include "handle.hpp" #include "interruptible.hpp" diff --git a/cpp/include/raft_runtime/cusparse_macros.hpp b/cpp/include/raft_runtime/sparse/cusparse_macros.hpp similarity index 100% rename from cpp/include/raft_runtime/cusparse_macros.hpp rename to cpp/include/raft_runtime/sparse/cusparse_macros.hpp From 0fc02ca023d5cdf4ebd366c49d5a558a06b5a20b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 06:44:07 -0500 Subject: [PATCH 024/167] Updating style --- cpp/include/raft_runtime/handle.hpp | 2 +- .../raft_runtime/linalg/cublas_macros.hpp | 26 +++++++++---------- .../raft_runtime/linalg/cusolver_macros.hpp | 26 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cpp/include/raft_runtime/handle.hpp b/cpp/include/raft_runtime/handle.hpp index 3ecd257b21..99156b1971 100644 --- a/cpp/include/raft_runtime/handle.hpp +++ b/cpp/include/raft_runtime/handle.hpp @@ -38,10 +38,10 @@ #include #include +#include #include #include #include -#include #include #include diff --git a/cpp/include/raft_runtime/linalg/cublas_macros.hpp b/cpp/include/raft_runtime/linalg/cublas_macros.hpp index 3a3093ec05..6ab105bd0a 100644 --- a/cpp/include/raft_runtime/linalg/cublas_macros.hpp +++ b/cpp/include/raft_runtime/linalg/cublas_macros.hpp @@ -70,19 +70,19 @@ inline const char* cublas_error_to_string(cublasStatus_t err) * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define RAFT_CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ +#define RAFT_CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + 
std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::linalg::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ + throw raft::cublas_error(msg); \ + } \ } while (0) // FIXME: Remove after consumers rename @@ -102,7 +102,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) #call, \ __FILE__, \ __LINE__, \ - raft::linalg::cublas_error_to_string(status)); \ + raft::linalg::cublas_error_to_string(status)); \ } \ } while (0) diff --git a/cpp/include/raft_runtime/linalg/cusolver_macros.hpp b/cpp/include/raft_runtime/linalg/cusolver_macros.hpp index 02c1de02e0..7d727f1c6e 100644 --- a/cpp/include/raft_runtime/linalg/cusolver_macros.hpp +++ b/cpp/include/raft_runtime/linalg/cusolver_macros.hpp @@ -69,19 +69,19 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) * Invokes a cuSOLVER runtime API function call, if the call does not return * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred */ -#define RAFT_CUSOLVER_TRY(call) \ - do { \ - cusolverStatus_t const status = (call); \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ +#define RAFT_CUSOLVER_TRY(call) \ + do { \ + cusolverStatus_t const status = (call); \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::linalg::cusolver_error_to_string(status)); \ - throw raft::cusolver_error(msg); \ - } \ + throw raft::cusolver_error(msg); \ + } \ } while (0) // FIXME: remove after consumer rename @@ -101,7 +101,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) #call, \ __FILE__, \ __LINE__, \ - raft::linalg::cusolver_error_to_string(status)); \ + 
raft::linalg::cusolver_error_to_string(status)); \ } \ } while (0) From e8ba5a19c8c79214a9065c191cdf46c38c616a76 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 07:50:51 -0500 Subject: [PATCH 025/167] Removing todos --- cpp/CMakeLists.txt | 36 +++++++++++++++--------------------- cpp/cmake/versions.json | 9 --------- 2 files changed, 15 insertions(+), 30 deletions(-) delete mode 100644 cpp/cmake/versions.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 21e29420f1..27df57d150 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -234,7 +234,6 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() -# TODO: Create public header(s) for exposed distance functions target_link_libraries(raft_distance INTERFACE raft::runtime $ $ @@ -278,7 +277,6 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) endif() -# TODO: Create public header(s) for exposed nn functions target_link_libraries(raft_nn INTERFACE raft::runtime faiss::faiss $ $) @@ -367,14 +365,13 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export(INSTALL runtime - COMPONENTS nn distance - EXPORT_SET raft-runtime-exports - GLOBAL_TARGETS runtime nn distance - NAMESPACE raft:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string - INSTALL_FILES ON - ) + COMPONENTS nn distance + EXPORT_SET raft-runtime-exports + GLOBAL_TARGETS runtime nn distance + NAMESPACE raft:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string + INSTALL_FILES ON) raft_export(INSTALL raft EXPORT_SET raft-exports @@ -382,19 +379,17 @@ raft_export(INSTALL raft COMPONENTS nn distance NAMESPACE raft:: DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string - ) + FINAL_CODE_BLOCK code_string) ############################################################################## # - build export ------------------------------------------------------------- raft_export(BUILD runtime - EXPORT_SET raft-runtime-exports - 
COMPONENTS nn distance - GLOBAL_TARGETS runtime raft_distance raft_nn - DOCUMENTATION doc_string - NAMESPACE raft:: - FINAL_CODE_BLOCK code_string - ) + EXPORT_SET raft-runtime-exports + COMPONENTS nn distance + GLOBAL_TARGETS runtime raft_distance raft_nn + DOCUMENTATION doc_string + NAMESPACE raft:: + FINAL_CODE_BLOCK code_string) raft_export(BUILD raft EXPORT_SET raft-exports @@ -402,8 +397,7 @@ raft_export(BUILD raft COMPONENTS nn distance DOCUMENTATION doc_string NAMESPACE raft:: - FINAL_CODE_BLOCK code_string - ) + FINAL_CODE_BLOCK code_string) ############################################################################## # - export/install optional components -------------------------------------- diff --git a/cpp/cmake/versions.json b/cpp/cmake/versions.json deleted file mode 100644 index cca2dd8859..0000000000 --- a/cpp/cmake/versions.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "packages" : { - "Thrust" : { - "version" : "1.15.0", - "git_url" : "https://github.com/NVIDIA/thrust.git", - "git_tag" : "${version}" - } - } -} From 4a13200a0c9180e1e96f91c11765cdb926bb77fd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 09:39:34 -0500 Subject: [PATCH 026/167] updating readme --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a79679c579..0f3566768b 100755 --- a/README.md +++ b/README.md @@ -73,14 +73,14 @@ RAFT can be installed through conda, cmake-package-manager (cpm), or by building ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-headers` contains all the CUDA/C++ headers -- `libraft-nn` (optional) contains precompiled shared libraries for the nearest neighbors algorithms. If FAISS is not already installed in your environment, this will need to be installed to use the nearest neighbors headers. -- `libraft-distance` (optional) contains shared libraries for distance algorithms. 
-- `pyraft` (optional) contains the Python library +- `libraft-runtime` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc` +- `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. +- `libraft-distance` (optional) contains shared libraries for distance primitives. +- `pyraft` (optional) contains reusable Python tools to accelerate Python algorithm development To install RAFT with conda (change to `rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft +conda install -c rapidsai libraft-runtime libraft-nn libraft-distance pyraft ``` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. @@ -137,11 +137,11 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 ### Source -The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository, -1. create an environment with the RAFT dependencies: `conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml` -2. run the build script from the repository root: `./build.sh pyraft libraft --compile-libs` +The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: +1. Create an environment with the needed dependencies: `conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml` +2. Run the build script from the repository root: `./build.sh pyraft libraft --compile-libs` -The [Build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. 
You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) guide. +The [Build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) guide. ## Folder Structure and Contents @@ -151,7 +151,7 @@ The folder structure mirrors other RAPIDS repos (cuDF, cuML, cuGraph...), with t - `conda`: Conda recipes and development conda environments - `cpp`: Source code for all C++ code. - `docs`: Doxygen configuration - - `include`: The C++ API is fully-contained here + - `include`: The C++ API is fully-contained here - `src`: Compiled template specializations for the shared libraries - `docs`: Source code and scripts for building library documentation (doxygen + pydocs) - `python`: Source code for all Python source code. From 858338605545754c5199dc4051a2cd932bdaa91c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 09:44:20 -0500 Subject: [PATCH 027/167] Adding docs for the raft_export INSTALL_FILES option --- cpp/cmake/modules/raft_export.cmake | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index fbcd7fd684..2df4c5593d 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -107,6 +107,12 @@ calls to :cmake:command:`find_dependency`, or :cmake:command:`CPMFindPackage`. of your package. This makes sure all consumers properly setup these languages correctly. +``INSTALL_FILES`` + Optional boolean value denoting whether exported files should be installed + to the RAPIDS lib directory during the install stage. This is OFF by + default so the export files will only be installed into the project's build + directory. 
+ This is required as CMake's :cmake:command:`enable_language` only supports enabling languages for the current directory scope, and doesn't support being called from within functions. Marking languages here overcomes @@ -139,7 +145,6 @@ function(raft_export type project_name) # Choose the project version when an explicit version isn't provided set(RAPIDS_VERSION "${PROJECT_VERSION}") endif() - if(rapids_version_set) include("${rapids-cmake-dir}/export/detail/parse_version.cmake") rapids_export_parse_version(${RAPIDS_VERSION} rapids_orig rapids_project_version) @@ -165,6 +170,10 @@ function(raft_export type project_name) set(RAPIDS_PROJECT_FINAL_CODE_BLOCK "${${RAPIDS_FINAL_CODE_BLOCK}}") endif() + if(DEFINED RAPIDS_INSTALL_FILES AND NOT RAPIDS_INSTALL_FILES) + unset(RAPIDS_INSTALL_FILES) + endif() + # Write configuration and version files string(TOLOWER ${project_name} project_name) string(TOUPPER ${project_name} project_name_uppercase) From d518ba7d8a54754bcb138cc826654fd723e2140c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 09:46:38 -0500 Subject: [PATCH 028/167] Updating build.md --- BUILD.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/BUILD.md b/BUILD.md index 1bf3783fae..c2a4ac62b1 100644 --- a/BUILD.md +++ b/BUILD.md @@ -79,20 +79,19 @@ RAFT's cmake has the following configurable flags available:. 
| --- | --- | --- | --- | | BUILD_TESTS | ON, OFF | ON | Compile Googletests | | RAFT_COMPILE_LIBRARIES | ON, OFF | OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | -| RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library | -| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library | +| RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library | +| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. | -| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | +| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | +| RAFT_STATIC_LINK_LIBRARIES | ON, OFF | OFF | Statically link compiled libraft libraries | | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | | NVTX | ON, OFF | OFF | Enable NVTX Markers | -| CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | +| CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | | CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc | | CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime | Shared libraries are provided for the `libraft-nn` and `libraft-distance` components currently. The `libraft-nn` component depends upon [FAISS](https://github.com/facebookresearch/faiss) and the `RAFT_ENABLE_NN_DEPENDENCIES` option will build it from source if it is not already installed. - - ### Python Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. 
It is preferred to use `mamba`, as it provides significant speedup over `conda`. The following example will install create and install dependencies for a CUDA 11.5 conda environment: From 5f6f1eb45c6b52cfdda812e86fa1d891c762f9f2 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 11:04:03 -0500 Subject: [PATCH 029/167] pylibraft is compiling! --- cpp/CMakeLists.txt | 2 +- .../raft_distance/pairwise_distance.hpp | 41 +++++++++++++ cpp/src/distance/pairwise_distance.cu | 56 +++++++++++++++++ .../pylibraft/pylibraft/distance/__init__.py | 2 + .../pylibraft/distance/pairwise_distance.pxd | 43 +++++++++++++ .../pylibraft/distance/pairwise_distance.pyx | 60 +++++++++++++++++++ python/pylibraft/setup.py | 2 +- python/raft/setup.py | 2 +- 8 files changed, 205 insertions(+), 3 deletions(-) create mode 100644 cpp/include/raft_distance/pairwise_distance.hpp create mode 100644 cpp/src/distance/pairwise_distance.cu create mode 100644 python/pylibraft/pylibraft/distance/pairwise_distance.pxd create mode 100644 python/pylibraft/pylibraft/distance/pairwise_distance.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 27df57d150..2f198276e4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -187,7 +187,7 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) add_library(raft_distance_lib ${RAFT_LIB_TYPE} - src/distance/specializations/detail + src/distance/pairwise_distance.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu diff --git a/cpp/include/raft_distance/pairwise_distance.hpp b/cpp/include/raft_distance/pairwise_distance.hpp new file mode 100644 index 0000000000..7bd5ca3767 --- /dev/null +++ b/cpp/include/raft_distance/pairwise_distance.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace raft::distance::runtime { + void pairwise_distance(raft::handle_t const &handle, + float *x, + float *y, + float *dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg); + + void pairwise_distance(raft::handle_t const &handle, + double *x, + double *y, + double *dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg); +} \ No newline at end of file diff --git a/cpp/src/distance/pairwise_distance.cu b/cpp/src/distance/pairwise_distance.cu new file mode 100644 index 0000000000..9c113cc44c --- /dev/null +++ b/cpp/src/distance/pairwise_distance.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace raft::distance::runtime { + + void pairwise_distance(raft::handle_t const &handle, + float *x, + float *y, + float *dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg) { + + raft::distance::pairwise_distance(handle, + x, y, dists, m, n, k, metric, + isRowMajor, + metric_arg); + } + + void pairwise_distance(raft::handle_t const &handle, + double *x, + double *y, + double *dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg) { + raft::distance::pairwise_distance(handle, + x, y, dists, m, n, k, metric, + isRowMajor, + metric_arg); + } +} \ No newline at end of file diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py index 273b4497cc..278f6d9a81 100644 --- a/python/pylibraft/pylibraft/distance/__init__.py +++ b/python/pylibraft/pylibraft/distance/__init__.py @@ -12,3 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. # + +from pairwise_distance import distance as pairwise_distance \ No newline at end of file diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd new file mode 100644 index 0000000000..b0b9875ac7 --- /dev/null +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd @@ -0,0 +1,43 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from libcpp cimport bool +from pylibraft.distance.distance_type cimport DistanceType +from pylibraft.common.handle cimport handle_t + +cdef extern from "raft_distance/pairwise_distance.hpp" namespace "raft::distance::runtime": + + cdef void pairwise_distance(const handle_t &handle, + float *x, + float *y, + float *dists, + int m, + int n, + int k, + DistanceType metric, + bool isRowMajor, + float metric_arg) + + cdef void pairwise_distance(const handle_t &handle, + double *x, + double *y, + double *dists, + int m, + int n, + int k, + DistanceType metric, + bool isRowMajor, + float metric_arg) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx new file mode 100644 index 0000000000..e0da33c73b --- /dev/null +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -0,0 +1,60 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from libc.stdint cimport uintptr_t +from cython.operator cimport dereference as deref + +from pylibraft.distance.distance_type cimport DistanceType +from pylibraft.common.handle cimport handle_t +from pylibraft.distance.pairwise_distance import * + +def distance(X, Y, dists, metric="euclidean"): + """ + Compute pairwise distances between X and Y + + Parameters + ---------- + + X : CUDA array interface matrix shape (m, k) + Y : CUDA array interface matrix shape (n, k) + dists : Writable CUDA array interface matrix shape (m, n) + metric : string denoting the metric type + """ + + # TODO: Validate inputs, shapes, etc... + x_cai = X.__cuda_array_interface__ + y_cai = Y.__cuda_array_interface__ + dists_cai = dists.__cuda_array_interface__ + + m = x_cai["shape"][0] + n = y_cai["shape"][0] + k = dists_cai["shape"][0] + + x_ptr = x_cai["data"][0] + y_ptr = y_cai["data"][0] + d_ptr = dists_cai["data"][0] + + cdef handle_t *h = new handle_t() + + # TODO: Support single and double precision + pairwise_distance(deref(h), x_ptr, + y_ptr, + d_ptr, + m, + n, + k, + DistanceType2.L2Expanded, + true, 0.0) diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py index ec7e11ba85..e2340b377c 100644 --- a/python/pylibraft/setup.py +++ b/python/pylibraft/setup.py @@ -104,7 +104,7 @@ include_dirs = [cuda_include_dir, numpy.get_include(), - "../cpp/include/", + "../../cpp/include/", os.path.dirname(sysconfig.get_path("include"))] extensions = [ diff --git a/python/raft/setup.py b/python/raft/setup.py index 10beca1eb4..4af7ff2a88 100644 --- a/python/raft/setup.py +++ b/python/raft/setup.py @@ -106,7 +106,7 @@ include_dirs = [cuda_include_dir, numpy.get_include(), - "../cpp/include/", + "../../cpp/include/", os.path.dirname(sysconfig.get_path("include"))] extensions = [ From bd42373c869dcd19e0a990e0e91b91a6dcb28f16 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Mar 2022 15:14:25 -0500 Subject: [PATCH 030/167] Fixing style --- python/pylibraft/pylibraft/common/mdarray.pxd | 27 ------------------- .../pylibraft/pylibraft/distance/distance.pyx | 24 ----------------- .../pylibraft/distance/pairwise_distance.pxd | 3 ++- .../pylibraft/distance/pairwise_distance.pyx | 18 +++++++------ 4 files changed, 12 insertions(+), 60 deletions(-) delete mode 100644 python/pylibraft/pylibraft/common/mdarray.pxd delete mode 100644 python/pylibraft/pylibraft/distance/distance.pyx diff --git a/python/pylibraft/pylibraft/common/mdarray.pxd b/python/pylibraft/pylibraft/common/mdarray.pxd deleted file mode 100644 index 1f5e275b82..0000000000 --- a/python/pylibraft/pylibraft/common/mdarray.pxd +++ /dev/null @@ -1,27 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from .handle cimport handle_t -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "raft/mdarray.hpp" namespace "raft": - cdef cppclass device_matrix[T]: - pass - - cdef device_matrix[T] make_device_matrix[T](size_t n_rows, - size_t n_cols, - cuda_stream_view stream) diff --git a/python/pylibraft/pylibraft/distance/distance.pyx b/python/pylibraft/pylibraft/distance/distance.pyx deleted file mode 100644 index 89232600f6..0000000000 --- a/python/pylibraft/pylibraft/distance/distance.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pylibraft.common.handle cimport handle_t -from .distance_type import DistanceType -from pylibraft.common.mdarray cimport make_device_matrix, device_matrix - -cdef pairwise_distance(): - - cdef handle_t handle - cdef device_matrix[int] hellp = make_device_matrix[int](5, 10, handle.get_stream()) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd index b0b9875ac7..ad16be6113 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd @@ -18,7 +18,8 @@ from libcpp cimport bool from pylibraft.distance.distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t -cdef extern from "raft_distance/pairwise_distance.hpp" namespace "raft::distance::runtime": +cdef extern from "raft_distance/pairwise_distance.hpp" \ + namespace "raft::distance::runtime": cdef void pairwise_distance(const handle_t &handle, float *x, diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index e0da33c73b..08c63cbdce 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -21,6 +21,7 @@ from pylibraft.distance.distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t from 
pylibraft.distance.pairwise_distance import * + def distance(X, Y, dists, metric="euclidean"): """ Compute pairwise distances between X and Y @@ -50,11 +51,12 @@ def distance(X, Y, dists, metric="euclidean"): cdef handle_t *h = new handle_t() # TODO: Support single and double precision - pairwise_distance(deref(h), x_ptr, - y_ptr, - d_ptr, - m, - n, - k, - DistanceType2.L2Expanded, - true, 0.0) + pairwise_distance(deref(h), + x_ptr, + y_ptr, + d_ptr, + m, + n, + k, + DistanceType2.L2Expanded, + true, 0.0) From 084975c2f7a5c5b0247977e3b50b7cd3c3cbe352 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 15:37:59 -0500 Subject: [PATCH 031/167] Fixing cython style --- python/pylibraft/pylibraft/distance/pairwise_distance.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd index ad16be6113..b48af03e94 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd @@ -19,7 +19,7 @@ from pylibraft.distance.distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t cdef extern from "raft_distance/pairwise_distance.hpp" \ - namespace "raft::distance::runtime": + namespace "raft::distance::runtime": cdef void pairwise_distance(const handle_t &handle, float *x, From 1bb7fb5abe34e9306d4a5ae34c1fd7705c1eaf65 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Mar 2022 15:40:02 -0500 Subject: [PATCH 032/167] Fixing cpp style --- .../raft_distance/pairwise_distance.hpp | 42 ++++++------ cpp/src/distance/pairwise_distance.cu | 65 +++++++++---------- 2 files changed, 52 insertions(+), 55 deletions(-) diff --git a/cpp/include/raft_distance/pairwise_distance.hpp b/cpp/include/raft_distance/pairwise_distance.hpp index 7bd5ca3767..50fdbbdd8c 100644 --- a/cpp/include/raft_distance/pairwise_distance.hpp +++ b/cpp/include/raft_distance/pairwise_distance.hpp @@ -17,25 +17,25 @@ #include namespace raft::distance::runtime { - void pairwise_distance(raft::handle_t const &handle, - float *x, - float *y, - float *dists, - int m, - int n, - int k, - raft::distance::DistanceType metric, - bool isRowMajor, - float metric_arg); +void pairwise_distance(raft::handle_t const& handle, + float* x, + float* y, + float* dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg); - void pairwise_distance(raft::handle_t const &handle, - double *x, - double *y, - double *dists, - int m, - int n, - int k, - raft::distance::DistanceType metric, - bool isRowMajor, - float metric_arg); -} \ No newline at end of file +void pairwise_distance(raft::handle_t const& handle, + double* x, + double* y, + double* dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg); +} // namespace raft::distance::runtime \ No newline at end of file diff --git a/cpp/src/distance/pairwise_distance.cu b/cpp/src/distance/pairwise_distance.cu index 9c113cc44c..3a9ff469a1 100644 --- a/cpp/src/distance/pairwise_distance.cu +++ b/cpp/src/distance/pairwise_distance.cu @@ -14,43 +14,40 @@ * limitations under the License. 
*/ -#include #include -#include #include +#include +#include namespace raft::distance::runtime { - void pairwise_distance(raft::handle_t const &handle, - float *x, - float *y, - float *dists, - int m, - int n, - int k, - raft::distance::DistanceType metric, - bool isRowMajor, - float metric_arg) { - - raft::distance::pairwise_distance(handle, - x, y, dists, m, n, k, metric, - isRowMajor, - metric_arg); - } +void pairwise_distance(raft::handle_t const& handle, + float* x, + float* y, + float* dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg) +{ + raft::distance::pairwise_distance( + handle, x, y, dists, m, n, k, metric, isRowMajor, metric_arg); +} - void pairwise_distance(raft::handle_t const &handle, - double *x, - double *y, - double *dists, - int m, - int n, - int k, - raft::distance::DistanceType metric, - bool isRowMajor, - float metric_arg) { - raft::distance::pairwise_distance(handle, - x, y, dists, m, n, k, metric, - isRowMajor, - metric_arg); - } -} \ No newline at end of file +void pairwise_distance(raft::handle_t const& handle, + double* x, + double* y, + double* dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg) +{ + raft::distance::pairwise_distance( + handle, x, y, dists, m, n, k, metric, isRowMajor, metric_arg); +} +} // namespace raft::distance::runtime \ No newline at end of file From fb48427d07fad5f401b2aa0b2790b13f07d7433e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Mar 2022 16:12:49 -0500 Subject: [PATCH 033/167] Updating conda package to use raft-runtime instead of raft-headers --- conda/recipes/pylibraft/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 2565966401..6da1c25c21 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -29,12 +29,11 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-headers {{ version }} + - libraft-runtime {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 run: - python x.x - - libraft-headers {{ version }} - libraft-distance {{ version }} - cuda-python >=11.5,<12.0 - joblib >=0.11 From aaa72e4758fdab41aeb070998ce5db5b1da1d516 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 18:23:53 -0500 Subject: [PATCH 034/167] adding logging. --- .../raft/common/detail/callback_sink.hpp | 71 ++++++ cpp/include/raft/common/detail/logger.hpp | 153 +++++++++++++ cpp/include/raft/common/detail/scatter.cuh | 52 +++++ cpp/include/raft/common/logger.hpp | 211 ++++++++++++++++++ cpp/include/raft/common/scatter.cuh | 42 +--- cpp/test/CMakeLists.txt | 1 + cpp/test/common/logger.cpp | 97 ++++++++ 7 files changed, 592 insertions(+), 35 deletions(-) create mode 100644 cpp/include/raft/common/detail/callback_sink.hpp create mode 100644 cpp/include/raft/common/detail/logger.hpp create mode 100644 cpp/include/raft/common/detail/scatter.cuh create mode 100644 cpp/include/raft/common/logger.hpp create mode 100644 cpp/test/common/logger.cpp diff --git a/cpp/include/raft/common/detail/callback_sink.hpp b/cpp/include/raft/common/detail/callback_sink.hpp new file mode 100644 index 0000000000..ecd869ee4f --- /dev/null +++ b/cpp/include/raft/common/detail/callback_sink.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#define SPDLOG_HEADER_ONLY +#include +#include +#include + +namespace spdlog::sinks { + +typedef void (*LogCallback)(int lvl, const char* msg); + +template +class CallbackSink : public base_sink { + public: + explicit CallbackSink(std::string tag = "spdlog", + LogCallback callback = nullptr, + void (*flush)() = nullptr) + : _callback{callback}, _flush{flush} {}; + + void set_callback(LogCallback callback) { _callback = callback; } + void set_flush(void (*flush)()) { _flush = flush; } + + protected: + void sink_it_(const details::log_msg& msg) override + { + spdlog::memory_buf_t formatted; + base_sink::formatter_->format(msg, formatted); + std::string msg_string = fmt::to_string(formatted); + + if (_callback) { + _callback(static_cast(msg.level), msg_string.c_str()); + } else { + std::cout << msg_string; + } + } + + void flush_() override + { + if (_flush) { + _flush(); + } else { + std::cout << std::flush; + } + } + + LogCallback _callback; + void (*_flush)(); +}; + +using callback_sink_mt = CallbackSink; +using callback_sink_st = CallbackSink; + +} // end namespace spdlog::sinks \ No newline at end of file diff --git a/cpp/include/raft/common/detail/logger.hpp b/cpp/include/raft/common/detail/logger.hpp new file mode 100644 index 0000000000..053b6e3c88 --- /dev/null +++ b/cpp/include/raft/common/detail/logger.hpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#define SPDLOG_HEADER_ONLY +#include // NOLINT +#include // NOLINT + +#include + +#include +#include +#include +#include + +#include + +/** + * @defgroup logging levels used in raft + * + * @note exactly match the corresponding ones (but reverse in terms of value) + * in spdlog for wrapping purposes + * + * @{ + */ +#define RAFT_LEVEL_TRACE 6 +#define RAFT_LEVEL_DEBUG 5 +#define RAFT_LEVEL_INFO 4 +#define RAFT_LEVEL_WARN 3 +#define RAFT_LEVEL_ERROR 2 +#define RAFT_LEVEL_CRITICAL 1 +#define RAFT_LEVEL_OFF 0 +/** @} */ + +#if !defined(RAFT_ACTIVE_LEVEL) +#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_DEBUG +#endif + +namespace spdlog { +class logger; +namespace sinks { +template +class CallbackSink; +using callback_sink_mt = CallbackSink; +}; // namespace sinks +}; // namespace spdlog + +namespace raft::detail { + +/** + * @defgroup CStringFormat Expand a C-style format string + * + * @brief Expands C-style formatted string into std::string + * + * @param[in] fmt format string + * @param[in] vl respective values for each of format modifiers in the string + * + * @return the expanded `std::string` + * + * @{ + */ +std::string format(const char* fmt, va_list& vl) +{ + char buf[4096]; + vsnprintf(buf, sizeof(buf), fmt, vl); + return std::string(buf); +} + +std::string format(const char* fmt, ...) 
+{ + va_list vl; + va_start(vl, fmt); + std::string str = format(fmt, vl); + va_end(vl); + return str; +} +/** @} */ + +int convert_level_to_spdlog(int level) +{ + level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); + return RAFT_LEVEL_TRACE - level; +} + +}; // namespace raft::detail + +/** + * @defgroup loggerMacros Helper macros for dealing with logging + * @{ + */ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) +#define RAFT_LOG_TRACE(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::detail::format(fmt, ##__VA_ARGS__); \ + raft::logger::get().log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_TRACE(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) +#define RAFT_LOG_DEBUG(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::detail::format(fmt, ##__VA_ARGS__); \ + raft::logger::get().log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_DEBUG(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) +#define RAFT_LOG_INFO(fmt, ...) raft::logger::get().log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_INFO(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) +#define RAFT_LOG_WARN(fmt, ...) raft::logger::get().log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_WARN(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) +#define RAFT_LOG_ERROR(fmt, ...) raft::logger::get().log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_ERROR(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) +#define RAFT_LOG_CRITICAL(fmt, ...) raft::logger::get().log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_CRITICAL(fmt, ...) 
void(0) +#endif +/** @} */ \ No newline at end of file diff --git a/cpp/include/raft/common/detail/scatter.cuh b/cpp/include/raft/common/detail/scatter.cuh new file mode 100644 index 0000000000..e158999b1b --- /dev/null +++ b/cpp/include/raft/common/detail/scatter.cuh @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft::detail { + +template +__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +{ + typedef TxN_t DataVec; + typedef TxN_t IdxVec; + IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); + tid *= VecLen; + if (tid >= len) return; + IdxVec idxIn; + idxIn.load(idx, tid); + DataVec dataIn; +#pragma unroll + for (int i = 0; i < VecLen; ++i) { + auto inPos = idxIn.val.data[i]; + dataIn.val.data[i] = op(in[inPos], tid + i); + } + dataIn.store(out, tid); +} + +template +void scatterImpl( + DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) +{ + const IdxT nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxT)TPB); + scatterKernel<<>>(out, in, idx, len, op); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +} // namespace raft::detail diff --git a/cpp/include/raft/common/logger.hpp b/cpp/include/raft/common/logger.hpp new file mode 100644 index 0000000000..aa7c55e863 --- /dev/null +++ b/cpp/include/raft/common/logger.hpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +namespace raft { + +static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); + +/** + * @brief The main Logging class for raft library. + * + * This class acts as a thin wrapper over the underlying `spdlog` interface. The + * design is done in this way in order to avoid us having to also ship `spdlog` + * header files in our installation. + * + * @todo This currently only supports logging to stdout. 
Need to add support in + * future to add custom loggers as well [Issue #2046] + */ +class logger { + public: + // @todo setting the logger once per process with + logger(std::string const& name_ = "") + : sink{std::make_shared()}, + spdlogger{std::make_shared(name_, sink)}, + cur_pattern() + { + set_pattern(default_log_pattern); + set_level(RAFT_LEVEL_INFO); + } + /** + * @brief Singleton method to get the underlying logger object + * + * @return the singleton logger object + */ + static logger& get(std::string const& name = "") + { + if (log_map.find(name) == log_map.end()) { + log_map[name] = std::make_shared(name); + } + return *log_map[name]; + } + + /** + * @brief Set the logging level. + * + * Only messages with level equal or above this will be printed + * + * @param[in] level logging level + * + * @note The log level will actually be set only if the input is within the + * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll + * be ignored. See documentation of decisiontree for how this gets used + */ + void set_level(int level) + { + level = detail::convert_level_to_spdlog(level); + spdlogger->set_level(static_cast(level)); + } + + /** + * @brief Set the logging pattern + * + * @param[in] pattern the pattern to be set. 
Refer this link + * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting + * to know the right syntax of this pattern + */ + void set_pattern(const std::string& pattern) + { + cur_pattern = pattern; + spdlogger->set_pattern(pattern); + } + + /** + * @brief Register a callback function to be run in place of usual log call + * + * @param[in] callback the function to be run on all logged messages + */ + void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } + + /** + * @brief Register a flush function compatible with the registered callback + * + * @param[in] flush the function to use when flushing logs + */ + void set_flush(void (*flush)()) { sink->set_flush(flush); } + + /** + * @brief Tells whether messages will be logged for the given log level + * + * @param[in] level log level to be checked for + * @return true if messages will be logged for this level, else false + */ + bool should_log_for(int level) const + { + level = detail::convert_level_to_spdlog(level); + auto level_e = static_cast(level); + return spdlogger->should_log(level_e); + } + + /** + * @brief Query for the current log level + * + * @return the current log level + */ + int get_level() const + { + auto level_e = spdlogger->level(); + return RAFT_LEVEL_TRACE - static_cast(level_e); + } + + /** + * @brief Get the current logging pattern + * @return the pattern + */ + std::string get_pattern() const { return cur_pattern; } + + /** + * @brief Main logging method + * + * @param[in] level logging level of this message + * @param[in] fmt C-like format string, followed by respective params + */ + void log(int level, const char* fmt, ...) 
+ { + level = detail::convert_level_to_spdlog(level); + auto level_e = static_cast(level); + // explicit check to make sure that we only expand messages when required + if (spdlogger->should_log(level_e)) { + va_list vl; + va_start(vl, fmt); + auto msg = detail::format(fmt, vl); + va_end(vl); + spdlogger->log(level_e, msg); + } + } + + /** + * @brief Flush logs by calling flush on underlying logger + */ + void flush() { spdlogger->flush(); } + + ~logger() {} + + private: + logger(); + + static inline std::unordered_map> log_map; + std::shared_ptr sink; + std::shared_ptr spdlogger; + std::string cur_pattern; + int cur_level; +}; // class logger + +/** + * @brief RAII based pattern setter for logger class + * + * @code{.cpp} + * { + * PatternSetter _("%l -- %v"); + * RAFT_LOG_INFO("Test message\n"); + * } + * @endcode + */ +class PatternSetter { + public: + /** + * @brief Set the pattern for the rest of the log messages + * @param[in] pattern pattern to be set + */ + PatternSetter(const std::string& pattern = "%v") : prev_pattern() + { + prev_pattern = logger::get().get_pattern(); + logger::get().set_pattern(pattern); + } + + /** + * @brief This will restore the previous pattern that was active during the + * moment this object was created + */ + ~PatternSetter() { logger::get().set_pattern(prev_pattern); } + + private: + std::string prev_pattern; +}; // class PatternSetter + +}; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 2d25b85a50..04b4393261 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -16,39 +16,11 @@ #pragma once +#include #include -#include namespace raft { -template -__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) -{ - typedef TxN_t DataVec; - typedef TxN_t IdxVec; - IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); - tid *= VecLen; - if (tid >= len) return; - 
IdxVec idxIn; - idxIn.load(idx, tid); - DataVec dataIn; -#pragma unroll - for (int i = 0; i < VecLen; ++i) { - auto inPos = idxIn.val.data[i]; - dataIn.val.data[i] = op(in[inPos], tid + i); - } - dataIn.store(out, tid); -} - -template -void scatterImpl( - DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) -{ - const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); - scatterKernel<<>>(out, in, idx, len, op); - RAFT_CUDA_TRY(cudaGetLastError()); -} - /** * @brief Performs scatter operation based on the input indexing array * @tparam DataT data type whose array gets scattered @@ -79,17 +51,17 @@ void scatter(DataT* out, constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; size_t bytes = len * MaxPerElem; if (16 / MaxPerElem && bytes % 16 == 0) { - scatterImpl(out, in, idx, len, op, stream); + detail::scatterImpl(out, in, idx, len, op, stream); } else if (8 / MaxPerElem && bytes % 8 == 0) { - scatterImpl(out, in, idx, len, op, stream); + detail::scatterImpl(out, in, idx, len, op, stream); } else if (4 / MaxPerElem && bytes % 4 == 0) { - scatterImpl(out, in, idx, len, op, stream); + detail::scatterImpl(out, in, idx, len, op, stream); } else if (2 / MaxPerElem && bytes % 2 == 0) { - scatterImpl(out, in, idx, len, op, stream); + detail::scatterImpl(out, in, idx, len, op, stream); } else if (1 / MaxPerElem) { - scatterImpl(out, in, idx, len, op, stream); + detail::scatterImpl(out, in, idx, len, op, stream); } else { - scatterImpl(out, in, idx, len, op, stream); + detail::scatterImpl(out, in, idx, len, op, stream); } } diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0d3121fee6..f8ae28f550 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -16,6 +16,7 @@ # keep the files in alphabetical order! 
add_executable(test_raft + test/common/logger.cpp test/common/seive.cu test/cudart_utils.cpp test/cluster_solvers.cu diff --git a/cpp/test/common/logger.cpp b/cpp/test/common/logger.cpp new file mode 100644 index 0000000000..ff63b8249e --- /dev/null +++ b/cpp/test/common/logger.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft { + +TEST(logger, Test) +{ + RAFT_LOG_CRITICAL("This is a critical message"); + RAFT_LOG_ERROR("This is an error message"); + RAFT_LOG_WARN("This is a warning message"); + RAFT_LOG_INFO("This is an info message"); + + logger::get().set_level(RAFT_LEVEL_WARN); + ASSERT_EQ(RAFT_LEVEL_WARN, logger::get().get_level()); + logger::get().set_level(RAFT_LEVEL_INFO); + ASSERT_EQ(RAFT_LEVEL_INFO, logger::get().get_level()); + + ASSERT_FALSE(logger::get().should_log_for(RAFT_LEVEL_TRACE)); + ASSERT_FALSE(logger::get().should_log_for(RAFT_LEVEL_DEBUG)); + ASSERT_TRUE(logger::get().should_log_for(RAFT_LEVEL_INFO)); + ASSERT_TRUE(logger::get().should_log_for(RAFT_LEVEL_WARN)); +} + +std::string logged = ""; +void exampleCallback(int lvl, const char* msg) { logged = std::string(msg); } + +int flushCount = 0; +void exampleFlush() { ++flushCount; } + +class loggerTest : public ::testing::Test { + protected: + void SetUp() override + { + flushCount = 0; + logged = ""; + logger::get().set_level(RAFT_LEVEL_TRACE); + } + + void 
TearDown() override + { + logger::get().set_callback(nullptr); + logger::get().set_flush(nullptr); + logger::get().set_level(RAFT_LEVEL_INFO); + } +}; + +TEST_F(loggerTest, callback) +{ + std::string testMsg; + logger::get().set_callback(exampleCallback); + + testMsg = "This is a critical message"; + RAFT_LOG_CRITICAL(testMsg.c_str()); + ASSERT_TRUE(logged.find(testMsg) != std::string::npos); + + testMsg = "This is an error message"; + RAFT_LOG_ERROR(testMsg.c_str()); + ASSERT_TRUE(logged.find(testMsg) != std::string::npos); + + testMsg = "This is a warning message"; + RAFT_LOG_WARN(testMsg.c_str()); + ASSERT_TRUE(logged.find(testMsg) != std::string::npos); + + testMsg = "This is an info message"; + RAFT_LOG_INFO(testMsg.c_str()); + ASSERT_TRUE(logged.find(testMsg) != std::string::npos); + + testMsg = "This is a debug message"; + RAFT_LOG_DEBUG(testMsg.c_str()); + ASSERT_TRUE(logged.find(testMsg) != std::string::npos); +} + +TEST_F(loggerTest, flush) +{ + logger::get().set_flush(exampleFlush); + logger::get().flush(); + ASSERT_EQ(1, flushCount); +} + +} // namespace raft \ No newline at end of file From fba776961b39cc77d6f5eaaf5fb7014e41467f3a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 18:55:16 -0500 Subject: [PATCH 035/167] Adding libraft-distance to build-time dependency for pylibraft --- conda/recipes/pylibraft/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 6da1c25c21..b3880d9d61 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -30,6 +30,7 @@ requirements: - cython>=0.29,<0.30 - rmm {{ minor_version }} - libraft-runtime {{ version }} + - libraft-distance {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 run: From 4fc06e647704a3ae913206759ce5841c86cb642f Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 9 Mar 2022 20:24:14 -0500 Subject: [PATCH 036/167] Adding libraft-distance to gpu build --- ci/gpu/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index afc6056b42..34d6d3a07e 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -61,6 +61,7 @@ gpuci_mamba_retry install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c n "breathe" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ + "libraft-distance=${MINOR_VERSION}" \ "ucx-py=${UCX_PY_VERSION}" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ From c6a326acb7afbf7d450508f0b6b042b8e1608ff9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 22:31:25 -0500 Subject: [PATCH 037/167] Explicity building pylibraft --- ci/gpu/build.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 34d6d3a07e..3404638a6c 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -61,7 +61,6 @@ gpuci_mamba_retry install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c n "breathe" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "libraft-distance=${MINOR_VERSION}" \ "ucx-py=${UCX_PY_VERSION}" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ @@ -94,7 +93,7 @@ gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -gpuci_logger "Build C++ and Python targets" +gpuci_logger "Build C++ and pyraft targets" # These should link against the existing shared libs if hasArg --skip-tests; then "$WORKSPACE/build.sh" pyraft libraft -v --nogtest @@ -102,6 +101,14 @@ else "$WORKSPACE/build.sh" pyraft libraft -v fi +gpuci_logger "Build C++ and pylibraft targets" +# These should link against the existing shared libs +if hasArg --skip-tests; then + "$WORKSPACE/build.sh" pylibraft libraft -v 
--nogtest +else + "$WORKSPACE/build.sh" pylibraft libraft -v +fi + gpuci_logger "sccache stats" sccache --show-stats @@ -129,7 +136,9 @@ gpuci_logger "GoogleTest for raft" cd "$WORKSPACE/cpp/build" GTEST_OUTPUT="xml:$WORKSPACE/test-results/raft_cpp/" ./test_raft -gpuci_logger "Python pytest for raft" -cd "$WORKSPACE/python" +gpuci_logger "Python pytest for pyraft" +cd "$WORKSPACE/python/raft" +python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s +cd "$WORKSPACE/python/pylibraft" python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s From 271b38b666869dfffdc14c1b114852795d0c828e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 9 Mar 2022 22:35:20 -0500 Subject: [PATCH 038/167] Updating docs --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 22979b102b..eea300d3a8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.insert(0, os.path.abspath('sphinxext')) -sys.path.insert(0, os.path.abspath('../../python')) +sys.path.insert(0, os.path.abspath('../../python/pyraft')) from github_link import make_linkcode_resolve # noqa From f482b67b53b1adbdba18b66b98a42de89d9a6912 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 13:55:36 -0500 Subject: [PATCH 039/167] Updates --- ci/gpu/build.sh | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 3404638a6c..b7ce351f39 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -93,22 +93,14 @@ gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -gpuci_logger "Build C++ and pyraft targets" +gpuci_logger "Build C++ and Python targets" # These should link against the existing shared libs if hasArg --skip-tests; then - "$WORKSPACE/build.sh" pyraft libraft -v --nogtest + "$WORKSPACE/build.sh" pyraft pylibraft libraft -v --nogtest else "$WORKSPACE/build.sh" pyraft libraft -v fi -gpuci_logger "Build C++ and pylibraft targets" -# These should link against the existing shared libs -if hasArg --skip-tests; then - "$WORKSPACE/build.sh" pylibraft libraft -v --nogtest -else - "$WORKSPACE/build.sh" pylibraft libraft -v -fi - gpuci_logger "sccache stats" sccache --show-stats @@ -140,5 +132,6 @@ gpuci_logger "Python pytest for pyraft" cd "$WORKSPACE/python/raft" python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s +gpuci_logger "Python pytest for pylibraft" cd "$WORKSPACE/python/pylibraft" python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s From 6fa6cd94dd5291cb9f0e3ddeb87ec8aaa17b5565 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 16:32:24 -0500 Subject: [PATCH 040/167] Updates --- build.sh | 1 - cpp/CMakeLists.txt | 82 +++++++++++++---------------- cpp/cmake/modules/raft_export.cmake | 17 +----- cpp/cmake/versions.json | 11 ++++ 4 files changed, 50 insertions(+), 61 deletions(-) create mode 100644 cpp/cmake/versions.json diff --git a/build.sh b/build.sh index b43b683180..7f287e8e22 100755 --- a/build.sh +++ b/build.sh @@ -155,7 +155,6 @@ if (( ${CLEAN} == 1 )); then find ${bd} -mindepth 1 -delete rmdir ${bd} || true fi - done cd ${REPODIR}/python/raft diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f198276e4..6dbacb109d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -122,29 +122,12 @@ if(BUILD_TESTS) include(cmake/thirdparty/get_ucx.cmake) endif() -############################################################################## -# - raft_runtime ------------------------------------------------------------- - -add_library(raft_runtime INTERFACE) -add_library(raft::runtime ALIAS raft_runtime) -set_target_properties(raft_runtime PROPERTIES EXPORT_NAME runtime) - -target_link_libraries(raft_runtime INTERFACE - CUDA::cublas - CUDA::curand - CUDA::cusolver - CUDA::cudart - CUDA::cusparse - rmm::rmm) - -target_compile_definitions(raft_runtime INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft_runtime INTERFACE cxx_std_17 $) - ############################################################################## # - raft --------------------------------------------------------------------- add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) +#set_target_properties(raft PROPERTIES EXPORT_NAME raft) target_include_directories(raft INTERFACE "$" @@ -152,8 +135,8 @@ target_include_directories(raft INTERFACE target_link_libraries(raft INTERFACE raft::Thrust - raft::runtime $<$:CUDA::nvToolsExt> + ${RAFT_LINK_LIBS} cuco::cuco std::mdspan) @@ -175,6 +158,30 @@ set(RAFT_LIB_TYPE SHARED) if(${RAFT_STATIC_LINK_LIBRARIES}) set(RAFT_LIB_TYPE 
STATIC) endif() + +############################################################################## +# - raft_runtime ------------------------------------------------------------- + +set(RAFT_LINK_LIBS + CUDA::cublas + CUDA::curand + CUDA::cusolver + CUDA::cudart + CUDA::cusparse + rmm::rmm) + +add_library(raft_runtime INTERFACE) +add_library(raft::runtime ALIAS raft_runtime) + +target_include_directories(raft_runtime INTERFACE + "$" + "$" + ) +target_link_libraries(raft_runtime INTERFACE ${RAFT_LINK_LIBS}) + +target_compile_definitions(raft_runtime INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft_runtime INTERFACE cxx_std_17 $) + ############################################################################## # - raft_distance ------------------------------------------------------------ add_library(raft_distance INTERFACE) @@ -234,7 +241,8 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() -target_link_libraries(raft_distance INTERFACE raft::runtime +target_link_libraries(raft_distance INTERFACE + raft::runtime $ $ ) @@ -290,9 +298,11 @@ include(CPack) install(TARGETS raft_runtime DESTINATION ${lib_dir} EXPORT raft-runtime-exports) + install(TARGETS raft - DESTINATION ${PROJECT_BINARY_DIR} - EXPORT raft-exports) + DESTINATION ${lib_dir} + EXPORT raft-runtime-exports) + install(TARGETS raft_distance DESTINATION ${lib_dir} EXPORT raft-distance-exports) @@ -312,6 +322,7 @@ if(TARGET raft_nn_lib) EXPORT raft-nn-lib-exports) endif() + install(DIRECTORY include/raft_runtime DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) @@ -343,7 +354,7 @@ Imported Targets: set(code_string [=[ if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS EXCLUDE_FROM_ALL TRUE) + thrust_create_target(raft::Thrust FROM_OPTIONS) endif() if(distance IN_LIST raft_FIND_COMPONENTS) @@ -364,37 +375,20 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) -raft_export(INSTALL runtime 
+raft_export(INSTALL raft_runtime COMPONENTS nn distance EXPORT_SET raft-runtime-exports - GLOBAL_TARGETS runtime nn distance - NAMESPACE raft:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string - INSTALL_FILES ON) - -raft_export(INSTALL raft - EXPORT_SET raft-exports - GLOBAL_TARGETS raft runtime raft_distance raft_nn - COMPONENTS nn distance + GLOBAL_TARGETS raft nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) ############################################################################## # - build export ------------------------------------------------------------- -raft_export(BUILD runtime +raft_export(BUILD raft_runtime EXPORT_SET raft-runtime-exports COMPONENTS nn distance - GLOBAL_TARGETS runtime raft_distance raft_nn - DOCUMENTATION doc_string - NAMESPACE raft:: - FINAL_CODE_BLOCK code_string) - -raft_export(BUILD raft - EXPORT_SET raft-exports - GLOBAL_TARGETS raft runtime raft_distance raft_nn - COMPONENTS nn distance + GLOBAL_TARGETS raft raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string) diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index 2df4c5593d..6a66b5420b 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -32,7 +32,6 @@ Generate a projects -Config.cmake module and all related information [ DOCUMENTATION ] [ FINAL_CODE_BLOCK ] [ LANGUAGES ] - [ INSTALL_FILES ON|OFF ] ) The :cmake:command:`raft_export` function allow projects to easily generate a fully @@ -107,12 +106,6 @@ calls to :cmake:command:`find_dependency`, or :cmake:command:`CPMFindPackage`. of your package. This makes sure all consumers properly setup these languages correctly. -``INSTALL_FILES`` - Optional boolean value denoting whether exported files should be installed - to the RAPIDS lib directory during the install stage. 
This is OFF by - default so the export files will only be installed into the project's build - directory. - This is required as CMake's :cmake:command:`enable_language` only supports enabling languages for the current directory scope, and doesn't support being called from within functions. Marking languages here overcomes @@ -129,7 +122,7 @@ function(raft_export type project_name) string(TOLOWER ${type} type) set(options "") - set(one_value EXPORT_SET VERSION NAMESPACE DOCUMENTATION FINAL_CODE_BLOCK INSTALL_FILES) + set(one_value EXPORT_SET VERSION NAMESPACE DOCUMENTATION FINAL_CODE_BLOCK) set(multi_value COMPONENTS GLOBAL_TARGETS LANGUAGES) cmake_parse_arguments(RAPIDS "${options}" "${one_value}" "${multi_value}" ${ARGN}) @@ -170,10 +163,6 @@ function(raft_export type project_name) set(RAPIDS_PROJECT_FINAL_CODE_BLOCK "${${RAPIDS_FINAL_CODE_BLOCK}}") endif() - if(DEFINED RAPIDS_INSTALL_FILES AND NOT RAPIDS_INSTALL_FILES) - unset(RAPIDS_INSTALL_FILES) - endif() - # Write configuration and version files string(TOLOWER ${project_name} project_name) string(TOUPPER ${project_name} project_name_uppercase) @@ -184,10 +173,6 @@ function(raft_export type project_name) set(scratch_dir "${PROJECT_BINARY_DIR}/rapids-cmake/${project_name}/export") - if(NOT DEFINED RAPIDS_INSTALL_FILES) - set(install_location "${scratch_dir}") - endif() - configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" "${scratch_dir}/${project_name}-config.cmake" INSTALL_DESTINATION "${install_location}") diff --git a/cpp/cmake/versions.json b/cpp/cmake/versions.json new file mode 100644 index 0000000000..eac6e5b448 --- /dev/null +++ b/cpp/cmake/versions.json @@ -0,0 +1,11 @@ +{ + "packages" : { + "Thrust" : { + "version" : "1.12.0", + "git_url" : "https://github.com/NVIDIA/thrust.git", + "git_tag" : "${version}", + "git_shallow" : true, + "exclude_from_all" : true + } + } +} \ No newline at end of file From f3120426cdc23525c449e787489b68917570b9eb Mon Sep 17 00:00:00 
2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 16:38:07 -0500 Subject: [PATCH 041/167] Updating runtime target --- cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6dbacb109d..8e25fe7cb8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -242,7 +242,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() target_link_libraries(raft_distance INTERFACE - raft::runtime + raft::raft_runtime $ $ ) @@ -285,7 +285,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) endif() -target_link_libraries(raft_nn INTERFACE raft::runtime faiss::faiss +target_link_libraries(raft_nn INTERFACE raft::raft_runtime faiss::faiss $ $) From a4cad22df602e9e70f2522c5ab2c596892699654 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 16:46:49 -0500 Subject: [PATCH 042/167] Updates --- cpp/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8e25fe7cb8..dc86fc2dab 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -242,7 +242,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() target_link_libraries(raft_distance INTERFACE - raft::raft_runtime + raft::runtime $ $ ) @@ -285,7 +285,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) endif() -target_link_libraries(raft_nn INTERFACE raft::raft_runtime faiss::faiss +target_link_libraries(raft_nn INTERFACE raft::runtime faiss::faiss $ $) @@ -375,7 +375,7 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) -raft_export(INSTALL raft_runtime +raft_export(INSTALL raft COMPONENTS nn distance EXPORT_SET raft-runtime-exports GLOBAL_TARGETS raft nn distance @@ -385,7 +385,7 @@ raft_export(INSTALL raft_runtime ############################################################################## # - build export 
------------------------------------------------------------- -raft_export(BUILD raft_runtime +raft_export(BUILD raft EXPORT_SET raft-runtime-exports COMPONENTS nn distance GLOBAL_TARGETS raft raft_distance raft_nn From 8ccd270577a9a9e2d0bdca2725c4466f8643fb70 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 16:48:39 -0500 Subject: [PATCH 043/167] exporting raft-exports --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dc86fc2dab..559334e647 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -301,7 +301,7 @@ install(TARGETS raft_runtime install(TARGETS raft DESTINATION ${lib_dir} - EXPORT raft-runtime-exports) + EXPORT raft-exports) install(TARGETS raft_distance DESTINATION ${lib_dir} From 64677059d01beb255a983b18f5cd397d2d807a6b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 16:58:36 -0500 Subject: [PATCH 044/167] Using raft::headers and raft::raft instead of raft::runtimne and raft::raft --- cpp/CMakeLists.txt | 50 +++++++++++------------ cpp/cmake/thirdparty/get_cuco.cmake | 2 +- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 +- cpp/cmake/thirdparty/get_mdspan.cmake | 4 +- cpp/cmake/thirdparty/get_rmm.cmake | 4 +- cpp/cmake/thirdparty/get_thrust.cmake | 4 +- 7 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 559334e647..211a4466d8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -125,23 +125,23 @@ endif() ############################################################################## # - raft --------------------------------------------------------------------- -add_library(raft INTERFACE) -add_library(raft::raft ALIAS raft) +add_library(raft_headers INTERFACE) +add_library(raft::headers ALIAS raft_headers) #set_target_properties(raft PROPERTIES EXPORT_NAME raft) -target_include_directories(raft INTERFACE 
+target_include_directories(raft_headers INTERFACE "$" "$") -target_link_libraries(raft INTERFACE +target_link_libraries(raft_headers INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> ${RAFT_LINK_LIBS} cuco::cuco std::mdspan) -target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft INTERFACE cxx_std_17 $) +target_compile_definitions(raft_headers INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft_headers INTERFACE cxx_std_17 $) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" @@ -170,17 +170,17 @@ set(RAFT_LINK_LIBS CUDA::cusparse rmm::rmm) -add_library(raft_runtime INTERFACE) -add_library(raft::runtime ALIAS raft_runtime) +add_library(raft INTERFACE) +add_library(raft::raft ALIAS raft) -target_include_directories(raft_runtime INTERFACE +target_include_directories(raft INTERFACE "$" "$" ) -target_link_libraries(raft_runtime INTERFACE ${RAFT_LINK_LIBS}) +target_link_libraries(raft INTERFACE ${RAFT_LINK_LIBS}) -target_compile_definitions(raft_runtime INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft_runtime INTERFACE cxx_std_17 $) +target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft INTERFACE cxx_std_17 $) ############################################################################## # - raft_distance ------------------------------------------------------------ @@ -228,7 +228,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) ) set_target_properties(raft_distance_lib PROPERTIES OUTPUT_NAME raft_distance) - target_link_libraries(raft_distance_lib PRIVATE raft::raft) + target_link_libraries(raft_distance_lib PRIVATE raft::headers) target_compile_options(raft_distance_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -242,7 +242,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() target_link_libraries(raft_distance INTERFACE - raft::runtime + raft::raft $ $ ) 
@@ -272,7 +272,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) ) set_target_properties(raft_nn_lib PROPERTIES OUTPUT_NAME raft_nn) - target_link_libraries(raft_nn_lib PRIVATE raft::raft faiss::faiss) + target_link_libraries(raft_nn_lib PRIVATE raft::headers faiss::faiss) target_compile_options(raft_nn_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -285,7 +285,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) endif() -target_link_libraries(raft_nn INTERFACE raft::runtime faiss::faiss +target_link_libraries(raft_nn INTERFACE raft::raft faiss::faiss $ $) @@ -295,14 +295,14 @@ rapids_cmake_install_lib_dir( lib_dir ) include(GNUInstallDirs) include(CPack) -install(TARGETS raft_runtime - DESTINATION ${lib_dir} - EXPORT raft-runtime-exports) - install(TARGETS raft DESTINATION ${lib_dir} EXPORT raft-exports) +install(TARGETS raft_headers + DESTINATION ${lib_dir} + EXPORT raft-headers-exports) + install(TARGETS raft_distance DESTINATION ${lib_dir} EXPORT raft-distance-exports) @@ -344,8 +344,8 @@ Optional Components: - distance Imported Targets: + - raft::headers - raft::raft - - raft::runtime - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component @@ -377,8 +377,8 @@ endif() include(cmake/modules/raft_export.cmake) raft_export(INSTALL raft COMPONENTS nn distance - EXPORT_SET raft-runtime-exports - GLOBAL_TARGETS raft nn distance + EXPORT_SET raft-exports + GLOBAL_TARGETS headers nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) @@ -386,9 +386,9 @@ raft_export(INSTALL raft ############################################################################## # - build export ------------------------------------------------------------- raft_export(BUILD raft - EXPORT_SET raft-runtime-exports + EXPORT_SET raft-exports COMPONENTS nn distance - GLOBAL_TARGETS raft raft_distance raft_nn + GLOBAL_TARGETS raft_headers raft_distance raft_nn DOCUMENTATION 
doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index da733d0ef1..4e74fa9cd6 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -18,7 +18,7 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-headers-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 72fb0e18c6..8a48b9466c 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -17,7 +17,7 @@ function(find_and_configure_gtest ) include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest(BUILD_EXPORT_SET raft-exports + rapids_cpm_gtest(BUILD_EXPORT_SET raft-headers-exports EXCLUDE_FROM_ALL TRUE) if(GTest_ADDED) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 4333ba3fcd..222ff3ce59 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,8 +16,8 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-headers-exports + INSTALL_EXPORT_SET raft-headers-exports EXCLUDE_FROM_ALL TRUE) endfunction() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 03fafd4577..3d2663bb5b 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -16,8 +16,8 @@ function(find_and_configure_mdspan VERSION) rapids_cpm_find( mdspan ${VERSION} GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-headers-exports + 
INSTALL_EXPORT_SET raft-headers-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/rapidsai/mdspan.git diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 8717eaad8c..84c6dc0d7f 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -19,8 +19,8 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm - BUILD_EXPORT_SET raft-runtime-exports - INSTALL_EXPORT_SET raft-runtime-exports + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports EXCLUDE_FROM_ALL TRUE ) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 3813d0ea02..2d96c38dd1 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -18,8 +18,8 @@ function(find_and_configure_thrust) rapids_cpm_thrust( NAMESPACE raft - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-headers-exports + INSTALL_EXPORT_SET raft-headers-exports ) endfunction() From 802b2f2480fc29a8a095063fb4df12f680def498 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 17:07:46 -0500 Subject: [PATCH 045/167] updting coipyright fir get_gtest --- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 8a48b9466c..876a8ef988 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 7ec1100951c0b288688eb49c123aeed6dc6a7776 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 18:06:41 -0500 Subject: [PATCH 046/167] Exorting rmm through raft-headers-exports --- cpp/cmake/thirdparty/get_rmm.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 84c6dc0d7f..758611b0f5 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -19,8 +19,8 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-headers-exports + INSTALL_EXPORT_SET raft-headers-exports EXCLUDE_FROM_ALL TRUE ) From e1edba48e434afc3981e34e3712bf7344895b722 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 18:07:37 -0500 Subject: [PATCH 047/167] Properlyu setting RAFT_LINK_LIBS --- cpp/CMakeLists.txt | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 211a4466d8..1323e53fe8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -124,6 +124,13 @@ endif() ############################################################################## # - raft --------------------------------------------------------------------- +set(RAFT_LINK_LIBS + CUDA::cublas + CUDA::curand + CUDA::cusolver + CUDA::cudart + CUDA::cusparse + rmm::rmm) add_library(raft_headers INTERFACE) add_library(raft::headers ALIAS raft_headers) @@ -162,14 +169,6 @@ endif() ############################################################################## # - raft_runtime ------------------------------------------------------------- -set(RAFT_LINK_LIBS - CUDA::cublas - CUDA::curand - CUDA::cusolver - CUDA::cudart - CUDA::cusparse - rmm::rmm) - add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) From 
12ddb51933f63bc40adfea70e527e024bb7a768d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 18:46:12 -0500 Subject: [PATCH 048/167] Still working out kinks w/ headers --- cpp/CMakeLists.txt | 6 ++++-- cpp/cmake/thirdparty/get_cuco.cmake | 2 +- python/pylibraft/setup.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1323e53fe8..4ca8240c56 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -97,8 +97,8 @@ endif() # * enable the CMake CUDA language # * set other CUDA compilation flags rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET raft-runtime-exports - INSTALL_EXPORT_SET raft-runtime-exports + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports ) include(cmake/modules/ConfigureCUDA.cmake) @@ -404,6 +404,8 @@ endif() if(TARGET raft_nn_lib) list(APPEND raft_components nn-lib) endif() + +list(APPEND raft_components headers) foreach(comp IN LISTS raft_components) install( EXPORT raft-${comp}-exports diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 4e74fa9cd6..05f1e3ff3a 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -19,7 +19,7 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-headers-exports - INSTALL_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-headers-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py index e2340b377c..290202403d 100644 --- a/python/pylibraft/setup.py +++ b/python/pylibraft/setup.py @@ -113,6 +113,7 @@ include_dirs=include_dirs, library_dirs=[get_python_lib()], runtime_library_dirs=[cuda_lib_dir, + get_python_lib(), os.path.join(os.sys.prefix, "lib")], libraries=libs, language='c++', From 5bbc377b16a31b1ce0b4851c243b05575ae5a9cb Mon Sep 17 
00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 18:48:57 -0500 Subject: [PATCH 049/167] Adding headers as another component --- cpp/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4ca8240c56..6443e4c473 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -375,7 +375,7 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export(INSTALL raft - COMPONENTS nn distance + COMPONENTS headers nn distance EXPORT_SET raft-exports GLOBAL_TARGETS headers nn distance NAMESPACE raft:: @@ -386,7 +386,7 @@ raft_export(INSTALL raft # - build export ------------------------------------------------------------- raft_export(BUILD raft EXPORT_SET raft-exports - COMPONENTS nn distance + COMPONENTS headers nn distance GLOBAL_TARGETS raft_headers raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: @@ -405,7 +405,6 @@ if(TARGET raft_nn_lib) list(APPEND raft_components nn-lib) endif() -list(APPEND raft_components headers) foreach(comp IN LISTS raft_components) install( EXPORT raft-${comp}-exports From 869bb36e6fae59401ed351e8c25b7f3ecc29f1a4 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 18:52:30 -0500 Subject: [PATCH 050/167] Always adding headers to components --- cpp/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6443e4c473..bd334f4ae4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -66,6 +66,8 @@ message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${NVTX}") message(VERBOSE "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +list(APPEND raft_FIND_COMPONENTS "headers") + # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") @@ -98,8 +100,7 @@ endif() # * set other CUDA compilation flags rapids_find_package(CUDAToolkit REQUIRED BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - ) + INSTALL_EXPORT_SET raft-exports) include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## From 869cfce049a965acf3d2ffe8db01699d26bedf4e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 19:10:38 -0500 Subject: [PATCH 051/167] Headers ad top-level --- cpp/CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bd334f4ae4..9b2c78a454 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -134,7 +134,10 @@ set(RAFT_LINK_LIBS rmm::rmm) add_library(raft_headers INTERFACE) -add_library(raft::headers ALIAS raft_headers) +if(TARGET raft_distance AND (NOT TARGET raft::distance)) + add_library(raft::headers ALIAS raft_headers) +endif() + #set_target_properties(raft PROPERTIES EXPORT_NAME raft) target_include_directories(raft_headers INTERFACE @@ -353,8 +356,10 @@ Imported Targets: set(code_string [=[ -if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) +if(headers IN_LIST raft_FIND_COMPONENTS) + if(NOT TARGET raft::Thrust) + thrust_create_target(raft::Thrust FROM_OPTIONS) + endif() endif() if(distance IN_LIST raft_FIND_COMPONENTS) From 067ea7c2b2a14ff669cd22f46b69179546fd5ef1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 19:12:59 -0500 Subject: [PATCH 052/167] Using proper name --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9b2c78a454..94745a16ea 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -134,7 +134,7 @@ set(RAFT_LINK_LIBS rmm::rmm) add_library(raft_headers INTERFACE) -if(TARGET raft_distance AND (NOT TARGET raft::distance)) +if(TARGET raft_headers AND (NOT TARGET raft::headers)) add_library(raft::headers ALIAS raft_headers) endif() From 20b4053399c1bddecabe99b41a7d34d117ff260c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 19:15:37 -0500 Subject: [PATCH 053/167] Exporting headers --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 94745a16ea..5e6fc82a17 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -138,7 +138,7 @@ if(TARGET raft_headers AND (NOT TARGET raft::headers)) add_library(raft::headers ALIAS raft_headers) endif() -#set_target_properties(raft PROPERTIES EXPORT_NAME raft) +set_target_properties(raft_headers PROPERTIES EXPORT_NAME headers) target_include_directories(raft_headers INTERFACE "$" From a301de4fffe174ffadcb8e53d563f177debbfb7b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 19:34:47 -0500 Subject: [PATCH 054/167] Sending rmm to raft-exports --- cpp/CMakeLists.txt | 10 +++++++++- cpp/cmake/thirdparty/get_rmm.cmake | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5e6fc82a17..a7762cef47 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -356,13 +356,21 @@ Imported Targets: set(code_string [=[ + +if((distance IN_LIST raft_FIND_COMPONENTS OR + nn IN_LIST raft_FIND_COMPONENTS) AND NOT + headers IN_LIST raft_FIND_COMPONENTS)) + FATAL_ERROR("headers must be included to use components ${raft_FIND_COMPONENTS}") +endif() + if(headers IN_LIST raft_FIND_COMPONENTS) if(NOT TARGET raft::Thrust) thrust_create_target(raft::Thrust FROM_OPTIONS) endif() endif() -if(distance IN_LIST raft_FIND_COMPONENTS) +if(distance IN_LIST raft_FIND_COMPONENTS OR + headers IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 758611b0f5..84c6dc0d7f 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -19,8 +19,8 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm - 
BUILD_EXPORT_SET raft-headers-exports - INSTALL_EXPORT_SET raft-headers-exports + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports EXCLUDE_FROM_ALL TRUE ) From 339b8b67afc11dae6d2d626f0acb4c904621ace8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 20:11:38 -0500 Subject: [PATCH 055/167] Adding headers to test-raft links --- cpp/cmake/thirdparty/get_thrust.cmake | 8 +++----- cpp/test/CMakeLists.txt | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 2d96c38dd1..6f21cb8528 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -16,11 +16,9 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_thrust( - NAMESPACE raft - BUILD_EXPORT_SET raft-headers-exports - INSTALL_EXPORT_SET raft-headers-exports - ) + rapids_cpm_thrust(NAMESPACE raft) + rapids_export_package(BUILD thrust raft-headers-exports) + rapids_export_package(INSTALL thrust raft-headers-exports) endfunction() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0d3121fee6..8007819ae0 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -163,6 +163,7 @@ target_include_directories(test_raft target_link_libraries(test_raft PRIVATE raft::raft + raft::headers raft::distance raft::nn NCCL::NCCL From b9784362faa5650f4400b10723b30409b6d119df Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 21:00:06 -0500 Subject: [PATCH 056/167] BUilding docs last --- ci/gpu/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b7ce351f39..ab7a98ac12 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -104,9 +104,6 @@ fi gpuci_logger "sccache stats" sccache --show-stats -gpuci_logger "Building docs" -"$WORKSPACE/build.sh" docs -v - gpuci_logger "Resetting LD_LIBRARY_PATH" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED @@ -135,3 +132,6 @@ python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s gpuci_logger "Python pytest for pylibraft" cd "$WORKSPACE/python/pylibraft" python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s + +gpuci_logger "Building docs" +"$WORKSPACE/build.sh" docs -v From 230bcff13b2fd7caec970d90db024e3502a89eb7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 10 Mar 2022 21:26:16 -0500 Subject: [PATCH 057/167] Docs to use proper raft path --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index eea300d3a8..6fd7e3d702 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.insert(0, os.path.abspath('sphinxext')) -sys.path.insert(0, os.path.abspath('../../python/pyraft')) +sys.path.insert(0, os.path.abspath('../../python/raft')) from github_link import make_linkcode_resolve # noqa From 36ee5dc5212db0eacfe2125f3e3cc5e8658ad014 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 10 Mar 2022 22:03:17 -0500 Subject: [PATCH 058/167] Adding test in pylibraft --- ci/gpu/build.sh | 4 ++-- .../pylibraft/pylibraft/test/test_distance.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 python/pylibraft/pylibraft/test/test_distance.py diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index ab7a98ac12..01ba454a40 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -127,11 +127,11 @@ GTEST_OUTPUT="xml:$WORKSPACE/test-results/raft_cpp/" ./test_raft gpuci_logger "Python pytest for pyraft" cd "$WORKSPACE/python/raft" -python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s +python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-pyraft.xml" -v -s gpuci_logger "Python pytest for pylibraft" cd "$WORKSPACE/python/pylibraft" -python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-raft.xml" -v -s +python -m pytest --cache-clear --junitxml="$WORKSPACE/junit-pylibraft.xml" -v -s gpuci_logger "Building docs" "$WORKSPACE/build.sh" docs -v diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py new file mode 100644 index 0000000000..f748f744d1 --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +def test_distance(): + assert True From 50adc87902aa1b5b724c9f4f370462590e05cad6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 06:22:35 -0500 Subject: [PATCH 059/167] Removing pylibraft from docs build for now --- build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 7f287e8e22..0d500025da 100755 --- a/build.sh +++ b/build.sh @@ -37,7 +37,9 @@ HELP="$0 [ ...] [ ...] --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft --nogtest - do not build google tests for libraft - --noinstall - do not install cmake targets + --noinstall - do not install cmake targets + --uninstall-cmake-deps - uninstall any cmake dependencies + --clean --nvtx - Enable nvtx for profiling support --show_depr_warn - show cmake deprecation warnings -h - print this text @@ -216,7 +218,7 @@ if (( ${NUMARGS} == 0 )) || hasArg pyraft || hasArg docs; then fi # Build and (optionally) install the pylibraft Python package -if (( ${NUMARGS} == 0 )) || hasArg pylibraft || hasArg docs; then +if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then cd ${REPODIR}/python/pylibraft if [[ ${INSTALL_TARGET} != "" ]]; then From 2a90fc8e3a867480e6aca001e988c6f3f0344652 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 11 Mar 2022 07:20:12 -0500 Subject: [PATCH 060/167] Updating cmkelists --- cpp/CMakeLists.txt | 286 +++++++++++++++++++++++---------------------- 1 file changed, 147 insertions(+), 139 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 27df57d150..17c8e05614 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(rapids-cmake) include(rapids-cpm) @@ -66,6 +66,8 @@ message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${NVTX}") message(VERBOSE "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +list(APPEND raft_FIND_COMPONENTS "headers") + # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") @@ -77,8 +79,8 @@ message(VERBOSE "RAFT: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") if(DETECT_CONDA_ENV) rapids_cmake_support_conda_env( conda_env MODIFY_PREFIX_PATH ) if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND DEFINED ENV{CONDA_PREFIX}) - message(STATUS "RAFT: No CMAKE_INSTALL_PREFIX argument detected, setting to: $ENV{CONDA_PREFIX}") - set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}") + message(STATUS "RAFT: No CMAKE_INSTALL_PREFIX argument detected, setting to: $ENV{CONDA_PREFIX}") + set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}") endif() endif() @@ -97,9 +99,8 @@ endif() # * enable the CMake CUDA language # * set other CUDA compilation flags rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET raft-runtime-exports - INSTALL_EXPORT_SET raft-runtime-exports - ) + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET 
raft-exports) include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## @@ -122,47 +123,40 @@ if(BUILD_TESTS) include(cmake/thirdparty/get_ucx.cmake) endif() -############################################################################## -# - raft_runtime ------------------------------------------------------------- - -add_library(raft_runtime INTERFACE) -add_library(raft::runtime ALIAS raft_runtime) -set_target_properties(raft_runtime PROPERTIES EXPORT_NAME runtime) - -target_link_libraries(raft_runtime INTERFACE - CUDA::cublas - CUDA::curand - CUDA::cusolver - CUDA::cudart - CUDA::cusparse - rmm::rmm) - -target_compile_definitions(raft_runtime INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft_runtime INTERFACE cxx_std_17 $) - ############################################################################## # - raft --------------------------------------------------------------------- +set(RAFT_LINK_LIBS + CUDA::cublas + CUDA::curand + CUDA::cusolver + CUDA::cudart + CUDA::cusparse + rmm::rmm) + +add_library(raft_headers INTERFACE) +if(TARGET raft_headers AND (NOT TARGET raft::headers)) + add_library(raft::headers ALIAS raft_headers) +endif() -add_library(raft INTERFACE) -add_library(raft::raft ALIAS raft) +set_target_properties(raft_headers PROPERTIES EXPORT_NAME headers) -target_include_directories(raft INTERFACE +target_include_directories(raft_headers INTERFACE "$" "$") -target_link_libraries(raft INTERFACE - raft::Thrust - raft::runtime - $<$:CUDA::nvToolsExt> - cuco::cuco - std::mdspan) +target_link_libraries(raft_headers INTERFACE + raft::Thrust + $<$:CUDA::nvToolsExt> + ${RAFT_LINK_LIBS} + cuco::cuco + std::mdspan) -target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft INTERFACE cxx_std_17 $) +target_compile_definitions(raft_headers INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft_headers INTERFACE cxx_std_17 $) if(RAFT_COMPILE_LIBRARIES OR 
RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" -[=[ + [=[ SECTIONS { .nvFatBinSegment : { *(.nvFatBinSegment) } @@ -175,6 +169,22 @@ set(RAFT_LIB_TYPE SHARED) if(${RAFT_STATIC_LINK_LIBRARIES}) set(RAFT_LIB_TYPE STATIC) endif() + +############################################################################## +# - raft_runtime ------------------------------------------------------------- + +add_library(raft INTERFACE) +add_library(raft::raft ALIAS raft) + +target_include_directories(raft INTERFACE + "$" + "$" + ) +target_link_libraries(raft INTERFACE ${RAFT_LINK_LIBS}) + +target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft INTERFACE cxx_std_17 $) + ############################################################################## # - raft_distance ------------------------------------------------------------ add_library(raft_distance INTERFACE) @@ -187,41 +197,41 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) add_library(raft_distance_lib ${RAFT_LIB_TYPE} - src/distance/specializations/detail - src/distance/specializations/detail/canberra.cu - src/distance/specializations/detail/chebyshev.cu - src/distance/specializations/detail/correlation.cu - src/distance/specializations/detail/cosine.cu - src/distance/specializations/detail/hamming_unexpanded.cu - src/distance/specializations/detail/hellinger_expanded.cu - src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu - src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu - src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu - src/distance/specializations/detail/kl_divergence_float_float_float_int.cu - src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu - src/distance/specializations/detail/kl_divergence_double_double_double_int.cu - 
src/distance/specializations/detail/l1_float_float_float_int.cu - src/distance/specializations/detail/l1_float_float_float_uint32.cu - src/distance/specializations/detail/l1_double_double_double_int.cu - src/distance/specializations/detail/l2_expanded_float_float_float_int.cu - src/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu - src/distance/specializations/detail/l2_expanded_double_double_double_int.cu - src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu - src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu - src/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu - src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu - src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu - src/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu - src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu - src/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu - src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu - src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu - src/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu - src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu - ) + src/distance/pairwise_distance.cu + src/distance/specializations/detail/canberra.cu + src/distance/specializations/detail/chebyshev.cu + src/distance/specializations/detail/correlation.cu + src/distance/specializations/detail/cosine.cu + src/distance/specializations/detail/hamming_unexpanded.cu + src/distance/specializations/detail/hellinger_expanded.cu + src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu + src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu + 
src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu + src/distance/specializations/detail/kl_divergence_float_float_float_int.cu + src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu + src/distance/specializations/detail/kl_divergence_double_double_double_int.cu + src/distance/specializations/detail/l1_float_float_float_int.cu + src/distance/specializations/detail/l1_float_float_float_uint32.cu + src/distance/specializations/detail/l1_double_double_double_int.cu + src/distance/specializations/detail/l2_expanded_float_float_float_int.cu + src/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu + src/distance/specializations/detail/l2_expanded_double_double_double_int.cu + src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu + src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu + src/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu + src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu + src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu + src/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu + src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu + src/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu + src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu + src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu + src/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu + src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu + ) set_target_properties(raft_distance_lib PROPERTIES OUTPUT_NAME raft_distance) - target_link_libraries(raft_distance_lib PRIVATE raft::raft) + target_link_libraries(raft_distance_lib PRIVATE raft::headers) target_compile_options(raft_distance_lib PRIVATE 
"$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -234,10 +244,11 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() -target_link_libraries(raft_distance INTERFACE raft::runtime - $ - $ -) +target_link_libraries(raft_distance INTERFACE + raft::raft + $ + $ + ) ############################################################################## # - raft_nn ------------------------------------------------------------------ @@ -251,20 +262,20 @@ set_target_properties(raft_nn PROPERTIES EXPORT_NAME nn) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) add_library(raft_nn_lib ${RAFT_LIB_TYPE} - src/nn/specializations/ball_cover.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu - src/nn/specializations/fused_l2_knn_long_float_true.cu - src/nn/specializations/fused_l2_knn_long_float_false.cu - src/nn/specializations/fused_l2_knn_int_float_true.cu - src/nn/specializations/fused_l2_knn_int_float_false.cu - src/nn/specializations/knn.cu - ) + src/nn/specializations/ball_cover.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu + src/nn/specializations/fused_l2_knn_long_float_true.cu + src/nn/specializations/fused_l2_knn_long_float_false.cu + src/nn/specializations/fused_l2_knn_int_float_true.cu + src/nn/specializations/fused_l2_knn_int_float_false.cu + src/nn/specializations/knn.cu + ) set_target_properties(raft_nn_lib PROPERTIES OUTPUT_NAME raft_nn) - target_link_libraries(raft_nn_lib PRIVATE raft::raft faiss::faiss) + target_link_libraries(raft_nn_lib PRIVATE raft::headers faiss::faiss) target_compile_options(raft_nn_lib PRIVATE 
"$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -277,9 +288,9 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) endif() -target_link_libraries(raft_nn INTERFACE raft::runtime faiss::faiss - $ - $) +target_link_libraries(raft_nn INTERFACE raft::raft faiss::faiss + $ + $) ############################################################################## # - install targets----------------------------------------------------------- @@ -287,12 +298,14 @@ rapids_cmake_install_lib_dir( lib_dir ) include(GNUInstallDirs) include(CPack) -install(TARGETS raft_runtime - DESTINATION ${lib_dir} - EXPORT raft-runtime-exports) install(TARGETS raft - DESTINATION ${PROJECT_BINARY_DIR} + DESTINATION ${lib_dir} EXPORT raft-exports) + +install(TARGETS raft_headers + DESTINATION ${lib_dir} + EXPORT raft-headers-exports) + install(TARGETS raft_distance DESTINATION ${lib_dir} EXPORT raft-distance-exports) @@ -312,6 +325,7 @@ if(TARGET raft_nn_lib) EXPORT raft-nn-lib-exports) endif() + install(DIRECTORY include/raft_runtime DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) @@ -322,7 +336,7 @@ install(FILES include/raft_runtime/raft.hpp ############################################################################## # - install export ----------------------------------------------------------- set(doc_string -[=[ + [=[ Provide targets for the RAFT: RAPIDS Analytics Framework Toolkit. 
RAFT (Reusable Analytics Functions and other Tools) contains fundamental @@ -333,20 +347,30 @@ Optional Components: - distance Imported Targets: + - raft::headers - raft::raft - - raft::runtime - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component ]=]) set(code_string -[=[ -if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS EXCLUDE_FROM_ALL TRUE) + [=[ + +if((distance IN_LIST raft_FIND_COMPONENTS OR + nn IN_LIST raft_FIND_COMPONENTS) AND NOT + headers IN_LIST raft_FIND_COMPONENTS)) + FATAL_ERROR("headers must be included to use components ${raft_FIND_COMPONENTS}") +endif() + +if(headers IN_LIST raft_FIND_COMPONENTS) + if(NOT TARGET raft::Thrust) + thrust_create_target(raft::Thrust FROM_OPTIONS) + endif() endif() -if(distance IN_LIST raft_FIND_COMPONENTS) +if(distance IN_LIST raft_FIND_COMPONENTS OR + headers IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -360,44 +384,27 @@ if(nn IN_LIST raft_FIND_COMPONENTS) endif() endif() ]=] -) + ) # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) -raft_export(INSTALL runtime - COMPONENTS nn distance - EXPORT_SET raft-runtime-exports - GLOBAL_TARGETS runtime nn distance - NAMESPACE raft:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string - INSTALL_FILES ON) - raft_export(INSTALL raft - EXPORT_SET raft-exports - GLOBAL_TARGETS raft runtime raft_distance raft_nn - COMPONENTS nn distance - NAMESPACE raft:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string) + COMPONENTS headers nn distance + EXPORT_SET raft-exports + GLOBAL_TARGETS headers nn distance + NAMESPACE raft:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string) ############################################################################## # - build export ------------------------------------------------------------- -raft_export(BUILD runtime - EXPORT_SET raft-runtime-exports - COMPONENTS 
nn distance - GLOBAL_TARGETS runtime raft_distance raft_nn - DOCUMENTATION doc_string - NAMESPACE raft:: - FINAL_CODE_BLOCK code_string) - raft_export(BUILD raft - EXPORT_SET raft-exports - GLOBAL_TARGETS raft runtime raft_distance raft_nn - COMPONENTS nn distance - DOCUMENTATION doc_string - NAMESPACE raft:: - FINAL_CODE_BLOCK code_string) + EXPORT_SET raft-exports + COMPONENTS headers nn distance + GLOBAL_TARGETS raft_headers raft_distance raft_nn + DOCUMENTATION doc_string + NAMESPACE raft:: + FINAL_CODE_BLOCK code_string) ############################################################################## # - export/install optional components -------------------------------------- @@ -411,23 +418,24 @@ endif() if(TARGET raft_nn_lib) list(APPEND raft_components nn-lib) endif() + foreach(comp IN LISTS raft_components) install( - EXPORT raft-${comp}-exports - FILE raft-${comp}-targets.cmake - NAMESPACE raft:: - DESTINATION "${lib_dir}/cmake/raft" + EXPORT raft-${comp}-exports + FILE raft-${comp}-targets.cmake + NAMESPACE raft:: + DESTINATION "${lib_dir}/cmake/raft" ) export( - EXPORT raft-${comp}-exports - FILE ${RAFT_BINARY_DIR}/raft-${comp}-targets.cmake - NAMESPACE raft:: + EXPORT raft-${comp}-exports + FILE ${RAFT_BINARY_DIR}/raft-${comp}-targets.cmake + NAMESPACE raft:: ) rapids_export_write_dependencies( - BUILD raft-${comp}-exports "${PROJECT_BINARY_DIR}/raft-${comp}-dependencies.cmake" + BUILD raft-${comp}-exports "${PROJECT_BINARY_DIR}/raft-${comp}-dependencies.cmake" ) rapids_export_write_dependencies( - INSTALL raft-${comp}-exports "${PROJECT_BINARY_DIR}/rapids-cmake/raft/export/raft-${comp}-dependencies.cmake" + INSTALL raft-${comp}-exports "${PROJECT_BINARY_DIR}/rapids-cmake/raft/export/raft-${comp}-dependencies.cmake" ) endforeach() @@ -444,5 +452,5 @@ endif() include(cmake/doxygen.cmake) add_doxygen_target(IN_DOXYFILE doxygen/Doxyfile.in - OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile - CWD ${CMAKE_CURRENT_BINARY_DIR}) + OUT_DOXYFILE 
${CMAKE_CURRENT_BINARY_DIR}/Doxyfile + CWD ${CMAKE_CURRENT_BINARY_DIR}) From 882a3f0a1e85f93f56b8c3a4ce03e5fe8dd7f871 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 07:23:20 -0500 Subject: [PATCH 061/167] updating cmake --- cpp/cmake/thirdparty/get_cuco.cmake | 4 +- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 +- cpp/cmake/thirdparty/get_mdspan.cmake | 4 +- cpp/cmake/thirdparty/get_rmm.cmake | 4 +- cpp/cmake/thirdparty/get_thrust.cmake | 9 +- cpp/test/CMakeLists.txt | 302 +++++++++++----------- 7 files changed, 164 insertions(+), 165 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index da733d0ef1..05f1e3ff3a 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -18,8 +18,8 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-headers-exports + INSTALL_EXPORT_SET raft-headers-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 72fb0e18c6..8a48b9466c 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -17,7 +17,7 @@ function(find_and_configure_gtest ) include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest(BUILD_EXPORT_SET raft-exports + rapids_cpm_gtest(BUILD_EXPORT_SET raft-headers-exports EXCLUDE_FROM_ALL TRUE) if(GTest_ADDED) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 4333ba3fcd..222ff3ce59 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,8 +16,8 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - 
rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-headers-exports + INSTALL_EXPORT_SET raft-headers-exports EXCLUDE_FROM_ALL TRUE) endfunction() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 03fafd4577..3d2663bb5b 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -16,8 +16,8 @@ function(find_and_configure_mdspan VERSION) rapids_cpm_find( mdspan ${VERSION} GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-headers-exports + INSTALL_EXPORT_SET raft-headers-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/rapidsai/mdspan.git diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 8717eaad8c..84c6dc0d7f 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -19,8 +19,8 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm - BUILD_EXPORT_SET raft-runtime-exports - INSTALL_EXPORT_SET raft-runtime-exports + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports EXCLUDE_FROM_ALL TRUE ) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 3813d0ea02..648ac0b5d5 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -16,11 +16,10 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_thrust( - NAMESPACE raft - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - ) + rapids_cpm_thrust( NAMESPACE raft ) + rapids_export_package(BUILD thrust raft-headers-exports) + rapids_export_package(INSTALL thrust raft-headers-exports) + endfunction() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 
0d3121fee6..ecc635da19 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -16,167 +16,167 @@ # keep the files in alphabetical order! add_executable(test_raft - test/common/seive.cu - test/cudart_utils.cpp - test/cluster_solvers.cu - test/distance/dist_adj.cu - test/distance/dist_canberra.cu - test/distance/dist_chebyshev.cu - test/distance/dist_correlation.cu - test/distance/dist_cos.cu - test/distance/dist_euc_exp.cu - test/distance/dist_euc_unexp.cu - test/distance/dist_hamming.cu - test/distance/dist_hellinger.cu - test/distance/dist_jensen_shannon.cu - test/distance/dist_kl_divergence.cu - test/distance/dist_l1.cu - test/distance/dist_minkowski.cu - test/distance/dist_russell_rao.cu - test/distance/fused_l2_nn.cu - test/eigen_solvers.cu - test/handle.cpp - test/integer_utils.cpp - test/interruptible.cu - test/nvtx.cpp - test/pow2_utils.cu - test/label/label.cu - test/label/merge_labels.cu - test/lap/lap.cu - test/linalg/add.cu - test/linalg/binary_op.cu - test/linalg/cholesky_r1.cu - test/linalg/coalesced_reduction.cu - test/linalg/divide.cu - test/linalg/eig.cu - test/linalg/eig_sel.cu - test/linalg/gemm_layout.cu - test/linalg/gemv.cu - test/linalg/map.cu - test/linalg/map_then_reduce.cu - test/linalg/matrix_vector_op.cu - test/linalg/multiply.cu - test/linalg/norm.cu - test/linalg/power.cu - test/linalg/reduce.cu - test/linalg/reduce_cols_by_key.cu - test/linalg/reduce_rows_by_key.cu - test/linalg/rsvd.cu - test/linalg/sqrt.cu - test/linalg/strided_reduction.cu - test/linalg/subtract.cu - test/linalg/svd.cu - test/linalg/ternary_op.cu - test/linalg/transpose.cu - test/linalg/unary_op.cu - test/matrix/math.cu - test/matrix/matrix.cu - test/matrix/columnSort.cu - test/matrix/linewise_op.cu - test/mdarray.cu - test/mr/host/buffer.cpp - test/mr/device/buffer.cpp - test/mst.cu - test/random/make_blobs.cu - test/random/make_regression.cu - test/random/multi_variable_gaussian.cu - test/random/permute.cu - test/random/rng.cu - 
test/random/rng_int.cu - test/random/sample_without_replacement.cu - test/span.cpp - test/span.cu - test/sparse/add.cu - test/sparse/convert_coo.cu - test/sparse/convert_csr.cu - test/sparse/connect_components.cu - test/sparse/csr_row_slice.cu - test/sparse/csr_to_dense.cu - test/sparse/csr_transpose.cu - test/sparse/degree.cu - test/sparse/dist_coo_spmv.cu - test/sparse/distance.cu - test/sparse/filter.cu - test/sparse/knn.cu - test/sparse/knn_graph.cu - test/sparse/linkage.cu - test/sparse/norm.cu - test/sparse/reduce.cu - test/sparse/row_op.cu - test/sparse/sort.cu - test/sparse/symmetrize.cu - test/spatial/knn.cu - test/spatial/fused_l2_knn.cu - test/spatial/haversine.cu - test/spatial/ball_cover.cu - test/spatial/epsilon_neighborhood.cu - test/spatial/faiss_mr.cu - test/spatial/selection.cu - test/spectral_matrix.cu - test/stats/adjusted_rand_index.cu - test/stats/completeness_score.cu - test/stats/contingencyMatrix.cu - test/stats/cov.cu - test/stats/dispersion.cu - test/stats/entropy.cu - test/stats/histogram.cu - test/stats/homogeneity_score.cu - test/stats/information_criterion.cu - test/stats/kl_divergence.cu - test/stats/mean.cu - test/stats/meanvar.cu - test/stats/mean_center.cu - test/stats/minmax.cu - test/stats/mutual_info_score.cu - test/stats/rand_index.cu - test/stats/silhouette_score.cu - test/stats/stddev.cu - test/stats/sum.cu - test/stats/trustworthiness.cu - test/stats/weighted_mean.cu - test/stats/v_measure.cu - test/test.cpp -) + test/common/seive.cu + test/cudart_utils.cpp + test/cluster_solvers.cu + test/distance/dist_adj.cu + test/distance/dist_canberra.cu + test/distance/dist_chebyshev.cu + test/distance/dist_correlation.cu + test/distance/dist_cos.cu + test/distance/dist_euc_exp.cu + test/distance/dist_euc_unexp.cu + test/distance/dist_hamming.cu + test/distance/dist_hellinger.cu + test/distance/dist_jensen_shannon.cu + test/distance/dist_kl_divergence.cu + test/distance/dist_l1.cu + test/distance/dist_minkowski.cu + 
test/distance/dist_russell_rao.cu + test/distance/fused_l2_nn.cu + test/eigen_solvers.cu + test/handle.cpp + test/integer_utils.cpp + test/interruptible.cu + test/nvtx.cpp + test/pow2_utils.cu + test/label/label.cu + test/label/merge_labels.cu + test/lap/lap.cu + test/linalg/add.cu + test/linalg/binary_op.cu + test/linalg/cholesky_r1.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/gemv.cu + test/linalg/map.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector_op.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/power.cu + test/linalg/reduce.cu + test/linalg/reduce_cols_by_key.cu + test/linalg/reduce_rows_by_key.cu + test/linalg/rsvd.cu + test/linalg/sqrt.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/ternary_op.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + test/matrix/math.cu + test/matrix/matrix.cu + test/matrix/columnSort.cu + test/matrix/linewise_op.cu + test/mdarray.cu + test/mr/host/buffer.cpp + test/mr/device/buffer.cpp + test/mst.cu + test/random/make_blobs.cu + test/random/make_regression.cu + test/random/multi_variable_gaussian.cu + test/random/permute.cu + test/random/rng.cu + test/random/rng_int.cu + test/random/sample_without_replacement.cu + test/span.cpp + test/span.cu + test/sparse/add.cu + test/sparse/convert_coo.cu + test/sparse/convert_csr.cu + test/sparse/connect_components.cu + test/sparse/csr_row_slice.cu + test/sparse/csr_to_dense.cu + test/sparse/csr_transpose.cu + test/sparse/degree.cu + test/sparse/dist_coo_spmv.cu + test/sparse/distance.cu + test/sparse/filter.cu + test/sparse/knn.cu + test/sparse/knn_graph.cu + test/sparse/linkage.cu + test/sparse/norm.cu + test/sparse/reduce.cu + test/sparse/row_op.cu + test/sparse/sort.cu + test/sparse/symmetrize.cu + test/spatial/knn.cu + test/spatial/fused_l2_knn.cu + test/spatial/haversine.cu + 
test/spatial/ball_cover.cu + test/spatial/epsilon_neighborhood.cu + test/spatial/faiss_mr.cu + test/spatial/selection.cu + test/spectral_matrix.cu + test/stats/adjusted_rand_index.cu + test/stats/completeness_score.cu + test/stats/contingencyMatrix.cu + test/stats/cov.cu + test/stats/dispersion.cu + test/stats/entropy.cu + test/stats/histogram.cu + test/stats/homogeneity_score.cu + test/stats/information_criterion.cu + test/stats/kl_divergence.cu + test/stats/mean.cu + test/stats/meanvar.cu + test/stats/mean_center.cu + test/stats/minmax.cu + test/stats/mutual_info_score.cu + test/stats/rand_index.cu + test/stats/silhouette_score.cu + test/stats/stddev.cu + test/stats/sum.cu + test/stats/trustworthiness.cu + test/stats/weighted_mean.cu + test/stats/v_measure.cu + test/test.cpp + ) set_target_properties(test_raft -PROPERTIES BUILD_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - INSTALL_RPATH "\$ORIGIN/../../../lib" -) + PROPERTIES BUILD_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + INSTALL_RPATH "\$ORIGIN/../../../lib" + ) target_compile_options(test_raft PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" -) + "$<$:${RAFT_CUDA_FLAGS}>" + ) target_include_directories(test_raft - PUBLIC "$" -) + PUBLIC "$" + ) target_link_libraries(test_raft -PRIVATE - raft::raft - raft::distance - raft::nn - NCCL::NCCL - faiss::faiss - GTest::gtest - GTest::gtest_main - Threads::Threads - $ - $ -) + PRIVATE + raft::raft + raft::headers + raft::distance + raft::nn + NCCL::NCCL + faiss::faiss + GTest::gtest + GTest::gtest_main + Threads::Threads + $ + $) install( - TARGETS test_raft - COMPONENT testing - DESTINATION bin/libraft/gtests - 
EXCLUDE_FROM_ALL + TARGETS test_raft + COMPONENT testing + DESTINATION bin/libraft/gtests + EXCLUDE_FROM_ALL ) From 7f10eabaf2d21828846f48592009e72dede31ff8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 07:25:00 -0500 Subject: [PATCH 062/167] Removign missing file --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 17c8e05614..998dc69c22 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -197,7 +197,6 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) add_library(raft_distance_lib ${RAFT_LIB_TYPE} - src/distance/pairwise_distance.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu From 140d13ee59381a27ca6d678b685b91d6b6db0c00 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 07:28:57 -0500 Subject: [PATCH 063/167] Updating copyright --- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 8a48b9466c..b52518d415 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 1bf61fac31fb633a049971e0f022647b2a1b865a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 09:30:08 -0500 Subject: [PATCH 064/167] Updating README to describe the multiple components. 
--- README.md | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3b1d5a8d4c..b54cfc649a 100755 --- a/README.md +++ b/README.md @@ -103,11 +103,14 @@ After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used i RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). -After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids-cmake in your project, you can begin using RAFT by placing the code snippet below in a file named `get_raft.cmake` and including it in your cmake build with `include(get_raft.cmake)`. This will create the `raft::raft` target to add to configure the link libraries for your artifacts. +After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids-cmake in your project, you can begin using RAFT by placing the code snippet below in a file named `get_raft.cmake` and including it in your cmake build with `include(get_raft.cmake)`. This will make available several targets to add to configure the link libraries for your artifacts. 
```cmake set(RAFT_VERSION "22.04") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") +set(RAFT_COMPONENTS "headers") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC @@ -121,12 +124,13 @@ function(find_and_configure_raft) rapids_cpm_find(raft ${PKG_VERSION} GLOBAL_TARGETS raft::raft - BUILD_EXPORT_SET proj-exports - INSTALL_EXPORT_SET proj-exports + BUILD_EXPORT_SET projname-exports + INSTALL_EXPORT_SET projname-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git GIT_TAG ${PKG_PINNED_TAG} SOURCE_SUBDIR cpp + FIND_PACKAGE_ARGUMENTS "COMPONENTS ${RAFT_COMPONENTS}" OPTIONS "BUILD_TESTS OFF" "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" @@ -140,26 +144,41 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${RAFT_VERSION}.00 - FORK rapidsai - PINNED_TAG branch-${RAFT_VERSION} - + FORK ${RAFT_FORK} + PINNED_TAG ${RAFT_PINNED_TAG} COMPILE_LIBRARIES NO ENABLE_NN_DEPENDENCIES NO USE_FAISS_STATIC NO ) ``` +Several cmake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. + +| Component | Target | Description | Dependencies | +| --- | --- | --- | --- | +| n/a | `raft::raft` | Only RAFT runtime headers | Cudatoolkit libraries, RMM | +| headers | `raft::headers` | ALL RAFT headers | std::mdspan, cuCollections, Thrust, NVTools | +| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::headers | +| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::headers, FAISS | + ### Source The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: -1. 
Create an environment with the needed dependencies: `conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml` -2. Run the build script from the repository root: `./build.sh pyraft libraft --compile-libs` +1. Create an environment with the needed dependencies: + ``` + conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml + conda activate raft_dev + ``` +2. Run the build script from the repository root: + ``` + ./build.sh pyraft libraft tests bench --compile-libs + ``` The [Build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) guide. ## Folder Structure and Contents -The folder structure mirrors other RAPIDS repos (cuDF, cuML, cuGraph...), with the following folders: +The folder structure mirrors other RAPIDS repos, with the following folders: - `ci`: Scripts for running CI in PRs - `conda`: Conda recipes and development conda environments From c272d46bc0a93230d1a87a629f390813b16b3dde Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 09:30:55 -0500 Subject: [PATCH 065/167] A coupel more update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b54cfc649a..8bb8c6dc10 100755 --- a/README.md +++ b/README.md @@ -156,7 +156,7 @@ Several cmake targets can be made available by adding components in the table be | Component | Target | Description | Dependencies | | --- | --- | --- | --- | -| n/a | `raft::raft` | Only RAFT runtime headers | Cudatoolkit libraries, RMM | +| n/a | `raft::raft` | Only RAFT runtime headers. 
Safe to expose in public APIs | Cudatoolkit libraries, RMM | | headers | `raft::headers` | ALL RAFT headers | std::mdspan, cuCollections, Thrust, NVTools | | distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::headers | | nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::headers, FAISS | From e67d0149f8fd23490f837d0091d514b1ac453239 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 09:57:20 -0500 Subject: [PATCH 066/167] more updates to build docs. --- BUILD.md | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/BUILD.md b/BUILD.md index 2d0eb89736..b0d672b3e3 100644 --- a/BUILD.md +++ b/BUILD.md @@ -22,7 +22,7 @@ C++ RAFT is a header-only library but provides the option of building shared libraries with template instantiations for common types to speed up compile times for larger projects. -The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python code and provides options for building and installing the headers, Googletests, and individual shared libraries. +The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python code and provides options for building and installing the headers, Googletests, benchmarks, and individual shared libraries. 
### Header-only C++ @@ -43,7 +43,7 @@ Build all the shared libraries by passing `--compile-libs` flag to `build.sh`: ./build.sh libraft --compile-libs ``` -To remain flexible, the individual shared libraries have their own flags and multiple can be used (though currently only the `nn` and `distance` packages contain shared libraries): +Individual shared libraries have their own flags and multiple can be used (though currently only the `nn` and `distance` packages contain shared libraries): ```bash ./build.sh libraft --compile-nn --compile-dist ``` @@ -52,7 +52,7 @@ To remain flexible, the individual shared libraries have their own flags and mul Compile the Googletests using the `tests` target in `build.sh`: ```bash -./build.sh libraft tests --compile-nn --compile-dist +./build.sh libraft tests --compile-libs ``` To run C++ tests: @@ -65,10 +65,10 @@ To run C++ tests: Compile the benchmarks using the `bench` target in `build.sh`: ```bash -./build.sh libraft bench --compile-nn --compile-dist +./build.sh libraft bench ``` -To run C++ tests: +To run the benchmarks: ```bash ./cpp/build/bench_raft @@ -76,7 +76,7 @@ To run C++ tests: ### C++ Using Cmake -To install RAFT into a specific location, use `CMAKE_INSTALL_PREFIX`. The snippet below will install it into the current conda environment. +To install RAFT into a specific location, use `CMAKE_INSTALL_PREFIX`. The snippet below will install it into the current conda environment: ```bash cd cpp mkdir build @@ -84,8 +84,6 @@ cd build cmake -D BUILD_TESTS=ON -DRAFT_COMPILE_LIBRARIES=ON -DRAFT_ENABLE_NN_DEPENDENCIES=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX ../ make install ``` - - RAFT's cmake has the following configurable flags available:. 
| Flag | Possible Values | Default Value | Behavior | @@ -112,6 +110,7 @@ Conda environment scripts are provided for installing the necessary dependencies ```bash conda env create --name raft_env -f conda/environments/raft_dev_cuda11.5.yml +conda activate raft_env ``` The Python API can be built using the `build.sh` script: @@ -130,22 +129,28 @@ python setup.py install To run the Python tests: ```bash cd python -python -m pytest raft +py.test -s -v raft ``` ## Using RAFT in downstream projects ### C++ header-only integration using cmake -Use RAFT in cmake projects with `find_package(raft)` for header-only operation and the `raft::raft` target will be available for configuring linking and `RAFT_INCLUDE_DIR` will be available for includes. Note that if any packages are used which require downstream dependencies, such as the `libraft-nn` package requiring FAISS, these dependencies will have be installed and configured in cmake independently. +The RAFT headers are broken down into two different include paths so that build-time headers can be isolated between projects while runtime headers can be installed globally, exposed to users, and shared across projects. +- `cpp/include/raft_runtime` contains runtime headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. +- `cpp/include/raft` contains build-time headers that are the core of the RAFT library, containing primitives, algorithms, and other tools. + +Use `find_package(raft)` and the `raft::raft` if using RAFT to interact only with the public APIs of consuming projects. + +Use `find_package(raft COMPONENTS headers` and both the `raft::raft` and `raft::headers` targets when building a library that uses headers in `include/raft`. ### Using pre-compiled shared libraries -Use `find_package(raft COMPONENTS nn, distance)` to enable the shared libraries and pass dependencies through separate targets for each component. 
In this example, `raft::distance` and `raft::nn` targets will be available for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS in the case of the `nn` package). +Use `find_package(raft COMPONENTS headers nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::headers` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). -The pre-compiled libraries contain template specializations for commonly used types and require the additional include of header files with `extern template` definitions that tell the compiler not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. +The pre-compiled libraries contain template specializations for commonly used types. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. 
-The following example shows how to use the `libraft-distance` API with the pre-compiled specializations: +The following example ignores the pre-compiled templates for the `libraft-distance` API so the symbols from pre-compiled shared library will be used: ```c++ #include #include @@ -153,13 +158,15 @@ The following example shows how to use the `libraft-distance` API with the pre-c ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library, so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target and `RAFT_INCLUDE_DIR` for includes. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_runtime` headers and `raft::headers` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. 
The following `cmake` snippet enables a flexible configuration of RAFT: ```cmake set(RAFT_VERSION "22.04") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC @@ -181,7 +188,7 @@ function(find_and_configure_raft) # Add components #----------------------------------------------------- - string(APPEND RAFT_COMPONENTS "") + string(APPEND RAFT_COMPONENTS "headers") if(PKG_USE_NN_LIBRARY) string(APPEND RAFT_COMPONENTS " nn") endif() @@ -196,8 +203,8 @@ function(find_and_configure_raft) rapids_cpm_find(raft ${PKG_VERSION} GLOBAL_TARGETS raft::raft - BUILD_EXPORT_SET proj-exports - INSTALL_EXPORT_SET proj-exports + BUILD_EXPORT_SET projname-exports + INSTALL_EXPORT_SET projname-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git GIT_TAG ${PKG_PINNED_TAG} @@ -216,8 +223,8 @@ endfunction() # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft find_and_configure_raft(VERSION ${RAFT_VERSION}.00 - FORK rapidsai - PINNED_TAG branch-${RAFT_VERSION} + FORK ${RAFT_FORK} + PINNED_TAG ${RAFT_PINNED_TAG} # When PINNED_TAG above doesn't match cuml, # force local raft clone in build directory From 731bf6fe954ac51f3a5a592e551e7d9e6a06d655 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 11 Mar 2022 14:37:49 -0500 Subject: [PATCH 067/167] Fixing bad merge --- ci/gpu/build.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 73d7219a9d..be24de7393 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -96,14 +96,9 @@ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH gpuci_logger "Build C++ and Python targets" # These should link against the existing shared libs if hasArg --skip-tests; then -<<<<<<< HEAD - "$WORKSPACE/build.sh" pyraft pylibraft libraft -v --nogtest + "$WORKSPACE/build.sh" pyraft pylibraft libraft -v else -======= ->>>>>>> fea-2204-raft_runtime - "$WORKSPACE/build.sh" pyraft libraft -v -else - "$WORKSPACE/build.sh" pyraft libraft tests bench -v + "$WORKSPACE/build.sh" pyraft pylibraft libraft tests bench -v fi gpuci_logger "sccache stats" From 55c3896c4b73386cc26afd085d6d4981cb47a99e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 19:14:13 -0500 Subject: [PATCH 068/167] Troubleshooting lib linking issue --- ci/cpu/build.sh | 10 +++++----- ci/gpu/build.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index aa6f463b1b..871a9f13eb 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -94,17 +94,17 @@ if [ "$BUILD_LIBRAFT" == '1' ]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_runtime gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" mkdir -p ${CONDA_BLD_DIR}/libraft_runtime/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_runtime/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_runtime gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" mkdir -p ${CONDA_BLD_DIR}/libraft_nn/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_nn/work + mv ${CONDA_BLD_DIR}/work 
${CONDA_BLD_DIR}/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_distance gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" mkdir -p ${CONDA_BLD_DIR}/libraft_distance/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance fi else gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-runtime" @@ -118,11 +118,11 @@ if [ "$BUILD_RAFT" == "1" ]; then else gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON mkdir -p ${CONDA_BLD_DIR}/pyraft - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON mkdir -p ${CONDA_BLD_DIR}/pylibraft - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pylibraft/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pylibraft fi else diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index be24de7393..348a27f664 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -98,7 +98,7 @@ gpuci_logger "Build C++ and Python targets" if hasArg --skip-tests; then "$WORKSPACE/build.sh" pyraft pylibraft libraft -v else - "$WORKSPACE/build.sh" pyraft pylibraft libraft tests bench -v + "$WORKSPACE/build.sh" pyraft pylibraft libraft tests bench -v fi gpuci_logger "sccache stats" From 6d091145fe417deae975f1aea7cd9c1e8642588f Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 11 Mar 2022 19:47:52 -0500 Subject: [PATCH 069/167] Printing out conda env --- ci/gpu/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 348a27f664..fe542f8b55 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -90,6 +90,8 @@ conda list --show-channel-urls gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" +gpuci_logger `ls ${CONDA_PREFIX}/lib` + export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH From 2abd5602914c1c49b4d52c6ab77726087e083b2c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 11 Mar 2022 20:59:16 -0500 Subject: [PATCH 070/167] Adding raft build dir to ld path --- ci/gpu/build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index fe542f8b55..bb8bc22968 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -95,6 +95,9 @@ gpuci_logger `ls ${CONDA_PREFIX}/lib` export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH +export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft/cpu/conda_work/cpp/build" +export LD_LIBRARY_PATH="$RAFT_BUILD_DIR:$LD_LIBRARY_PATH" + gpuci_logger "Build C++ and Python targets" # These should link against the existing shared libs if hasArg --skip-tests; then From 7deacf544f026cb2c25eea437cbb75b6a87c6266 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 12 Mar 2022 18:51:21 -0500 Subject: [PATCH 071/167] Still debugging why libraft* is not being added to ld path in gpu build --- ci/cpu/build.sh | 8 ++++---- ci/gpu/build.sh | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 871a9f13eb..178f1c50cf 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -99,12 +99,12 @@ if [ "$BUILD_LIBRAFT" == '1' ]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" mkdir -p ${CONDA_BLD_DIR}/libraft_nn/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_nn + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_nn/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_distance gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" mkdir -p ${CONDA_BLD_DIR}/libraft_distance/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance/work fi else gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-runtime" @@ -118,11 +118,11 @@ if [ "$BUILD_RAFT" == "1" ]; then else gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON mkdir -p ${CONDA_BLD_DIR}/pyraft - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON mkdir -p ${CONDA_BLD_DIR}/pylibraft - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pylibraft + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pylibraft/work fi else diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index bb8bc22968..4c69ccddbb 100644 --- a/ci/gpu/build.sh 
+++ b/ci/gpu/build.sh @@ -90,7 +90,6 @@ conda list --show-channel-urls gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" -gpuci_logger `ls ${CONDA_PREFIX}/lib` export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH @@ -98,6 +97,8 @@ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft/cpu/conda_work/cpp/build" export LD_LIBRARY_PATH="$RAFT_BUILD_DIR:$LD_LIBRARY_PATH" +gpuci_logger `ls ${RAFT_BUILD_DIR}` + gpuci_logger "Build C++ and Python targets" # These should link against the existing shared libs if hasArg --skip-tests; then @@ -111,8 +112,8 @@ sccache --show-stats gpuci_logger "Resetting LD_LIBRARY_PATH" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED -export LD_LIBRARY_PATH_CACHED="" +#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED +#export LD_LIBRARY_PATH_CACHED="" ################################################################################ # TEST - Run GoogleTest and py.tests for RAFT From 439d82d3f759c5b87f027987d3f5d99e003b9a40 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 12 Mar 2022 19:36:06 -0500 Subject: [PATCH 072/167] Still trying to figure out where raft cpp artifacts are installed inproject flash --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 4c69ccddbb..0ec7782399 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -94,7 +94,7 @@ gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft/cpu/conda_work/cpp/build" +export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft/cpu/" export LD_LIBRARY_PATH="$RAFT_BUILD_DIR:$LD_LIBRARY_PATH" gpuci_logger `ls ${RAFT_BUILD_DIR}` From c0ee958f70e9e615de059e560d15e7918f45e622 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 12 Mar 2022 20:24:16 -0500 Subject: [PATCH 073/167] Still trying to figure out what directories are available --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0ec7782399..02047e68fe 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -94,7 +94,7 @@ gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft/cpu/" +export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft" export LD_LIBRARY_PATH="$RAFT_BUILD_DIR:$LD_LIBRARY_PATH" gpuci_logger `ls ${RAFT_BUILD_DIR}` From 73ba48f84af3903b783ece713dad02f714f0bb12 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 12 Mar 2022 20:27:16 -0500 Subject: [PATCH 074/167] Adding work dirs back --- ci/cpu/build.sh | 12 ++++++------ ci/cpu/upload.sh | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 178f1c50cf..f811aaab79 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -69,7 +69,7 @@ conda config --set ssl_verify False # machine with a single CUDA version, then have the gpu/build.sh script simply # install. This should eliminate a mismatch between different CUDA versions on # cpu vs. gpu builds that is problematic with CUDA 11.5 Enhanced Compat. 
-if [ "$BUILD_LIBRAFT" == '1' ]; then +if [ "$BUILD_LIBRAFT" == "1" ]; then BUILD_RAFT=1 # If we are doing CUDA + Python builds, libraft package is located at ${CONDA_BLD_DIR} CONDA_LOCAL_CHANNEL="${CONDA_BLD_DIR}" @@ -110,23 +110,23 @@ else gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-runtime" fi -if [ "$BUILD_RAFT" == "1" ]; then - gpuci_logger "Building conda packages for pyraft" +if [ "$BUILD_RAFT" == '1' ]; then + gpuci_logger "Building Python conda packages for raft" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft --python=$PYTHON else gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON - mkdir -p ${CONDA_BLD_DIR}/pyraft + mkdir -p ${CONDA_BLD_DIR}/pyraft/work mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pylibraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON - mkdir -p ${CONDA_BLD_DIR}/pylibraft + mkdir -p ${CONDA_BLD_DIR}/pylibraft/work mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pylibraft/work fi else - gpuci_logger "SKIPPING build of conda packages for pyraft" + gpuci_logger "SKIPPING build of Python conda packages for raft" fi ################################################################################ diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 0b217f115b..28fd34eba8 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -30,7 +30,7 @@ fi gpuci_logger "Get conda file output locations" -export LIBRAFT_HEADERS_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers --output` +export LIBRAFT_RUNTIME_FILE=`conda build --croot ${CONDA_BLD_DIR} 
conda/recipes/libraft_runtime --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` @@ -44,10 +44,10 @@ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBRAFT" == "1" && "$UPLOAD_LIBRAFT" == "1" ]]; then - test -e ${LIBRAFT_HEADERS_FILE} - echo "Upload libraft-nn" + test -e ${LIBRAFT_RUNTIME_FILE} + echo "Upload libraft-runtime" echo ${LIBRAFT_HEADERS_FILE} - gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBRAFT_HEADERS_FILE} --no-progress + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBRAFT_RUNTIME_FILE} --no-progress test -e ${LIBRAFT_NN_FILE} echo "Upload libraft-nn" From 39394c68c1dc495ec9aae8459da4f281bfe7d352 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 12 Mar 2022 20:27:39 -0500 Subject: [PATCH 075/167] Printing out all children of /ci/artifacts/raft --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 02047e68fe..e82582bcbc 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -97,7 +97,7 @@ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft" export LD_LIBRARY_PATH="$RAFT_BUILD_DIR:$LD_LIBRARY_PATH" -gpuci_logger `ls ${RAFT_BUILD_DIR}` +gpuci_logger `find ${RAFT_BUILD_DIR}` gpuci_logger "Build C++ and Python targets" # These should link against the existing shared libs From f263ddee2c938d0ca511c4ec42958343c87a2510 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 12 Mar 2022 20:39:31 -0500 Subject: [PATCH 076/167] Installing libraft-* conda packages from previous job --- ci/gpu/build.sh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e82582bcbc..e840d65be4 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -37,6 +37,8 @@ export SCCACHE_BUCKET="rapids-sccache" export SCCACHE_REGION="us-west-2" export SCCACHE_IDLE_TIMEOUT="32768" +export LIBRAFT_CONDA_PACKAGES="$WORKSPACE/ci/artifacts/raft/cpu/.conda-bld/linux-64" + ################################################################################ # SETUP - Check environment ################################################################################ @@ -73,6 +75,9 @@ pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-de pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x +# Install pre-built conda packages from previous CI step +gpuci_logger "Install libraft conda packages from CPU job" +gpuci_mamba_retry install --use-local "${LIBRAFT_CONDA_PACKAGES}/libraft*.bz2" gpuci_logger "Check compiler versions" python --version @@ -90,15 +95,9 @@ conda list --show-channel-urls gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" - export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -export RAFT_BUILD_DIR="$WORKSPACE/ci/artifacts/raft" -export LD_LIBRARY_PATH="$RAFT_BUILD_DIR:$LD_LIBRARY_PATH" - -gpuci_logger `find ${RAFT_BUILD_DIR}` - gpuci_logger "Build C++ and Python targets" # These should link against the existing shared libs if hasArg --skip-tests; then From a5eb563c188b23cdab831fc994e9f36e0b174c9d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 14 Mar 2022 12:17:08 -0400 Subject: [PATCH 077/167] Renaming raft runtime -> client api --- BUILD.md | 8 ++++---- README.md | 8 ++++---- ci/cpu/build.sh | 14 +++++++------- ci/cpu/upload.sh | 10 +++++----- ci/gpu/build.sh | 11 +++++------ .../build.sh | 0 .../meta.yaml | 4 ++-- conda/recipes/libraft_distance/meta.yaml | 4 ++-- conda/recipes/libraft_nn/meta.yaml | 4 ++-- cpp/CMakeLists.txt | 4 ++-- .../{raft_runtime => raft_client}/comms/comms.hpp | 2 +- .../{raft_runtime => raft_client}/cudart_utils.hpp | 2 +- .../{raft_runtime => raft_client}/error.hpp | 0 .../{raft_runtime => raft_client}/handle.hpp | 12 ++++++------ .../interruptible.hpp | 4 ++-- .../linalg/cublas_macros.hpp | 2 +- .../linalg/cusolver_macros.hpp | 2 +- cpp/include/{raft_runtime => raft_client}/raft.hpp | 0 .../sparse/cusparse_macros.hpp | 2 +- 19 files changed, 46 insertions(+), 47 deletions(-) rename conda/recipes/{libraft_runtime => libraft_client_api}/build.sh (100%) rename conda/recipes/{libraft_runtime => libraft_client_api}/meta.yaml (96%) rename cpp/include/{raft_runtime => raft_client}/comms/comms.hpp (99%) rename cpp/include/{raft_runtime => raft_client}/cudart_utils.hpp (99%) rename cpp/include/{raft_runtime => raft_client}/error.hpp (100%) rename cpp/include/{raft_runtime => raft_client}/handle.hpp (97%) rename cpp/include/{raft_runtime => raft_client}/interruptible.hpp (99%) rename cpp/include/{raft_runtime => raft_client}/linalg/cublas_macros.hpp (99%) rename cpp/include/{raft_runtime => raft_client}/linalg/cusolver_macros.hpp (99%) rename cpp/include/{raft_runtime => raft_client}/raft.hpp (100%) rename cpp/include/{raft_runtime => raft_client}/sparse/cusparse_macros.hpp (99%) diff --git a/BUILD.md b/BUILD.md index b0d672b3e3..1ecbf26a60 100644 --- a/BUILD.md +++ b/BUILD.md @@ -136,9 +136,9 @@ py.test -s -v raft ### C++ header-only integration using cmake -The RAFT headers are broken down into two different include paths so that build-time headers can be isolated 
between projects while runtime headers can be installed globally, exposed to users, and shared across projects. -- `cpp/include/raft_runtime` contains runtime headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. -- `cpp/include/raft` contains build-time headers that are the core of the RAFT library, containing primitives, algorithms, and other tools. +The RAFT headers are broken down into two different include paths so that build-time headers can be isolated between projects while client API headers can be installed globally, exposed to users, and shared across projects. +- `cpp/include/raft_client` contains client API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. +- `cpp/include/raft` contains the core of the RAFT header-only library, containing primitives, algorithms, and other tools. Use `find_package(raft)` and the `raft::raft` if using RAFT to interact only with the public APIs of consuming projects. @@ -158,7 +158,7 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_runtime` headers and `raft::headers` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. 
RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_client` headers and `raft::headers` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: diff --git a/README.md b/README.md index 8bb8c6dc10..cd778abdf2 100755 --- a/README.md +++ b/README.md @@ -87,14 +87,14 @@ RAFT can be installed through conda, cmake-package-manager (cpm), or by building ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-runtime` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc` +- `libraft-client-api` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. -- `pyraft` (optional) contains reusable Python tools to accelerate Python algorithm development +- `pyraft` (optional) contains reusable Python tools to accelerate Python algorithm development. 
To install RAFT with conda (change to `rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-runtime libraft-nn libraft-distance pyraft +conda install -c rapidsai libraft-client-api libraft-nn libraft-distance pyraft ``` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. @@ -156,7 +156,7 @@ Several cmake targets can be made available by adding components in the table be | Component | Target | Description | Dependencies | | --- | --- | --- | --- | -| n/a | `raft::raft` | Only RAFT runtime headers. Safe to expose in public APIs | Cudatoolkit libraries, RMM | +| n/a | `raft::raft` | Only RAFT client API headers. Safe to expose in public APIs. | Cudatoolkit libraries, RMM | | headers | `raft::headers` | ALL RAFT headers | std::mdspan, cuCollections, Thrust, NVTools | | distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::headers | | nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::headers, FAISS | diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f68b9c8a3e..6983414e77 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,16 +85,16 @@ gpuci_mamba_retry install -c conda-forge boa ############################################################################### if [ "$BUILD_LIBRAFT" == '1' ]; then - gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-runtime" + gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-client-api" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_runtime + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_client_api gpuci_conda_retry mambabuild --no-build-id 
--croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance else - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_runtime + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_client_api gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" - mkdir -p ${CONDA_BLD_DIR}/libraft_runtime/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_runtime/work + mkdir -p ${CONDA_BLD_DIR}/libraft_client_api/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_client_api/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" @@ -107,7 +107,7 @@ if [ "$BUILD_LIBRAFT" == '1' ]; then mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance/work fi else - gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-runtime" + gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-client-api" fi if [ "$BUILD_RAFT" == "1" ]; then @@ -116,7 +116,7 @@ if [ "$BUILD_RAFT" == "1" ]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON else gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/pyraft -c ${CONDA_LOCAL_CHANNEL} --dirty --no-remove-work-dir --python=$PYTHON - mkdir -p ${CONDA_BLD_DIR}/pyraft + mkdir -p ${CONDA_BLD_DIR}/pyraft/work mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/pyraft/work fi else diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index fe1d651c31..74dfa38ff5 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -30,7 +30,7 @@ fi gpuci_logger "Get conda file output locations" -export LIBRAFT_HEADERS_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers --output` 
+export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_client_api --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` @@ -43,10 +43,10 @@ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBRAFT" == "1" && "$UPLOAD_LIBRAFT" == "1" ]]; then - test -e ${LIBRAFT_HEADERS_FILE} - echo "Upload libraft-nn" - echo ${LIBRAFT_HEADERS_FILE} - gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBRAFT_HEADERS_FILE} --no-progress + test -e ${LIBRAFT_CLIENT_API_FILE} + echo "Upload libraft-client-api" + echo ${LIBRAFT_CLIENT_API_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBRAFT_CLIENT_API_FILE} --no-progress test -e ${LIBRAFT_NN_FILE} echo "Upload libraft-nn" diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1affaef0b1..1c4a504069 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -37,6 +37,8 @@ export SCCACHE_BUCKET="rapids-sccache" export SCCACHE_REGION="us-west-2" export SCCACHE_IDLE_TIMEOUT="32768" +export LIBRAFT_CONDA_PACKAGES="$WORKSPACE/ci/artifacts/raft/cpu/.conda-bld/linux-64" + ################################################################################ # SETUP - Check environment ################################################################################ @@ -73,6 +75,9 @@ pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-de pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x +# Install pre-built conda packages from previous CI step +gpuci_logger "Install libraft conda packages from CPU job" +gpuci_mamba_retry 
install --use-local "${LIBRAFT_CONDA_PACKAGES}/libraft*.bz2" gpuci_logger "Check compiler versions" python --version @@ -90,7 +95,6 @@ conda list --show-channel-urls gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" -export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH gpuci_logger "Build C++ and Python targets" @@ -107,11 +111,6 @@ sccache --show-stats gpuci_logger "Building docs" "$WORKSPACE/build.sh" docs -v -gpuci_logger "Resetting LD_LIBRARY_PATH" - -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED -export LD_LIBRARY_PATH_CACHED="" - ################################################################################ # TEST - Run GoogleTest and py.tests for RAFT ################################################################################ diff --git a/conda/recipes/libraft_runtime/build.sh b/conda/recipes/libraft_client_api/build.sh similarity index 100% rename from conda/recipes/libraft_runtime/build.sh rename to conda/recipes/libraft_client_api/build.sh diff --git a/conda/recipes/libraft_runtime/meta.yaml b/conda/recipes/libraft_client_api/meta.yaml similarity index 96% rename from conda/recipes/libraft_runtime/meta.yaml rename to conda/recipes/libraft_client_api/meta.yaml index b98d26a308..adeaf85e99 100644 --- a/conda/recipes/libraft_runtime/meta.yaml +++ b/conda/recipes/libraft_client_api/meta.yaml @@ -8,7 +8,7 @@ {% set cuda_major=cuda_version.split('.')[0] %} {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: libraft-runtime + name: libraft-client-api version: {{ version }} source: @@ -58,4 +58,4 @@ about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: libraft-runtime library + summary: libraft-client-api library diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index c85c5a8ac3..24afa82e80 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ 
-38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-runtime {{ version }} + - libraft-client-api {{ version }} - nccl>=2.9.9 - cudatoolkit {{ cuda_version }}.* - ucx-py {{ ucx_py_version }} @@ -47,7 +47,7 @@ requirements: - gmock - librmm {{ minor_version }} run: - - libraft-runtime {{ version }} + - libraft-client-api {{ version }} - nccl>=2.9.9 - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index ffa3a26bb5..2ef17902a9 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-runtime {{ version }} + - libraft-client-api {{ version }} - cudatoolkit {{ cuda_version }}.* - lapack - faiss-proc=*=cuda @@ -48,7 +48,7 @@ requirements: - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - libraft-runtime {{ version }} + - libraft-client-api {{ version }} - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - libcusolver>=11.2.1 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a57222ba6c..9f4e885e10 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -331,11 +331,11 @@ if(TARGET raft_nn_lib) endif() -install(DIRECTORY include/raft_runtime +install(DIRECTORY include/raft_client DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft_runtime/raft.hpp +install(FILES include/raft_client/raft.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) ############################################################################## diff --git a/cpp/include/raft_runtime/comms/comms.hpp b/cpp/include/raft_client/comms/comms.hpp similarity index 99% rename from cpp/include/raft_runtime/comms/comms.hpp rename to cpp/include/raft_client/comms/comms.hpp index 9e2aa9fa84..d8ab741610 100644 --- a/cpp/include/raft_runtime/comms/comms.hpp +++ 
b/cpp/include/raft_client/comms/comms.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_runtime/cudart_utils.hpp b/cpp/include/raft_client/cudart_utils.hpp similarity index 99% rename from cpp/include/raft_runtime/cudart_utils.hpp rename to cpp/include/raft_client/cudart_utils.hpp index 153f426c09..e737181689 100644 --- a/cpp/include/raft_runtime/cudart_utils.hpp +++ b/cpp/include/raft_client/cudart_utils.hpp @@ -19,7 +19,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft_runtime/error.hpp b/cpp/include/raft_client/error.hpp similarity index 100% rename from cpp/include/raft_runtime/error.hpp rename to cpp/include/raft_client/error.hpp diff --git a/cpp/include/raft_runtime/handle.hpp b/cpp/include/raft_client/handle.hpp similarity index 97% rename from cpp/include/raft_runtime/handle.hpp rename to cpp/include/raft_client/handle.hpp index 99156b1971..58d5ae3959 100644 --- a/cpp/include/raft_runtime/handle.hpp +++ b/cpp/include/raft_client/handle.hpp @@ -35,13 +35,13 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include +#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include diff --git a/cpp/include/raft_runtime/interruptible.hpp b/cpp/include/raft_client/interruptible.hpp similarity index 99% rename from cpp/include/raft_runtime/interruptible.hpp rename to cpp/include/raft_client/interruptible.hpp index 8b9c15a4e1..da5d9eccd3 100644 --- a/cpp/include/raft_runtime/interruptible.hpp +++ b/cpp/include/raft_client/interruptible.hpp @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft_runtime/linalg/cublas_macros.hpp b/cpp/include/raft_client/linalg/cublas_macros.hpp similarity index 99% rename from cpp/include/raft_runtime/linalg/cublas_macros.hpp rename to 
cpp/include/raft_client/linalg/cublas_macros.hpp index 6ab105bd0a..16d4e6e6c8 100644 --- a/cpp/include/raft_runtime/linalg/cublas_macros.hpp +++ b/cpp/include/raft_client/linalg/cublas_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once we have logger enabled //#include diff --git a/cpp/include/raft_runtime/linalg/cusolver_macros.hpp b/cpp/include/raft_client/linalg/cusolver_macros.hpp similarity index 99% rename from cpp/include/raft_runtime/linalg/cusolver_macros.hpp rename to cpp/include/raft_client/linalg/cusolver_macros.hpp index 7d727f1c6e..4dbcb44908 100644 --- a/cpp/include/raft_runtime/linalg/cusolver_macros.hpp +++ b/cpp/include/raft_client/linalg/cusolver_macros.hpp @@ -23,7 +23,7 @@ #include ///@todo: enable this once logging is enabled //#include -#include +#include #include #define _CUSOLVER_ERR_TO_STR(err) \ diff --git a/cpp/include/raft_runtime/raft.hpp b/cpp/include/raft_client/raft.hpp similarity index 100% rename from cpp/include/raft_runtime/raft.hpp rename to cpp/include/raft_client/raft.hpp diff --git a/cpp/include/raft_runtime/sparse/cusparse_macros.hpp b/cpp/include/raft_client/sparse/cusparse_macros.hpp similarity index 99% rename from cpp/include/raft_runtime/sparse/cusparse_macros.hpp rename to cpp/include/raft_client/sparse/cusparse_macros.hpp index 9cb69f9551..d1a436d0cb 100644 --- a/cpp/include/raft_runtime/sparse/cusparse_macros.hpp +++ b/cpp/include/raft_client/sparse/cusparse_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once logging is enabled //#include From 6f98b72dc0f4b4cb2af005358037fad2b0e1f468 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 16:21:21 -0400 Subject: [PATCH 078/167] Checking in changes. 
Test runs but results are not corre --- cpp/include/raft/comms/comms.hpp | 2 +- python/pylibraft/pylibraft/common/handle.pxd | 2 +- .../pylibraft/common/interruptible.pxd | 2 +- .../pylibraft/pylibraft/distance/__init__.py | 2 +- .../pylibraft/distance/pairwise_distance.pyx | 4 +- .../pylibraft/pylibraft/test/test_distance.py | 59 ++++++++++++++++++- 6 files changed, 63 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index c8a77a00bb..28c7bdfccd 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,7 +25,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index ed8b11dca0..cb203f3255 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -30,7 +30,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_client/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd index cb639c0f72..c1f9ca2f00 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pxd +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -22,7 +22,7 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_client/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py index 278f6d9a81..ca3e6c5a2e 100644 --- 
a/python/pylibraft/pylibraft/distance/__init__.py +++ b/python/pylibraft/pylibraft/distance/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. # -from pairwise_distance import distance as pairwise_distance \ No newline at end of file +from .pairwise_distance import distance as pairwise_distance \ No newline at end of file diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 08c63cbdce..3e89088219 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -58,5 +58,5 @@ def distance(X, Y, dists, metric="euclidean"): m, n, k, - DistanceType2.L2Expanded, - true, 0.0) + DistanceType.L2SqrtUnexpanded, + True, 0.0) diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index f748f744d1..d5cb1b607e 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -13,5 +13,60 @@ # limitations under the License. 
# -def test_distance(): - assert True +from scipy.spatial.distance import cdist +import pytest +import numpy as np + +import rmm + +from pylibraft.distance import pairwise_distance + + +class TestDeviceBuffer: + + def __init__(self, ndarray): + self.ndarray_ = ndarray + self.device_buffer_ = \ + rmm.DeviceBuffer.to_device(ndarray.ravel(order="C").tobytes()) + + @property + def __cuda_array_interface__(self): + device_cai = self.device_buffer_.__cuda_array_interface__ + host_cai = self.ndarray_.__array_interface__.copy() + host_cai["data"] = (device_cai["data"][0], device_cai["data"][1]) + + return host_cai + + def copy_to_host(self): + return np.frombuffer(self.device_buffer_.tobytes(), + dtype=self.ndarray_.dtype, + like=self.ndarray_)\ + .astype(self.ndarray_.dtype)\ + .reshape(self.ndarray_.shape) + + +@pytest.mark.parametrize("n_rows", [10, 100, 1000]) +@pytest.mark.parametrize("n_cols", [10, 100, 1000]) +@pytest.mark.parametrize("dtype", [np.float32]) +def test_distance(n_rows, n_cols, dtype): + input1 = np.random.random_sample((n_rows, n_cols)).astype(dtype) + output = np.zeros((n_rows, n_rows), dtype=dtype) + + expected = cdist(input1, input1, "euclidean") + + input1_device = TestDeviceBuffer(input1) + output_device = TestDeviceBuffer(output) + + pairwise_distance(input1_device, input1_device, output_device) + + actual = output_device.copy_to_host() + + print(str(expected)) + print(str(actual)) + + assert np.allclose(expected, actual) + # result = np.frombuffer(output_device.copy_to_host().tobytes(), dtype) + + # print(str(result.__array_interface__)) + # + # print(str(result.dtype)) From 57399bcd43ad4ae3cd0b8a71762e767076cc0ca1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 16:23:59 -0400 Subject: [PATCH 079/167] and... 
tests are passing --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 2 +- python/pylibraft/pylibraft/test/test_distance.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 3e89088219..8b44414f1c 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -42,7 +42,7 @@ def distance(X, Y, dists, metric="euclidean"): m = x_cai["shape"][0] n = y_cai["shape"][0] - k = dists_cai["shape"][0] + k = x_cai["shape"][1] x_ptr = x_cai["data"][0] y_ptr = y_cai["data"][0] diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index d5cb1b607e..a44d410186 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -58,12 +58,8 @@ def test_distance(n_rows, n_cols, dtype): output_device = TestDeviceBuffer(output) pairwise_distance(input1_device, input1_device, output_device) - actual = output_device.copy_to_host() - print(str(expected)) - print(str(actual)) - assert np.allclose(expected, actual) # result = np.frombuffer(output_device.copy_to_host().tobytes(), dtype) From 01a0b52eef3f49527346be9a0bbb4a32907e4a92 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 14 Mar 2022 16:32:05 -0400 Subject: [PATCH 080/167] Support for double precision --- .../pylibraft/distance/pairwise_distance.pyx | 40 ++++++++++++++----- .../pylibraft/pylibraft/test/test_distance.py | 7 +--- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 8b44414f1c..81b840469b 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref @@ -51,12 +52,33 @@ def distance(X, Y, dists, metric="euclidean"): cdef handle_t *h = new handle_t() # TODO: Support single and double precision - pairwise_distance(deref(h), - x_ptr, - y_ptr, - d_ptr, - m, - n, - k, - DistanceType.L2SqrtUnexpanded, - True, 0.0) + x_dt = np.dtype(x_cai["typestr"]) + y_dt = np.dtype(y_cai["typestr"]) + d_dt = np.dtype(dists_cai["typestr"]) + + if x_dt != y_dt or x_dt != d_dt: + raise ValueError("Inputs must have the same dtypes") + + + if x_dt == np.float32: + pairwise_distance(deref(h), + x_ptr, + y_ptr, + d_ptr, + m, + n, + k, + DistanceType.L2SqrtUnexpanded, + True, 0.0) + elif x_dt == np.float64: + pairwise_distance(deref(h), + x_ptr, + y_ptr, + d_ptr, + m, + n, + k, + DistanceType.L2SqrtUnexpanded, + True, 0.0) + else: + raise ValueError("dtype %s not supported" % x_dt) \ No newline at end of file diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index a44d410186..cc4cbab949 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -47,7 +47,7 @@ def copy_to_host(self): @pytest.mark.parametrize("n_rows", [10, 100, 
1000]) @pytest.mark.parametrize("n_cols", [10, 100, 1000]) -@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_distance(n_rows, n_cols, dtype): input1 = np.random.random_sample((n_rows, n_cols)).astype(dtype) output = np.zeros((n_rows, n_rows), dtype=dtype) @@ -61,8 +61,3 @@ def test_distance(n_rows, n_cols, dtype): actual = output_device.copy_to_host() assert np.allclose(expected, actual) - # result = np.frombuffer(output_device.copy_to_host().tobytes(), dtype) - - # print(str(result.__array_interface__)) - # - # print(str(result.dtype)) From 742eea8f894aedbcefe7ce7877ac7ba920073382 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 18:39:59 -0400 Subject: [PATCH 081/167] Fixing style --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 81b840469b..1d1b704c23 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -59,7 +59,6 @@ def distance(X, Y, dists, metric="euclidean"): if x_dt != y_dt or x_dt != d_dt: raise ValueError("Inputs must have the same dtypes") - if x_dt == np.float32: pairwise_distance(deref(h), x_ptr, @@ -81,4 +80,4 @@ def distance(X, Y, dists, metric="euclidean"): DistanceType.L2SqrtUnexpanded, True, 0.0) else: - raise ValueError("dtype %s not supported" % x_dt) \ No newline at end of file + raise ValueError("dtype %s not supported" % x_dt) From f03fac35e01dd50882f3b2bccb3672cb8dc08117 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 14 Mar 2022 18:47:09 -0400 Subject: [PATCH 082/167] Changing libraft_client_api to libraft_frontend --- ci/cpu/build.sh | 8 ++-- ci/cpu/upload.sh | 2 +- conda/recipes/libraft_client_api/meta.yaml | 4 +- conda/recipes/libraft_distance/meta.yaml | 4 +- conda/recipes/libraft_nn/meta.yaml | 4 +- conda/recipes/pyraft/meta.yaml | 4 +- cpp/CMakeLists.txt | 4 +- cpp/include/raft/comms/comms.hpp | 2 +- .../comms/comms.hpp | 2 +- .../cudart_utils.hpp | 2 +- .../{raft_client => raft_frontend}/error.hpp | 0 .../{raft_client => raft_frontend}/handle.hpp | 12 ++--- .../interruptible.hpp | 4 +- .../linalg/cublas_macros.hpp | 2 +- .../linalg/cusolver_macros.hpp | 2 +- .../{raft_client => raft_frontend}/raft.hpp | 0 .../sparse/cusparse_macros.hpp | 2 +- python/raft/common/handle.pxd | 2 +- python/raft/common/interruptible.pxd | 2 +- python/raft/record.txt | 44 +++++++++++++++++++ 20 files changed, 75 insertions(+), 31 deletions(-) rename cpp/include/{raft_client => raft_frontend}/comms/comms.hpp (99%) rename cpp/include/{raft_client => raft_frontend}/cudart_utils.hpp (99%) rename cpp/include/{raft_client => raft_frontend}/error.hpp (100%) rename cpp/include/{raft_client => raft_frontend}/handle.hpp (97%) rename cpp/include/{raft_client => raft_frontend}/interruptible.hpp (99%) rename cpp/include/{raft_client => raft_frontend}/linalg/cublas_macros.hpp (99%) rename cpp/include/{raft_client => raft_frontend}/linalg/cusolver_macros.hpp (99%) rename cpp/include/{raft_client => raft_frontend}/raft.hpp (100%) rename cpp/include/{raft_client => raft_frontend}/sparse/cusparse_macros.hpp (99%) create mode 100644 python/raft/record.txt diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 6983414e77..6c18d24f1b 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -87,14 +87,14 @@ gpuci_mamba_retry install -c conda-forge boa if [ "$BUILD_LIBRAFT" == '1' ]; then gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-client-api" if [[ -z 
"$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_client_api + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_frontend gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance else - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_client_api + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_frontend gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" - mkdir -p ${CONDA_BLD_DIR}/libraft_client_api/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_client_api/work + mkdir -p ${CONDA_BLD_DIR}/libraft_frontend/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_frontend/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 74dfa38ff5..0829971962 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -30,7 +30,7 @@ fi gpuci_logger "Get conda file output locations" -export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_client_api --output` +export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_frontend --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` diff --git a/conda/recipes/libraft_client_api/meta.yaml 
b/conda/recipes/libraft_client_api/meta.yaml index adeaf85e99..fff8692173 100644 --- a/conda/recipes/libraft_client_api/meta.yaml +++ b/conda/recipes/libraft_client_api/meta.yaml @@ -8,7 +8,7 @@ {% set cuda_major=cuda_version.split('.')[0] %} {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: libraft-client-api + name: libraft-frontend version: {{ version }} source: @@ -58,4 +58,4 @@ about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: libraft-client-api library + summary: libraft-frontend library diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index 24afa82e80..ef4aaa5eb9 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-client-api {{ version }} + - libraft-frontend {{ version }} - nccl>=2.9.9 - cudatoolkit {{ cuda_version }}.* - ucx-py {{ ucx_py_version }} @@ -47,7 +47,7 @@ requirements: - gmock - librmm {{ minor_version }} run: - - libraft-client-api {{ version }} + - libraft-frontend {{ version }} - nccl>=2.9.9 - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 2ef17902a9..1baf839dab 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-client-api {{ version }} + - libraft-frontend {{ version }} - cudatoolkit {{ cuda_version }}.* - lapack - faiss-proc=*=cuda @@ -48,7 +48,7 @@ requirements: - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - libraft-client-api {{ version }} + - libraft-frontend {{ version }} - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - libcusolver>=11.2.1 diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml index 2408701377..5ff0477479 100644 --- 
a/conda/recipes/pyraft/meta.yaml +++ b/conda/recipes/pyraft/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-runtime {{ version }} + - libraft-frontend {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 - nccl>=2.9.9 @@ -39,7 +39,7 @@ requirements: run: - python x.x - dask-cuda {{ minor_version }} - - libraft-runtime {{ version }} + - libraft-frontend {{ version }} - nccl>=2.9.9 - rmm {{ minor_version }} - ucx-py {{ ucx_py_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9f4e885e10..76145d66af 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -331,11 +331,11 @@ if(TARGET raft_nn_lib) endif() -install(DIRECTORY include/raft_client +install(DIRECTORY include/raft_frontend DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft_client/raft.hpp +install(FILES include/raft_frontend/raft.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) ############################################################################## diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index c8a77a00bb..d20707f0cb 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,7 +25,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_client/comms/comms.hpp b/cpp/include/raft_frontend/comms/comms.hpp similarity index 99% rename from cpp/include/raft_client/comms/comms.hpp rename to cpp/include/raft_frontend/comms/comms.hpp index d8ab741610..a08ef37fc6 100644 --- a/cpp/include/raft_client/comms/comms.hpp +++ b/cpp/include/raft_frontend/comms/comms.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_client/cudart_utils.hpp b/cpp/include/raft_frontend/cudart_utils.hpp similarity index 99% rename from 
cpp/include/raft_client/cudart_utils.hpp rename to cpp/include/raft_frontend/cudart_utils.hpp index e737181689..7479468626 100644 --- a/cpp/include/raft_client/cudart_utils.hpp +++ b/cpp/include/raft_frontend/cudart_utils.hpp @@ -19,7 +19,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft_client/error.hpp b/cpp/include/raft_frontend/error.hpp similarity index 100% rename from cpp/include/raft_client/error.hpp rename to cpp/include/raft_frontend/error.hpp diff --git a/cpp/include/raft_client/handle.hpp b/cpp/include/raft_frontend/handle.hpp similarity index 97% rename from cpp/include/raft_client/handle.hpp rename to cpp/include/raft_frontend/handle.hpp index 58d5ae3959..58522e5b88 100644 --- a/cpp/include/raft_client/handle.hpp +++ b/cpp/include/raft_frontend/handle.hpp @@ -35,13 +35,13 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include +#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include diff --git a/cpp/include/raft_client/interruptible.hpp b/cpp/include/raft_frontend/interruptible.hpp similarity index 99% rename from cpp/include/raft_client/interruptible.hpp rename to cpp/include/raft_frontend/interruptible.hpp index da5d9eccd3..b5a9691496 100644 --- a/cpp/include/raft_client/interruptible.hpp +++ b/cpp/include/raft_frontend/interruptible.hpp @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft_client/linalg/cublas_macros.hpp b/cpp/include/raft_frontend/linalg/cublas_macros.hpp similarity index 99% rename from cpp/include/raft_client/linalg/cublas_macros.hpp rename to cpp/include/raft_frontend/linalg/cublas_macros.hpp index 16d4e6e6c8..28722b18a7 100644 --- a/cpp/include/raft_client/linalg/cublas_macros.hpp +++ b/cpp/include/raft_frontend/linalg/cublas_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: 
enable this once we have logger enabled //#include diff --git a/cpp/include/raft_client/linalg/cusolver_macros.hpp b/cpp/include/raft_frontend/linalg/cusolver_macros.hpp similarity index 99% rename from cpp/include/raft_client/linalg/cusolver_macros.hpp rename to cpp/include/raft_frontend/linalg/cusolver_macros.hpp index 4dbcb44908..92151be237 100644 --- a/cpp/include/raft_client/linalg/cusolver_macros.hpp +++ b/cpp/include/raft_frontend/linalg/cusolver_macros.hpp @@ -23,7 +23,7 @@ #include ///@todo: enable this once logging is enabled //#include -#include +#include #include #define _CUSOLVER_ERR_TO_STR(err) \ diff --git a/cpp/include/raft_client/raft.hpp b/cpp/include/raft_frontend/raft.hpp similarity index 100% rename from cpp/include/raft_client/raft.hpp rename to cpp/include/raft_frontend/raft.hpp diff --git a/cpp/include/raft_client/sparse/cusparse_macros.hpp b/cpp/include/raft_frontend/sparse/cusparse_macros.hpp similarity index 99% rename from cpp/include/raft_client/sparse/cusparse_macros.hpp rename to cpp/include/raft_frontend/sparse/cusparse_macros.hpp index d1a436d0cb..e0d7af33bc 100644 --- a/cpp/include/raft_client/sparse/cusparse_macros.hpp +++ b/cpp/include/raft_frontend/sparse/cusparse_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once logging is enabled //#include diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index ce9bac434c..b3b12e9f9e 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -31,7 +31,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_runtime/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_frontend/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/raft/common/interruptible.pxd b/python/raft/common/interruptible.pxd index d858f88beb..390c3ddb59 100644 --- 
a/python/raft/common/interruptible.pxd +++ b/python/raft/common/interruptible.pxd @@ -22,7 +22,7 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_runtime/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_frontend/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() diff --git a/python/raft/record.txt b/python/raft/record.txt new file mode 100644 index 0000000000..61d6e527ff --- /dev/null +++ b/python/raft/record.txt @@ -0,0 +1,44 @@ +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/__init__.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/_version.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__init__.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/test_comms.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/conftest.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/test_raft.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/test_interruptible.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/__init__.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/include_test/__init__.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/__init__.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__init__.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/comms.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/utils.py +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/ucx.py 
+/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/handle.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/__init__.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/cuda.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/interruptible.pxd +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/__pycache__/_version.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/test_comms.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/conftest.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/test_raft.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/test_interruptible.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/include_test/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/__init__.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/comms.cpython-39.pyc 
+/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/utils.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/ucx.cpython-39.pyc +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/common/cuda.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/common/handle.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/common/interruptible.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/dask/common/comms_utils.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/dask/common/nccl.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/include_test/raft_include_test.cpython-39-x86_64-linux-gnu.so +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/top_level.txt +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/PKG-INFO +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/SOURCES.txt +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/requires.txt +/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/dependency_links.txt 
+/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/not-zip-safe From 117dd5db730bb7e4afbeb088ee7cff334a9161e8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 18:55:55 -0400 Subject: [PATCH 083/167] Rename client-api -> frontend --- BUILD.md | 13 +++--- README.md | 17 +++---- .../build.sh | 0 .../meta.yaml | 0 cpp/CMakeLists.txt | 44 +++++++++---------- cpp/cmake/thirdparty/get_cuco.cmake | 4 +- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 +- cpp/cmake/thirdparty/get_mdspan.cmake | 4 +- cpp/cmake/thirdparty/get_thrust.cmake | 4 +- cpp/test/CMakeLists.txt | 2 +- 11 files changed, 48 insertions(+), 46 deletions(-) rename conda/recipes/{libraft_client_api => libraft_frontend}/build.sh (100%) rename conda/recipes/{libraft_client_api => libraft_frontend}/meta.yaml (100%) diff --git a/BUILD.md b/BUILD.md index 1ecbf26a60..f718fb3280 100644 --- a/BUILD.md +++ b/BUILD.md @@ -136,17 +136,17 @@ py.test -s -v raft ### C++ header-only integration using cmake -The RAFT headers are broken down into two different include paths so that build-time headers can be isolated between projects while client API headers can be installed globally, exposed to users, and shared across projects. -- `cpp/include/raft_client` contains client API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. -- `cpp/include/raft` contains the core of the RAFT header-only library, containing primitives, algorithms, and other tools. +The RAFT headers are broken down into two different include paths so that backend headers can be isolated between projects while frontend API headers can be installed globally, exposed to users through public APIs, and shared across projects. +- `cpp/include/raft_frontend` contains frontend API headers that require only rmm and the cudatoolkit libraries. 
These are safe to expose on public APIs and don't require `nvcc` to compile. +- `cpp/include/raft` contains the backend of the RAFT header-only library, containing primitives, algorithms, and other tools. Use `find_package(raft)` and the `raft::raft` if using RAFT to interact only with the public APIs of consuming projects. -Use `find_package(raft COMPONENTS headers` and both the `raft::raft` and `raft::headers` targets when building a library that uses headers in `include/raft`. +Use `find_package(raft COMPONENTS backend` and both the `raft::raft` and `raft::backend` targets when building a library that uses headers in `include/raft`. ### Using pre-compiled shared libraries -Use `find_package(raft COMPONENTS headers nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::headers` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). +Use `find_package(raft COMPONENTS backend nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::headers` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). The pre-compiled libraries contain template specializations for commonly used types. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. 
@@ -188,7 +188,7 @@ function(find_and_configure_raft) # Add components #----------------------------------------------------- - string(APPEND RAFT_COMPONENTS "headers") + string(APPEND RAFT_COMPONENTS "backend") if(PKG_USE_NN_LIBRARY) string(APPEND RAFT_COMPONENTS " nn") endif() @@ -212,6 +212,7 @@ function(find_and_configure_raft) FIND_PACKAGE_ARGUMENTS "COMPONENTS ${RAFT_COMPONENTS}" OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCH OFF" "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" diff --git a/README.md b/README.md index cd778abdf2..9c7e6cb73c 100755 --- a/README.md +++ b/README.md @@ -48,9 +48,9 @@ The `mdarray` forms a convenience layer over RMM and can be constructed in RAFT int n_rows = 10; int n_cols = 10; -auto scalar = raft::make_device_scalar(handle, 1.0); -auto vector = raft::make_device_vector(handle, n_cols); -auto matrix = raft::make_device_matrix(handle, n_rows, n_cols); +auto scalar = raft::make_device_scalar(handle, 1.0); +auto vector = raft::make_device_vector(handle, n_cols); +auto matrix = raft::make_device_matrix(handle, n_rows, n_cols); ``` ### C++ Example @@ -60,7 +60,7 @@ Most of the primitives in RAFT accept a `raft::handle_t` object for the manageme The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing pairwise Euclidean distances: ```c++ -#include +#include #include #include #include @@ -87,7 +87,7 @@ RAFT can be installed through conda, cmake-package-manager (cpm), or by building ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-client-api` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. 
The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. +- `libraft-frontend` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. - `pyraft` (optional) contains reusable Python tools to accelerate Python algorithm development. @@ -110,7 +110,7 @@ After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids set(RAFT_VERSION "22.04") set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") -set(RAFT_COMPONENTS "headers") +set(RAFT_COMPONENTS "backend") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC @@ -133,6 +133,7 @@ function(find_and_configure_raft) FIND_PACKAGE_ARGUMENTS "COMPONENTS ${RAFT_COMPONENTS}" OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCH OFF" "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" @@ -156,8 +157,8 @@ Several cmake targets can be made available by adding components in the table be | Component | Target | Description | Dependencies | | --- | --- | --- | --- | -| n/a | `raft::raft` | Only RAFT client API headers. Safe to expose in public APIs. | Cudatoolkit libraries, RMM | -| headers | `raft::headers` | ALL RAFT headers | std::mdspan, cuCollections, Thrust, NVTools | +| n/a | `raft::raft` | Only RAFT frontend API headers. Safe to expose in public APIs. 
| Cudatoolkit libraries, RMM | +| headers | `raft::backend` | RAFT backend headers | std::mdspan, cuCollections, Thrust, NVTools | | distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::headers | | nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::headers, FAISS | diff --git a/conda/recipes/libraft_client_api/build.sh b/conda/recipes/libraft_frontend/build.sh similarity index 100% rename from conda/recipes/libraft_client_api/build.sh rename to conda/recipes/libraft_frontend/build.sh diff --git a/conda/recipes/libraft_client_api/meta.yaml b/conda/recipes/libraft_frontend/meta.yaml similarity index 100% rename from conda/recipes/libraft_client_api/meta.yaml rename to conda/recipes/libraft_frontend/meta.yaml diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 76145d66af..250092b449 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -68,7 +68,7 @@ message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${NVTX}") message(VERBOSE "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") -list(APPEND raft_FIND_COMPONENTS "headers") +list(APPEND raft_FIND_COMPONENTS "backend") # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") @@ -139,26 +139,26 @@ set(RAFT_LINK_LIBS CUDA::cusparse rmm::rmm) -add_library(raft_headers INTERFACE) -if(TARGET raft_headers AND (NOT TARGET raft::headers)) - add_library(raft::headers ALIAS raft_headers) +add_library(raft_backend INTERFACE) +if(TARGET raft_backend AND (NOT TARGET raft::backend)) + add_library(raft::backend ALIAS raft_backend) endif() -set_target_properties(raft_headers PROPERTIES EXPORT_NAME headers) +set_target_properties(raft_backend PROPERTIES EXPORT_NAME backend) -target_include_directories(raft_headers INTERFACE +target_include_directories(raft_backend INTERFACE "$" "$") -target_link_libraries(raft_headers INTERFACE 
+target_link_libraries(raft_backend INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> ${RAFT_LINK_LIBS} cuco::cuco std::mdspan) -target_compile_definitions(raft_headers INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft_headers INTERFACE cxx_std_17 $) +target_compile_definitions(raft_backend INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft_backend INTERFACE cxx_std_17 $) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" @@ -236,7 +236,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) ) set_target_properties(raft_distance_lib PROPERTIES OUTPUT_NAME raft_distance) - target_link_libraries(raft_distance_lib PRIVATE raft::headers) + target_link_libraries(raft_distance_lib PRIVATE raft::backend) target_compile_options(raft_distance_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -280,7 +280,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) ) set_target_properties(raft_nn_lib PROPERTIES OUTPUT_NAME raft_nn) - target_link_libraries(raft_nn_lib PRIVATE raft::headers faiss::faiss) + target_link_libraries(raft_nn_lib PRIVATE raft::backend faiss::faiss) target_compile_options(raft_nn_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -307,9 +307,9 @@ install(TARGETS raft DESTINATION ${lib_dir} EXPORT raft-exports) -install(TARGETS raft_headers +install(TARGETS raft_backend DESTINATION ${lib_dir} - EXPORT raft-headers-exports) + EXPORT raft-backend-exports) install(TARGETS raft_distance DESTINATION ${lib_dir} @@ -352,7 +352,7 @@ Optional Components: - distance Imported Targets: - - raft::headers + - raft::backend - raft::raft - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component @@ -364,18 +364,18 @@ set(code_string if((distance IN_LIST raft_FIND_COMPONENTS OR nn IN_LIST raft_FIND_COMPONENTS) AND NOT - headers IN_LIST raft_FIND_COMPONENTS)) - FATAL_ERROR("headers must 
be included to use components ${raft_FIND_COMPONENTS}") + backend IN_LIST raft_FIND_COMPONENTS)) + FATAL_ERROR("backend must be included to use components ${raft_FIND_COMPONENTS}") endif() -if(headers IN_LIST raft_FIND_COMPONENTS) +if(backend IN_LIST raft_FIND_COMPONENTS) if(NOT TARGET raft::Thrust) thrust_create_target(raft::Thrust FROM_OPTIONS) endif() endif() if(distance IN_LIST raft_FIND_COMPONENTS OR - headers IN_LIST raft_FIND_COMPONENTS) + backend IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -394,9 +394,9 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export(INSTALL raft - COMPONENTS headers nn distance + COMPONENTS backend nn distance EXPORT_SET raft-exports - GLOBAL_TARGETS headers nn distance + GLOBAL_TARGETS backend nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) @@ -405,8 +405,8 @@ raft_export(INSTALL raft # - build export ------------------------------------------------------------- raft_export(BUILD raft EXPORT_SET raft-exports - COMPONENTS headers nn distance - GLOBAL_TARGETS raft_headers raft_distance raft_nn + COMPONENTS backend nn distance + GLOBAL_TARGETS raft_backend raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 05f1e3ff3a..5002213134 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -18,8 +18,8 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-headers-exports - INSTALL_EXPORT_SET raft-headers-exports + BUILD_EXPORT_SET raft-backend-exports + INSTALL_EXPORT_SET raft-backend-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git diff --git a/cpp/cmake/thirdparty/get_gtest.cmake 
b/cpp/cmake/thirdparty/get_gtest.cmake index b52518d415..00dce165ea 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -17,7 +17,7 @@ function(find_and_configure_gtest ) include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest(BUILD_EXPORT_SET raft-headers-exports + rapids_cpm_gtest(BUILD_EXPORT_SET raft-backend-exports EXCLUDE_FROM_ALL TRUE) if(GTest_ADDED) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 222ff3ce59..3345fe66f5 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,8 +16,8 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-headers-exports - INSTALL_EXPORT_SET raft-headers-exports + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-backend-exports + INSTALL_EXPORT_SET raft-backend-exports EXCLUDE_FROM_ALL TRUE) endfunction() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 3d2663bb5b..cf94bf5fd7 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -16,8 +16,8 @@ function(find_and_configure_mdspan VERSION) rapids_cpm_find( mdspan ${VERSION} GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-headers-exports - INSTALL_EXPORT_SET raft-headers-exports + BUILD_EXPORT_SET raft-backend-exports + INSTALL_EXPORT_SET raft-backend-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/rapidsai/mdspan.git diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 648ac0b5d5..acad2500ca 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -17,8 +17,8 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) rapids_cpm_thrust( NAMESPACE raft ) - rapids_export_package(BUILD thrust 
raft-headers-exports) - rapids_export_package(INSTALL thrust raft-headers-exports) + rapids_export_package(BUILD thrust raft-backend-exports) + rapids_export_package(INSTALL thrust raft-backend-exports) endfunction() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 16a5a1380a..f0f04542b7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -164,7 +164,7 @@ target_include_directories(test_raft target_link_libraries(test_raft PRIVATE raft::raft - raft::headers + raft::backend raft::distance raft::nn NCCL::NCCL From 078322e7b9504faa1947b83ac28dab956b6f68bf Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 18:59:24 -0400 Subject: [PATCH 084/167] More doc updates --- BUILD.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BUILD.md b/BUILD.md index f718fb3280..76df9b4848 100644 --- a/BUILD.md +++ b/BUILD.md @@ -146,7 +146,7 @@ Use `find_package(raft COMPONENTS backend` and both the `raft::raft` and `raft:: ### Using pre-compiled shared libraries -Use `find_package(raft COMPONENTS backend nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::headers` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). +Use `find_package(raft COMPONENTS backend nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::backend` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). The pre-compiled libraries contain template specializations for commonly used types. 
In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. @@ -158,7 +158,7 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_client` headers and `raft::headers` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_frontend` headers and `raft::headers` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: From 2c65908f5a28d533857d3ec0370e206ae121e67e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 14 Mar 2022 19:18:24 -0400 Subject: [PATCH 085/167] Fixing another raft::headers -> raft::backend --- BUILD.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD.md b/BUILD.md index 76df9b4848..ac613d4c8d 100644 --- a/BUILD.md +++ b/BUILD.md @@ -158,7 +158,7 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_frontend` headers and `raft::headers` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_frontend` headers and `raft::backend` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: From 0dddac14bb1dd5afbd20078245922d8ab86fbd50 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 14 Mar 2022 19:22:07 -0400 Subject: [PATCH 086/167] ugh --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9c7e6cb73c..fd6db01ae4 100755 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ The easiest way to install RAFT is through conda and several packages are provid To install RAFT with conda (change to `rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-client-api libraft-nn libraft-distance pyraft +conda install -c rapidsai libraft-frontend libraft-nn libraft-distance pyraft ``` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. @@ -159,8 +159,8 @@ Several cmake targets can be made available by adding components in the table be | --- | --- | --- | --- | | n/a | `raft::raft` | Only RAFT frontend API headers. Safe to expose in public APIs. | Cudatoolkit libraries, RMM | | headers | `raft::backend` | RAFT backend headers | std::mdspan, cuCollections, Thrust, NVTools | -| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::headers | -| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::headers, FAISS | +| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::backend | +| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::backend, FAISS | ### Source From 1e6270b384934bb1e751a858e6873149c7675e1a Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 14 Mar 2022 19:50:11 -0400 Subject: [PATCH 087/167] using libraft-frontend in pylibraft meta.yml --- conda/recipes/pylibraft/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index b3880d9d61..6a0f92bdbb 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -29,10 +29,10 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-runtime {{ version }} + - libraft-frontend {{ version }} - libraft-distance {{ version }} - cudatoolkit {{ cuda_version }}.* - - cuda-python >=11.5,<12.0 + - cuda-python >=11.5,<12.00. run: - python x.x - libraft-distance {{ version }} From cafe000b2fbe849ee486e70d5c0ac7232df409da Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 19:51:03 -0400 Subject: [PATCH 088/167] Updating raft_runtime -> raft_frontend --- python/raft/common/interruptible.pxd | 2 +- python/raft/dask/common/comms_utils.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/raft/common/interruptible.pxd b/python/raft/common/interruptible.pxd index 390c3ddb59..eb257d035b 100644 --- a/python/raft/common/interruptible.pxd +++ b/python/raft/common/interruptible.pxd @@ -26,7 +26,7 @@ cdef extern from "raft_frontend/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() -cdef extern from "raft_runtime/interruptible.hpp" \ +cdef extern from "raft_frontend/interruptible.hpp" \ namespace "raft::interruptible" nogil: cdef void inter_synchronize \ "raft::interruptible::synchronize"(cuda_stream_view stream) except+ diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 8928d2f295..0b3f5f6637 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -31,7 +31,7 @@ cdef extern from "nccl.h": cdef struct ncclComm ctypedef ncclComm 
*ncclComm_t -cdef extern from "raft_runtime/handle.hpp" namespace "raft": +cdef extern from "raft_frontend/handle.hpp" namespace "raft": cdef cppclass handle_t: handle_t() except + From 180898971dde746f26f1d9bbd8a11d3dd0019667 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 20:14:19 -0400 Subject: [PATCH 089/167] fixing typo --- conda/recipes/pylibraft/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 6a0f92bdbb..0df2117bd9 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -32,7 +32,7 @@ requirements: - libraft-frontend {{ version }} - libraft-distance {{ version }} - cudatoolkit {{ cuda_version }}.* - - cuda-python >=11.5,<12.00. + - cuda-python >=11.5,<12.0 run: - python x.x - libraft-distance {{ version }} From e7c86fd188d3f39c49f0d96cdc2ef7661fa096b6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 20:47:10 -0400 Subject: [PATCH 090/167] Fixing raft_client -> raft_frontend --- python/pylibraft/pylibraft/common/handle.pxd | 2 +- python/pylibraft/pylibraft/common/interruptible.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index cb203f3255..b8eedd4f49 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -30,7 +30,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_client/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_frontend/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd index c1f9ca2f00..d8ff7f57f8 100644 --- 
a/python/pylibraft/pylibraft/common/interruptible.pxd +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -22,7 +22,7 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_client/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_frontend/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() From cf339c0d260af9cc2601cb072628efdf955e67cc Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 14 Mar 2022 22:32:13 -0400 Subject: [PATCH 091/167] More replacement of raft_runtime --- cpp/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 250092b449..b40e764fb7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -177,7 +177,7 @@ if(${RAFT_STATIC_LINK_LIBRARIES}) endif() ############################################################################## -# - raft_runtime ------------------------------------------------------------- +# - raft_frontend ------------------------------------------------------------- add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) @@ -332,11 +332,11 @@ endif() install(DIRECTORY include/raft_frontend - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_frontend) # Temporary install of raft.hpp while the file is removed install(FILES include/raft_frontend/raft.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_runtime) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_frontend) ############################################################################## # - install export ----------------------------------------------------------- From 520d93d009d796d017c2c90596f733caf66c53a9 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 10:13:43 -0400 Subject: [PATCH 092/167] Adding python example to readme --- README.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 61d187a539..1633594e0b 100755 --- a/README.md +++ b/README.md @@ -77,6 +77,25 @@ auto metric = raft::distance::DistanceType::L2SqrtExpanded; raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); ``` +### Python Example + +The `pylibraft` package contains a Python API for RAFT algorithms and primitives. The package is currently limited to pairwise distances, and we will continue adding more. + +The example below demonstrates computing the pairwise Euclidean distances between cupy arrays. +```python +import cupy as cp + +from pylibraft.distance import pairwise_distance + +n_samples = 5000 +n_features = 50 + +input = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) +output = cp.empty((n_samples, n_samples), dtype=cp.float32) + +pairwise_distance(input, input, output, "euclidean") +``` + ## Installing RAFT can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. @@ -87,15 +106,15 @@ The easiest way to install RAFT is through conda and several packages are provid - `libraft-frontend` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. 
-- `pyraft` (optional) contains reusable Python tools to accelerate Python algorithm development. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives +- `pyraft` (optional) contains reusable Python infrastructure and tools to accelerate Python algorithm development. Use the following command to install RAFT with conda (use `-c rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash conda install -c rapidsai libraft-frontend libraft-nn libraft-distance pyraft pylibraft ``` -After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. +After installing RAFT, `find_package(raft COMPONENTS backend nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. ### CPM @@ -156,7 +175,7 @@ Several cmake targets can be made available by adding components in the table be | Component | Target | Description | Dependencies | | --- | --- | --- | --- | | n/a | `raft::raft` | Only RAFT frontend API headers. Safe to expose in public APIs. | Cudatoolkit libraries, RMM | -| headers | `raft::backend` | RAFT backend headers | std::mdspan, cuCollections, Thrust, NVTools | +| backend | `raft::backend` | RAFT backend headers | std::mdspan, cuCollections, Thrust, NVTools | | distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::backend | | nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::backend, FAISS | From c74b90f99a92b7a3ecfba5e4c2d8609768b21ddd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Mar 2022 11:16:21 -0400 Subject: [PATCH 093/167] Testing more distances. 
--- .../{distance_type.pxd => distance_type.pyx} | 22 ++++++++++ .../pylibraft/distance/pairwise_distance.pxd | 44 ------------------- .../pylibraft/distance/pairwise_distance.pyx | 42 ++++++++++++++++-- .../pylibraft/pylibraft/test/test_distance.py | 30 ++++++++++--- 4 files changed, 85 insertions(+), 53 deletions(-) rename python/pylibraft/pylibraft/distance/{distance_type.pxd => distance_type.pyx} (72%) delete mode 100644 python/pylibraft/pylibraft/distance/pairwise_distance.pxd diff --git a/python/pylibraft/pylibraft/distance/distance_type.pxd b/python/pylibraft/pylibraft/distance/distance_type.pyx similarity index 72% rename from python/pylibraft/pylibraft/distance/distance_type.pxd rename to python/pylibraft/pylibraft/distance/distance_type.pyx index 2c01e42e53..b419a888e8 100644 --- a/python/pylibraft/pylibraft/distance/distance_type.pxd +++ b/python/pylibraft/pylibraft/distance/distance_type.pyx @@ -38,3 +38,25 @@ cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance": RusselRaoExpanded "raft::distance::DistanceType::RusselRaoExpanded" DiceExpanded "raft::distance::DistanceType::DiceExpanded" Precomputed "raft::distance::DistanceType::Precomputed" + +DISTANCE_TYPES = { + "l2": DistanceType.L2SqrtUnexpanded, + "euclidean": DistanceType.L2SqrtUnexpanded, + "l1": DistanceType.L1, + "cityblock": DistanceType.L1, + "inner_product": DistanceType.InnerProduct, + "chebyshev": DistanceType.Linf, + "canberra": DistanceType.Canberra, + "lp": DistanceType.LpUnexpanded, + "correlation": DistanceType.CorrelationExpanded, + "jaccard": DistanceType.JaccardExpanded, + "hellinger": DistanceType.HellingerExpanded, + "braycurtis": DistanceType.BrayCurtis, + "jensenshannon": DistanceType.JensenShannon, + "hamming": DistanceType.HammingUnexpanded, + "kl_divergence": DistanceType.KLDivergence, + "russellrao": DistanceType.RusselRaoExpanded, + "dice": DistanceType.DiceExpanded +} + +SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) \ No newline at end of 
file diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd b/python/pylibraft/pylibraft/distance/pairwise_distance.pxd deleted file mode 100644 index b48af03e94..0000000000 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pxd +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from libcpp cimport bool -from pylibraft.distance.distance_type cimport DistanceType -from pylibraft.common.handle cimport handle_t - -cdef extern from "raft_distance/pairwise_distance.hpp" \ - namespace "raft::distance::runtime": - - cdef void pairwise_distance(const handle_t &handle, - float *x, - float *y, - float *dists, - int m, - int n, - int k, - DistanceType metric, - bool isRowMajor, - float metric_arg) - - cdef void pairwise_distance(const handle_t &handle, - double *x, - double *y, - double *dists, - int m, - int n, - int k, - DistanceType metric, - bool isRowMajor, - float metric_arg) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 1d1b704c23..8e4b70ba76 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -19,8 +19,39 @@ from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref from pylibraft.distance.distance_type cimport DistanceType +from pylibraft.distance.distance_type import 
DISTANCE_TYPES +from pylibraft.distance.distance_type import SUPPORTED_DISTANCES + from pylibraft.common.handle cimport handle_t -from pylibraft.distance.pairwise_distance import * + +from libcpp cimport bool +from pylibraft.distance.distance_type cimport DistanceType +from pylibraft.common.handle cimport handle_t + +cdef extern from "raft_distance/pairwise_distance.hpp" \ + namespace "raft::distance::runtime": + + cdef void pairwise_distance(const handle_t &handle, + float *x, + float *y, + float *dists, + int m, + int n, + int k, + DistanceType metric, + bool isRowMajor, + float metric_arg) + + cdef void pairwise_distance(const handle_t &handle, + double *x, + double *y, + double *dists, + int m, + int n, + int k, + DistanceType metric, + bool isRowMajor, + float metric_arg) def distance(X, Y, dists, metric="euclidean"): @@ -56,6 +87,11 @@ def distance(X, Y, dists, metric="euclidean"): y_dt = np.dtype(y_cai["typestr"]) d_dt = np.dtype(dists_cai["typestr"]) + if metric not in SUPPORTED_DISTANCES: + raise ValueError("metric %s is not supported" % metric) + + cdef DistanceType distance_type = DISTANCE_TYPES[metric] + if x_dt != y_dt or x_dt != d_dt: raise ValueError("Inputs must have the same dtypes") @@ -67,7 +103,7 @@ def distance(X, Y, dists, metric="euclidean"): m, n, k, - DistanceType.L2SqrtUnexpanded, + distance_type, True, 0.0) elif x_dt == np.float64: pairwise_distance(deref(h), @@ -77,7 +113,7 @@ def distance(X, Y, dists, metric="euclidean"): m, n, k, - DistanceType.L2SqrtUnexpanded, + distance_type, True, 0.0) else: raise ValueError("dtype %s not supported" % x_dt) diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index cc4cbab949..aaea7ea80e 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -45,19 +45,37 @@ def copy_to_host(self): .reshape(self.ndarray_.shape) -@pytest.mark.parametrize("n_rows", [10, 100, 1000]) 
-@pytest.mark.parametrize("n_cols", [10, 100, 1000]) +@pytest.mark.parametrize("n_rows", [10, 100]) +@pytest.mark.parametrize("n_cols", [10, 100]) +@pytest.mark.parametrize("metric", ["euclidean", "cityblock", "chebyshev", + "canberra", "correlation", "hamming", + "jensenshannon", "russellrao"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_distance(n_rows, n_cols, dtype): +def test_distance(n_rows, n_cols, metric, dtype): input1 = np.random.random_sample((n_rows, n_cols)).astype(dtype) + + # RussellRao expects boolean arrays + if metric == "russellrao": + input1[input1 < 0.5] = 0 + input1[input1 >= 0.5] = 1 + + # JensenShannon expects probability arrays + elif metric == "jensenshannon": + norm = np.sum(input1, axis=1) + input1 = (input1.T / norm).T + output = np.zeros((n_rows, n_rows), dtype=dtype) - expected = cdist(input1, input1, "euclidean") + expected = cdist(input1, input1, metric) + + expected[expected <= 1e-5] = 0.0 input1_device = TestDeviceBuffer(input1) output_device = TestDeviceBuffer(output) - pairwise_distance(input1_device, input1_device, output_device) + pairwise_distance(input1_device, input1_device, output_device, metric) actual = output_device.copy_to_host() - assert np.allclose(expected, actual) + actual[actual <= 1e-5] = 0.0 + + assert np.allclose(expected, actual, rtol=1e-4) From a4d1901f0ed3c60064b602a9f3105ea823bedbd8 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 11:20:55 -0400 Subject: [PATCH 094/167] Fixing style --- python/pylibraft/pylibraft/distance/distance_type.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibraft/pylibraft/distance/distance_type.pyx b/python/pylibraft/pylibraft/distance/distance_type.pyx index b419a888e8..3a6f0b30cf 100644 --- a/python/pylibraft/pylibraft/distance/distance_type.pyx +++ b/python/pylibraft/pylibraft/distance/distance_type.pyx @@ -59,4 +59,4 @@ DISTANCE_TYPES = { "dice": DistanceType.DiceExpanded } -SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) \ No newline at end of file +SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) From 2b710dc5c68284f6ffc5fcd07cf77d1720c9f958 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Mar 2022 12:13:49 -0400 Subject: [PATCH 095/167] Strange cython linking issue --- .../{distance_type.pyx => distance_type.pxd} | 22 ----------------- .../pylibraft/distance/pairwise_distance.pyx | 24 +++++++++++++++++-- 2 files changed, 22 insertions(+), 24 deletions(-) rename python/pylibraft/pylibraft/distance/{distance_type.pyx => distance_type.pxd} (72%) diff --git a/python/pylibraft/pylibraft/distance/distance_type.pyx b/python/pylibraft/pylibraft/distance/distance_type.pxd similarity index 72% rename from python/pylibraft/pylibraft/distance/distance_type.pyx rename to python/pylibraft/pylibraft/distance/distance_type.pxd index 3a6f0b30cf..2c01e42e53 100644 --- a/python/pylibraft/pylibraft/distance/distance_type.pyx +++ b/python/pylibraft/pylibraft/distance/distance_type.pxd @@ -38,25 +38,3 @@ cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance": RusselRaoExpanded "raft::distance::DistanceType::RusselRaoExpanded" DiceExpanded "raft::distance::DistanceType::DiceExpanded" Precomputed "raft::distance::DistanceType::Precomputed" - -DISTANCE_TYPES = { - "l2": DistanceType.L2SqrtUnexpanded, - "euclidean": DistanceType.L2SqrtUnexpanded, - "l1": DistanceType.L1, - 
"cityblock": DistanceType.L1, - "inner_product": DistanceType.InnerProduct, - "chebyshev": DistanceType.Linf, - "canberra": DistanceType.Canberra, - "lp": DistanceType.LpUnexpanded, - "correlation": DistanceType.CorrelationExpanded, - "jaccard": DistanceType.JaccardExpanded, - "hellinger": DistanceType.HellingerExpanded, - "braycurtis": DistanceType.BrayCurtis, - "jensenshannon": DistanceType.JensenShannon, - "hamming": DistanceType.HammingUnexpanded, - "kl_divergence": DistanceType.KLDivergence, - "russellrao": DistanceType.RusselRaoExpanded, - "dice": DistanceType.DiceExpanded -} - -SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 8e4b70ba76..80ef2d4f76 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -19,8 +19,6 @@ from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref from pylibraft.distance.distance_type cimport DistanceType -from pylibraft.distance.distance_type import DISTANCE_TYPES -from pylibraft.distance.distance_type import SUPPORTED_DISTANCES from pylibraft.common.handle cimport handle_t @@ -28,6 +26,28 @@ from libcpp cimport bool from pylibraft.distance.distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t +DISTANCE_TYPES = { + "l2": DistanceType.L2SqrtUnexpanded, + "euclidean": DistanceType.L2SqrtUnexpanded, + "l1": DistanceType.L1, + "cityblock": DistanceType.L1, + "inner_product": DistanceType.InnerProduct, + "chebyshev": DistanceType.Linf, + "canberra": DistanceType.Canberra, + "lp": DistanceType.LpUnexpanded, + "correlation": DistanceType.CorrelationExpanded, + "jaccard": DistanceType.JaccardExpanded, + "hellinger": DistanceType.HellingerExpanded, + "braycurtis": DistanceType.BrayCurtis, + "jensenshannon": DistanceType.JensenShannon, + "hamming": 
DistanceType.HammingUnexpanded, + "kl_divergence": DistanceType.KLDivergence, + "russellrao": DistanceType.RusselRaoExpanded, + "dice": DistanceType.DiceExpanded +} + +SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) + cdef extern from "raft_distance/pairwise_distance.hpp" \ namespace "raft::distance::runtime": From 4d367c0f990f83fe4b18afede96cb086c88e50ec Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Mar 2022 14:55:14 -0400 Subject: [PATCH 096/167] Updates --- .../pylibraft/distance/pairwise_distance.pyx | 49 +++++++++---------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 80ef2d4f76..6d239e5fe7 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -18,36 +18,10 @@ import numpy as np from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref -from pylibraft.distance.distance_type cimport DistanceType - -from pylibraft.common.handle cimport handle_t - from libcpp cimport bool -from pylibraft.distance.distance_type cimport DistanceType +from .distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t -DISTANCE_TYPES = { - "l2": DistanceType.L2SqrtUnexpanded, - "euclidean": DistanceType.L2SqrtUnexpanded, - "l1": DistanceType.L1, - "cityblock": DistanceType.L1, - "inner_product": DistanceType.InnerProduct, - "chebyshev": DistanceType.Linf, - "canberra": DistanceType.Canberra, - "lp": DistanceType.LpUnexpanded, - "correlation": DistanceType.CorrelationExpanded, - "jaccard": DistanceType.JaccardExpanded, - "hellinger": DistanceType.HellingerExpanded, - "braycurtis": DistanceType.BrayCurtis, - "jensenshannon": DistanceType.JensenShannon, - "hamming": DistanceType.HammingUnexpanded, - "kl_divergence": DistanceType.KLDivergence, - "russellrao": DistanceType.RusselRaoExpanded, - "dice": 
DistanceType.DiceExpanded -} - -SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) - cdef extern from "raft_distance/pairwise_distance.hpp" \ namespace "raft::distance::runtime": @@ -73,6 +47,27 @@ cdef extern from "raft_distance/pairwise_distance.hpp" \ bool isRowMajor, float metric_arg) +DISTANCE_TYPES = { + "l2": DistanceType.L2SqrtUnexpanded, + "euclidean": DistanceType.L2SqrtUnexpanded, + "l1": DistanceType.L1, + "cityblock": DistanceType.L1, + "inner_product": DistanceType.InnerProduct, + "chebyshev": DistanceType.Linf, + "canberra": DistanceType.Canberra, + "lp": DistanceType.LpUnexpanded, + "correlation": DistanceType.CorrelationExpanded, + "jaccard": DistanceType.JaccardExpanded, + "hellinger": DistanceType.HellingerExpanded, + "braycurtis": DistanceType.BrayCurtis, + "jensenshannon": DistanceType.JensenShannon, + "hamming": DistanceType.HammingUnexpanded, + "kl_divergence": DistanceType.KLDivergence, + "russellrao": DistanceType.RusselRaoExpanded, + "dice": DistanceType.DiceExpanded +} + +SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) def distance(X, Y, dists, metric="euclidean"): """ From dad6d9305c69056654ea81b52e8401d74d796c76 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 15:52:33 -0400 Subject: [PATCH 097/167] Changing frontend->public --- BUILD.md | 14 ++--- README.md | 18 +++---- ci/cpu/build.sh | 8 +-- ci/cpu/upload.sh | 2 +- conda/recipes/libraft_distance/meta.yaml | 4 +- conda/recipes/libraft_nn/meta.yaml | 4 +- .../build.sh | 0 .../meta.yaml | 4 +- conda/recipes/pyraft/meta.yaml | 4 +- cpp/CMakeLists.txt | 54 +++++++++---------- cpp/cmake/thirdparty/get_cuco.cmake | 4 +- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 +- cpp/cmake/thirdparty/get_mdspan.cmake | 4 +- cpp/cmake/thirdparty/get_thrust.cmake | 4 +- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/random/rng.cuh | 2 +- cpp/include/raft/random/rng.hpp | 2 +- .../comms/comms.hpp | 2 +- .../cudart_utils.hpp | 2 +- .../{raft_frontend => raft_public}/error.hpp | 0 .../{raft_frontend => raft_public}/handle.hpp | 12 ++--- .../interruptible.hpp | 4 +- .../linalg/cublas_macros.hpp | 2 +- .../linalg/cusolver_macros.hpp | 2 +- .../{raft_frontend => raft_public}/raft.hpp | 0 .../sparse/cusparse_macros.hpp | 2 +- cpp/test/CMakeLists.txt | 2 +- python/raft/common/handle.pxd | 2 +- python/raft/common/interruptible.pxd | 4 +- python/raft/dask/common/comms_utils.pyx | 2 +- 31 files changed, 86 insertions(+), 86 deletions(-) rename conda/recipes/{libraft_frontend => libraft_public}/build.sh (100%) rename conda/recipes/{libraft_frontend => libraft_public}/meta.yaml (95%) rename cpp/include/{raft_frontend => raft_public}/comms/comms.hpp (99%) rename cpp/include/{raft_frontend => raft_public}/cudart_utils.hpp (99%) rename cpp/include/{raft_frontend => raft_public}/error.hpp (100%) rename cpp/include/{raft_frontend => raft_public}/handle.hpp (97%) rename cpp/include/{raft_frontend => raft_public}/interruptible.hpp (99%) rename cpp/include/{raft_frontend => raft_public}/linalg/cublas_macros.hpp (99%) rename cpp/include/{raft_frontend => raft_public}/linalg/cusolver_macros.hpp (99%) rename 
cpp/include/{raft_frontend => raft_public}/raft.hpp (100%) rename cpp/include/{raft_frontend => raft_public}/sparse/cusparse_macros.hpp (99%) diff --git a/BUILD.md b/BUILD.md index ac613d4c8d..ba175dad19 100644 --- a/BUILD.md +++ b/BUILD.md @@ -136,17 +136,17 @@ py.test -s -v raft ### C++ header-only integration using cmake -The RAFT headers are broken down into two different include paths so that backend headers can be isolated between projects while frontend API headers can be installed globally, exposed to users through public APIs, and shared across projects. -- `cpp/include/raft_frontend` contains frontend API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. -- `cpp/include/raft` contains the backend of the RAFT header-only library, containing primitives, algorithms, and other tools. +The RAFT headers are broken down into two different include paths so that core headers can be isolated between projects while public API headers can be installed globally, exposed to users through public APIs, and shared across projects. +- `cpp/include/raft_public` contains public API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. +- `cpp/include/raft` contains the core of the RAFT header-only library, containing primitives, algorithms, and other tools. Use `find_package(raft)` and the `raft::raft` if using RAFT to interact only with the public APIs of consuming projects. -Use `find_package(raft COMPONENTS backend` and both the `raft::raft` and `raft::backend` targets when building a library that uses headers in `include/raft`. +Use `find_package(raft COMPONENTS core)` and both the `raft::raft` and `raft::core` targets when building a library that uses headers in `include/raft`. 
### Using pre-compiled shared libraries -Use `find_package(raft COMPONENTS backend nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::backend` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). +Use `find_package(raft COMPONENTS core nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::core` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). The pre-compiled libraries contain template specializations for commonly used types. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. @@ -158,7 +158,7 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_frontend` headers and `raft::backend` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. 
+RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_public` headers and `raft::core` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: @@ -188,7 +188,7 @@ function(find_and_configure_raft) # Add components #----------------------------------------------------- - string(APPEND RAFT_COMPONENTS "backend") + string(APPEND RAFT_COMPONENTS "core") if(PKG_USE_NN_LIBRARY) string(APPEND RAFT_COMPONENTS " nn") endif() diff --git a/README.md b/README.md index fd6db01ae4..91644a8559 100755 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Most of the primitives in RAFT accept a `raft::handle_t` object for the manageme The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing pairwise Euclidean distances: ```c++ -#include +#include #include #include #include @@ -87,14 +87,14 @@ RAFT can be installed through conda, cmake-package-manager (cpm), or by building ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-frontend` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. 
+- `libraft-public-headers` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. -- `pyraft` (optional) contains reusable Python tools to accelerate Python algorithm development. +- `pyraft` (optional) contains reusable Python infrastructure and tools to accelerate Python algorithm development. To install RAFT with conda (change to `rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-frontend libraft-nn libraft-distance pyraft +conda install -c rapidsai libraft-public libraft-nn libraft-distance pyraft ``` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. @@ -110,7 +110,7 @@ After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids set(RAFT_VERSION "22.04") set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") -set(RAFT_COMPONENTS "backend") +set(RAFT_COMPONENTS "core") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC @@ -157,10 +157,10 @@ Several cmake targets can be made available by adding components in the table be | Component | Target | Description | Dependencies | | --- | --- | --- | --- | -| n/a | `raft::raft` | Only RAFT frontend API headers. Safe to expose in public APIs. 
| Cudatoolkit libraries, RMM | -| headers | `raft::backend` | RAFT backend headers | std::mdspan, cuCollections, Thrust, NVTools | -| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::backend | -| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::backend, FAISS | +| n/a | `raft::raft` | Only RAFT public API headers. These are very lightweight and safe to expose in public APIs. | Cudatoolkit libraries, RMM | +| core | `raft::core` | RAFT core headers | std::mdspan, cuCollections, Thrust, NVTools | +| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::core | +| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::core, FAISS | ### Source diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 6c18d24f1b..93b8e320c5 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -87,14 +87,14 @@ gpuci_mamba_retry install -c conda-forge boa if [ "$BUILD_LIBRAFT" == '1' ]; then gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-client-api" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_frontend + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_public gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance else - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_frontend + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_public gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" - mkdir -p ${CONDA_BLD_DIR}/libraft_frontend/work - mv ${CONDA_BLD_DIR}/work 
${CONDA_BLD_DIR}/libraft_frontend/work + mkdir -p ${CONDA_BLD_DIR}/libraft_public/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_public/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 0829971962..51ed2a6054 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -30,7 +30,7 @@ fi gpuci_logger "Get conda file output locations" -export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_frontend --output` +export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_public --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index ef4aaa5eb9..cda122b24c 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-frontend {{ version }} + - libraft-public-headers {{ version }} - nccl>=2.9.9 - cudatoolkit {{ cuda_version }}.* - ucx-py {{ ucx_py_version }} @@ -47,7 +47,7 @@ requirements: - gmock - librmm {{ minor_version }} run: - - libraft-frontend {{ version }} + - libraft-public-headers {{ version }} - nccl>=2.9.9 - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 1baf839dab..66814cd175 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - 
libraft-frontend {{ version }} + - libraft-public-headers {{ version }} - cudatoolkit {{ cuda_version }}.* - lapack - faiss-proc=*=cuda @@ -48,7 +48,7 @@ requirements: - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - libraft-frontend {{ version }} + - libraft-public {{ version }} - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - libcusolver>=11.2.1 diff --git a/conda/recipes/libraft_frontend/build.sh b/conda/recipes/libraft_public/build.sh similarity index 100% rename from conda/recipes/libraft_frontend/build.sh rename to conda/recipes/libraft_public/build.sh diff --git a/conda/recipes/libraft_frontend/meta.yaml b/conda/recipes/libraft_public/meta.yaml similarity index 95% rename from conda/recipes/libraft_frontend/meta.yaml rename to conda/recipes/libraft_public/meta.yaml index fff8692173..65b3844a09 100644 --- a/conda/recipes/libraft_frontend/meta.yaml +++ b/conda/recipes/libraft_public/meta.yaml @@ -8,7 +8,7 @@ {% set cuda_major=cuda_version.split('.')[0] %} {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: libraft-frontend + name: libraft-public-headers version: {{ version }} source: @@ -58,4 +58,4 @@ about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: libraft-frontend library + summary: libraft-public-headers library diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml index 5ff0477479..54c48b6be4 100644 --- a/conda/recipes/pyraft/meta.yaml +++ b/conda/recipes/pyraft/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-frontend {{ version }} + - libraft-public-headers {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 - nccl>=2.9.9 @@ -39,7 +39,7 @@ requirements: run: - python x.x - dask-cuda {{ minor_version }} - - libraft-frontend {{ version }} + - libraft-public-headers {{ version }} - nccl>=2.9.9 - rmm {{ minor_version }} - ucx-py {{ 
ucx_py_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b40e764fb7..94336e52cd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -68,7 +68,7 @@ message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${NVTX}") message(VERBOSE "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") -list(APPEND raft_FIND_COMPONENTS "backend") +list(APPEND raft_FIND_COMPONENTS "core") # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") @@ -139,26 +139,26 @@ set(RAFT_LINK_LIBS CUDA::cusparse rmm::rmm) -add_library(raft_backend INTERFACE) -if(TARGET raft_backend AND (NOT TARGET raft::backend)) - add_library(raft::backend ALIAS raft_backend) +add_library(raft_core INTERFACE) +if(TARGET raft_core AND (NOT TARGET raft::core)) + add_library(raft::core ALIAS raft_core) endif() -set_target_properties(raft_backend PROPERTIES EXPORT_NAME backend) +set_target_properties(raft_core PROPERTIES EXPORT_NAME core) -target_include_directories(raft_backend INTERFACE +target_include_directories(raft_core INTERFACE "$" "$") -target_link_libraries(raft_backend INTERFACE +target_link_libraries(raft_core INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> ${RAFT_LINK_LIBS} cuco::cuco std::mdspan) -target_compile_definitions(raft_backend INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft_backend INTERFACE cxx_std_17 $) +target_compile_definitions(raft_core INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft_core INTERFACE cxx_std_17 $) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" @@ -177,7 +177,7 @@ if(${RAFT_STATIC_LINK_LIBRARIES}) endif() ############################################################################## -# - raft_frontend ------------------------------------------------------------- +# - raft_public 
------------------------------------------------------------- add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) @@ -236,7 +236,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) ) set_target_properties(raft_distance_lib PROPERTIES OUTPUT_NAME raft_distance) - target_link_libraries(raft_distance_lib PRIVATE raft::backend) + target_link_libraries(raft_distance_lib PRIVATE raft::core) target_compile_options(raft_distance_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -280,7 +280,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) ) set_target_properties(raft_nn_lib PROPERTIES OUTPUT_NAME raft_nn) - target_link_libraries(raft_nn_lib PRIVATE raft::backend faiss::faiss) + target_link_libraries(raft_nn_lib PRIVATE raft::core faiss::faiss) target_compile_options(raft_nn_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -307,9 +307,9 @@ install(TARGETS raft DESTINATION ${lib_dir} EXPORT raft-exports) -install(TARGETS raft_backend +install(TARGETS raft_core DESTINATION ${lib_dir} - EXPORT raft-backend-exports) + EXPORT raft-core-exports) install(TARGETS raft_distance DESTINATION ${lib_dir} @@ -331,12 +331,12 @@ if(TARGET raft_nn_lib) endif() -install(DIRECTORY include/raft_frontend - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_frontend) +install(DIRECTORY include/raft_public + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_public) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft_frontend/raft.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_frontend) +install(FILES include/raft_public/raft.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_public) ############################################################################## # - install export ----------------------------------------------------------- @@ -352,7 +352,7 @@ Optional Components: - distance Imported Targets: - - raft::backend + - raft::core - raft::raft - raft::nn brought in by the `nn` optional component - 
raft::distance brought in by the `distance` optional component @@ -364,18 +364,18 @@ set(code_string if((distance IN_LIST raft_FIND_COMPONENTS OR nn IN_LIST raft_FIND_COMPONENTS) AND NOT - backend IN_LIST raft_FIND_COMPONENTS)) - FATAL_ERROR("backend must be included to use components ${raft_FIND_COMPONENTS}") + core IN_LIST raft_FIND_COMPONENTS)) + FATAL_ERROR("core must be included to use components ${raft_FIND_COMPONENTS}") endif() -if(backend IN_LIST raft_FIND_COMPONENTS) +if(core IN_LIST raft_FIND_COMPONENTS) if(NOT TARGET raft::Thrust) thrust_create_target(raft::Thrust FROM_OPTIONS) endif() endif() if(distance IN_LIST raft_FIND_COMPONENTS OR - backend IN_LIST raft_FIND_COMPONENTS) + core IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -394,9 +394,9 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export(INSTALL raft - COMPONENTS backend nn distance + COMPONENTS core nn distance EXPORT_SET raft-exports - GLOBAL_TARGETS backend nn distance + GLOBAL_TARGETS core nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) @@ -405,8 +405,8 @@ raft_export(INSTALL raft # - build export ------------------------------------------------------------- raft_export(BUILD raft EXPORT_SET raft-exports - COMPONENTS backend nn distance - GLOBAL_TARGETS raft_backend raft_distance raft_nn + COMPONENTS core nn distance + GLOBAL_TARGETS raft_core raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 5002213134..f7937923f5 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -18,8 +18,8 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-backend-exports - INSTALL_EXPORT_SET raft-backend-exports + BUILD_EXPORT_SET 
raft-core-exports + INSTALL_EXPORT_SET raft-core-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 00dce165ea..9929fcc25d 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -17,7 +17,7 @@ function(find_and_configure_gtest ) include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest(BUILD_EXPORT_SET raft-backend-exports + rapids_cpm_gtest(BUILD_EXPORT_SET raft-core-exports EXCLUDE_FROM_ALL TRUE) if(GTest_ADDED) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 3345fe66f5..8cdb4ce2ce 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,8 +16,8 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-backend-exports - INSTALL_EXPORT_SET raft-backend-exports + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-core-exports + INSTALL_EXPORT_SET raft-core-exports EXCLUDE_FROM_ALL TRUE) endfunction() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index cf94bf5fd7..ab0a596bbb 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -16,8 +16,8 @@ function(find_and_configure_mdspan VERSION) rapids_cpm_find( mdspan ${VERSION} GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-backend-exports - INSTALL_EXPORT_SET raft-backend-exports + BUILD_EXPORT_SET raft-core-exports + INSTALL_EXPORT_SET raft-core-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/rapidsai/mdspan.git diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index acad2500ca..f719b53d2e 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake 
@@ -17,8 +17,8 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) rapids_cpm_thrust( NAMESPACE raft ) - rapids_export_package(BUILD thrust raft-backend-exports) - rapids_export_package(INSTALL thrust raft-backend-exports) + rapids_export_package(BUILD thrust raft-core-exports) + rapids_export_package(INSTALL thrust raft-core-exports) endfunction() diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index d20707f0cb..2ad20ba9d6 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,7 +25,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 3e75b2ae74..cd066c41a9 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -83,7 +83,7 @@ class Rng : public detail::RngImpl { /** * @brief ctor * @param _s 64b seed used to initialize the RNG - * @param _t backend device RNG generator type + * @param _t core device RNG generator type * @note Refer to the `Rng::seed` method for details about seeding the engine */ Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl(_s, _t) {} diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index 2d1af6a97e..450feb563a 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -87,7 +87,7 @@ class Rng : public detail::RngImpl { /** * @brief ctor * @param _s 64b seed used to initialize the RNG - * @param _t backend device RNG generator type + * @param _t core device RNG generator type * @note Refer to the `Rng::seed` method for details about seeding the engine */ Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl(_s, _t) {} diff --git a/cpp/include/raft_frontend/comms/comms.hpp b/cpp/include/raft_public/comms/comms.hpp similarity index 99% rename from cpp/include/raft_frontend/comms/comms.hpp rename to 
cpp/include/raft_public/comms/comms.hpp index a08ef37fc6..fa23c0128f 100644 --- a/cpp/include/raft_frontend/comms/comms.hpp +++ b/cpp/include/raft_public/comms/comms.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_frontend/cudart_utils.hpp b/cpp/include/raft_public/cudart_utils.hpp similarity index 99% rename from cpp/include/raft_frontend/cudart_utils.hpp rename to cpp/include/raft_public/cudart_utils.hpp index 7479468626..0bbcaf5e13 100644 --- a/cpp/include/raft_frontend/cudart_utils.hpp +++ b/cpp/include/raft_public/cudart_utils.hpp @@ -19,7 +19,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft_frontend/error.hpp b/cpp/include/raft_public/error.hpp similarity index 100% rename from cpp/include/raft_frontend/error.hpp rename to cpp/include/raft_public/error.hpp diff --git a/cpp/include/raft_frontend/handle.hpp b/cpp/include/raft_public/handle.hpp similarity index 97% rename from cpp/include/raft_frontend/handle.hpp rename to cpp/include/raft_public/handle.hpp index 58522e5b88..448e42504c 100644 --- a/cpp/include/raft_frontend/handle.hpp +++ b/cpp/include/raft_public/handle.hpp @@ -35,13 +35,13 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include +#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include diff --git a/cpp/include/raft_frontend/interruptible.hpp b/cpp/include/raft_public/interruptible.hpp similarity index 99% rename from cpp/include/raft_frontend/interruptible.hpp rename to cpp/include/raft_public/interruptible.hpp index b5a9691496..194f6f1fd1 100644 --- a/cpp/include/raft_frontend/interruptible.hpp +++ b/cpp/include/raft_public/interruptible.hpp @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft_frontend/linalg/cublas_macros.hpp 
b/cpp/include/raft_public/linalg/cublas_macros.hpp similarity index 99% rename from cpp/include/raft_frontend/linalg/cublas_macros.hpp rename to cpp/include/raft_public/linalg/cublas_macros.hpp index 28722b18a7..f654e0b27e 100644 --- a/cpp/include/raft_frontend/linalg/cublas_macros.hpp +++ b/cpp/include/raft_public/linalg/cublas_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once we have logger enabled //#include diff --git a/cpp/include/raft_frontend/linalg/cusolver_macros.hpp b/cpp/include/raft_public/linalg/cusolver_macros.hpp similarity index 99% rename from cpp/include/raft_frontend/linalg/cusolver_macros.hpp rename to cpp/include/raft_public/linalg/cusolver_macros.hpp index 92151be237..f4e8911983 100644 --- a/cpp/include/raft_frontend/linalg/cusolver_macros.hpp +++ b/cpp/include/raft_public/linalg/cusolver_macros.hpp @@ -23,7 +23,7 @@ #include ///@todo: enable this once logging is enabled //#include -#include +#include #include #define _CUSOLVER_ERR_TO_STR(err) \ diff --git a/cpp/include/raft_frontend/raft.hpp b/cpp/include/raft_public/raft.hpp similarity index 100% rename from cpp/include/raft_frontend/raft.hpp rename to cpp/include/raft_public/raft.hpp diff --git a/cpp/include/raft_frontend/sparse/cusparse_macros.hpp b/cpp/include/raft_public/sparse/cusparse_macros.hpp similarity index 99% rename from cpp/include/raft_frontend/sparse/cusparse_macros.hpp rename to cpp/include/raft_public/sparse/cusparse_macros.hpp index e0d7af33bc..34643129a0 100644 --- a/cpp/include/raft_frontend/sparse/cusparse_macros.hpp +++ b/cpp/include/raft_public/sparse/cusparse_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once logging is enabled //#include diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f0f04542b7..05bad2844d 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -164,7 +164,7 @@ target_include_directories(test_raft 
target_link_libraries(test_raft PRIVATE raft::raft - raft::backend + raft::core raft::distance raft::nn NCCL::NCCL diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index b3b12e9f9e..08cf1de499 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -31,7 +31,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_frontend/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_public/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/raft/common/interruptible.pxd b/python/raft/common/interruptible.pxd index eb257d035b..1ba2df95a4 100644 --- a/python/raft/common/interruptible.pxd +++ b/python/raft/common/interruptible.pxd @@ -22,11 +22,11 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_frontend/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_public/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() -cdef extern from "raft_frontend/interruptible.hpp" \ +cdef extern from "raft_public/interruptible.hpp" \ namespace "raft::interruptible" nogil: cdef void inter_synchronize \ "raft::interruptible::synchronize"(cuda_stream_view stream) except+ diff --git a/python/raft/dask/common/comms_utils.pyx b/python/raft/dask/common/comms_utils.pyx index 0b3f5f6637..5b6bedabfe 100644 --- a/python/raft/dask/common/comms_utils.pyx +++ b/python/raft/dask/common/comms_utils.pyx @@ -31,7 +31,7 @@ cdef extern from "nccl.h": cdef struct ncclComm ctypedef ncclComm *ncclComm_t -cdef extern from "raft_frontend/handle.hpp" namespace "raft": +cdef extern from "raft_public/handle.hpp" namespace "raft": cdef cppclass handle_t: handle_t() except + From afb80249dfcdf378073932d3cf812844fd941bd2 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 15:55:50 -0400 Subject: [PATCH 098/167] Updating raft_frontend->raft_public --- python/pylibraft/pylibraft/common/handle.pxd | 2 +- python/pylibraft/pylibraft/common/interruptible.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index b8eedd4f49..72b0d988f0 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -30,7 +30,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_frontend/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_public/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd index d8ff7f57f8..dde9add11d 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pxd +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -22,7 +22,7 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_frontend/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_public/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() From b640d4f4025b7ffb70bc65bf232249c3446e013e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 17:37:39 -0400 Subject: [PATCH 099/167] libraft_public -> libraft_core --- BUILD.md | 4 +- README.md | 14 ++--- ci/cpu/build.sh | 8 +-- ci/cpu/upload.sh | 2 +- .../{libraft_public => libraft_core}/build.sh | 0 .../meta.yaml | 4 +- conda/recipes/libraft_distance/meta.yaml | 4 +- conda/recipes/libraft_nn/meta.yaml | 4 +- conda/recipes/pylibraft/meta.yaml | 3 +- conda/recipes/pyraft/meta.yaml | 4 +- cpp/CMakeLists.txt | 61 ++++++++++--------- cpp/cmake/thirdparty/get_cuco.cmake | 4 +- cpp/cmake/thirdparty/get_gtest.cmake | 2 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 +- cpp/cmake/thirdparty/get_mdspan.cmake | 4 +- cpp/include/raft/comms/comms.hpp | 2 +- .../comms/comms.hpp | 2 +- .../cudart_utils.hpp | 2 +- .../{raft_public => raft_core}/error.hpp | 0 .../{raft_public => raft_core}/handle.hpp | 12 ++-- .../interruptible.hpp | 4 +- .../linalg/cublas_macros.hpp | 2 +- .../linalg/cusolver_macros.hpp | 2 +- .../{raft_public => raft_core}/raft.hpp | 0 .../sparse/cusparse_macros.hpp | 2 +- cpp/test/CMakeLists.txt | 1 - python/pylibraft/pylibraft/common/handle.pxd | 2 +- .../pylibraft/common/interruptible.pxd | 2 +- python/raft/raft/common/handle.pxd | 2 +- python/raft/raft/common/interruptible.pxd | 4 +- python/raft/raft/dask/common/comms_utils.pyx | 2 +- 31 files changed, 82 insertions(+), 81 deletions(-) rename conda/recipes/{libraft_public => libraft_core}/build.sh (100%) rename conda/recipes/{libraft_public => libraft_core}/meta.yaml (95%) rename cpp/include/{raft_public => raft_core}/comms/comms.hpp (99%) rename cpp/include/{raft_public => raft_core}/cudart_utils.hpp (99%) rename cpp/include/{raft_public => raft_core}/error.hpp (100%) rename cpp/include/{raft_public => raft_core}/handle.hpp (97%) rename cpp/include/{raft_public => raft_core}/interruptible.hpp (99%) rename cpp/include/{raft_public => raft_core}/linalg/cublas_macros.hpp (99%) rename cpp/include/{raft_public => raft_core}/linalg/cusolver_macros.hpp (99%) rename 
cpp/include/{raft_public => raft_core}/raft.hpp (100%) rename cpp/include/{raft_public => raft_core}/sparse/cusparse_macros.hpp (99%) diff --git a/BUILD.md b/BUILD.md index 6480f8fb8d..1c2c741f97 100644 --- a/BUILD.md +++ b/BUILD.md @@ -138,7 +138,7 @@ py.test -s -v raft ### C++ header-only integration using cmake The RAFT headers are broken down into two different include paths so that core headers can be isolated between projects while public API headers can be installed globally, exposed to users through public APIs, and shared across projects. -- `cpp/include/raft_public` contains public API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. +- `cpp/include/raft_core` contains public API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. - `cpp/include/raft` contains the core of the RAFT header-only library, containing primitives, algorithms, and other tools. Use `find_package(raft)` and the `raft::raft` if using RAFT to interact only with the public APIs of consuming projects. @@ -159,7 +159,7 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_public` headers and `raft::core` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. 
+RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_core` headers and `raft::core` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: diff --git a/README.md b/README.md index 3563d5e28c..69761ab988 100755 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Most of the primitives in RAFT accept a `raft::handle_t` object for the manageme The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing pairwise Euclidean distances: ```c++ -#include +#include #include #include #include @@ -103,7 +103,7 @@ RAFT can be installed through conda, [Cmake Package Manager (CPM)](https://githu ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-public-headers` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. +- `libraft-core` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. 
The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives @@ -111,7 +111,7 @@ The easiest way to install RAFT is through conda and several packages are provid Use the following command to install RAFT with conda (use `-c rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-public-headers libraft-nn libraft-distance pyraft pylibraft +conda install -c rapidsai libraft-core libraft-nn libraft-distance pyraft pylibraft ``` After installing RAFT, `find_package(raft COMPONENTS backend nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. @@ -174,10 +174,10 @@ Several cmake targets can be made available by adding components in the table be | Component | Target | Description | Dependencies | | --- | --- | --- | --- | -| n/a | `raft::raft` | Only RAFT public API headers. These are very lightweight and safe to expose in public APIs. | Cudatoolkit libraries, RMM | -| core | `raft::core` | RAFT core headers | std::mdspan, cuCollections, Thrust, NVTools | -| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::core | -| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::core, FAISS | +| n/a | `raft::core` | Only RAFT core headers. These are very lightweight and safe to expose in public APIs. 
| Cudatoolkit libraries, RMM | +| core | `raft::raft` | Full RAFT header library | std::mdspan, cuCollections, Thrust, NVTools | +| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::raft | +| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::raft, FAISS | ### Source diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 34226aa12a..0c2584b936 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -87,14 +87,14 @@ gpuci_mamba_retry install -c conda-forge boa if [ "$BUILD_LIBRAFT" == '1' ]; then gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-client-api" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_public + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_core gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance else - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_public + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_core gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" - mkdir -p ${CONDA_BLD_DIR}/libraft_public/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_public/work + mkdir -p ${CONDA_BLD_DIR}/libraft_core/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_core/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 10000347a1..01ad7aebe7 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -30,7 +30,7 @@ fi gpuci_logger "Get conda file output 
locations" -export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_public --output` +export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_core --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` diff --git a/conda/recipes/libraft_public/build.sh b/conda/recipes/libraft_core/build.sh similarity index 100% rename from conda/recipes/libraft_public/build.sh rename to conda/recipes/libraft_core/build.sh diff --git a/conda/recipes/libraft_public/meta.yaml b/conda/recipes/libraft_core/meta.yaml similarity index 95% rename from conda/recipes/libraft_public/meta.yaml rename to conda/recipes/libraft_core/meta.yaml index 65b3844a09..6a416b3ea2 100644 --- a/conda/recipes/libraft_public/meta.yaml +++ b/conda/recipes/libraft_core/meta.yaml @@ -8,7 +8,7 @@ {% set cuda_major=cuda_version.split('.')[0] %} {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: libraft-public-headers + name: libraft-core version: {{ version }} source: @@ -58,4 +58,4 @@ about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: libraft-public-headers library + summary: libraft-core library diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index cda122b24c..30e8ae54bb 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-public-headers {{ version }} + - libraft-core {{ version }} - nccl>=2.9.9 - cudatoolkit {{ cuda_version }}.* - ucx-py {{ ucx_py_version }} @@ -47,7 +47,7 @@ requirements: - gmock - librmm {{ minor_version }} run: - - 
libraft-public-headers {{ version }} + - libraft-core {{ version }} - nccl>=2.9.9 - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 66814cd175..03a24a025a 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-public-headers {{ version }} + - libraft-core {{ version }} - cudatoolkit {{ cuda_version }}.* - lapack - faiss-proc=*=cuda @@ -48,7 +48,7 @@ requirements: - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - libraft-public {{ version }} + - libraft-core {{ version }} - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - libcusolver>=11.2.1 diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 0df2117bd9..a56c08ce80 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -29,12 +29,13 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-frontend {{ version }} + - libraft-core {{ version }} - libraft-distance {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 run: - python x.x + - libraft-core {{ version }} - libraft-distance {{ version }} - cuda-python >=11.5,<12.0 - joblib >=0.11 diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml index 54c48b6be4..e8b9c60e6e 100644 --- a/conda/recipes/pyraft/meta.yaml +++ b/conda/recipes/pyraft/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-public-headers {{ version }} + - libraft-core {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 - nccl>=2.9.9 @@ -39,7 +39,7 @@ requirements: run: - python x.x - dask-cuda {{ minor_version }} - - libraft-public-headers {{ version }} + - libraft-core {{ version }} - nccl>=2.9.9 - rmm {{ minor_version }} - ucx-py 
{{ ucx_py_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e32b111503..ebdcaaf3a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -101,8 +101,8 @@ endif() # * enable the CMake CUDA language # * set other CUDA compilation flags rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) + BUILD_EXPORT_SET raft-core-exports + INSTALL_EXPORT_SET raft-core-exports) include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## @@ -139,26 +139,22 @@ set(RAFT_LINK_LIBS CUDA::cusparse rmm::rmm) -add_library(raft_core INTERFACE) -if(TARGET raft_core AND (NOT TARGET raft::core)) - add_library(raft::core ALIAS raft_core) -endif() - -set_target_properties(raft_core PROPERTIES EXPORT_NAME core) +add_library(raft INTERFACE) +add_library(raft::raft ALIAS raft) -target_include_directories(raft_core INTERFACE +target_include_directories(raft INTERFACE "$" "$") -target_link_libraries(raft_core INTERFACE +target_link_libraries(raft INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> ${RAFT_LINK_LIBS} cuco::cuco std::mdspan) -target_compile_definitions(raft_core INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft_core INTERFACE cxx_std_17 $) +target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) +target_compile_features(raft INTERFACE cxx_std_17 $) if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" @@ -177,16 +173,20 @@ if(${RAFT_STATIC_LINK_LIBRARIES}) endif() ############################################################################## -# - raft_public ------------------------------------------------------------- +# - raft_core ------------------------------------------------------------- -add_library(raft INTERFACE) -add_library(raft::raft ALIAS raft) +add_library(raft_core INTERFACE) +if(TARGET raft_core AND (NOT TARGET raft::core)) + 
add_library(raft::core ALIAS raft) +endif() -target_include_directories(raft INTERFACE +target_include_directories(raft_core INTERFACE "$" "$" ) -target_link_libraries(raft INTERFACE ${RAFT_LINK_LIBS}) +target_link_libraries(raft_core INTERFACE ${RAFT_LINK_LIBS}) + +set_target_properties(raft_core PROPERTIES EXPORT_NAME core) target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) target_compile_features(raft INTERFACE cxx_std_17 $) @@ -237,7 +237,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) ) set_target_properties(raft_distance_lib PROPERTIES OUTPUT_NAME raft_distance) - target_link_libraries(raft_distance_lib PRIVATE raft::core) + target_link_libraries(raft_distance_lib PRIVATE raft::raft) target_compile_options(raft_distance_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -251,7 +251,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() target_link_libraries(raft_distance INTERFACE - raft::raft + raft::core $ $ ) @@ -281,7 +281,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) ) set_target_properties(raft_nn_lib PROPERTIES OUTPUT_NAME raft_nn) - target_link_libraries(raft_nn_lib PRIVATE raft::core faiss::faiss) + target_link_libraries(raft_nn_lib PRIVATE raft::raft faiss::faiss) target_compile_options(raft_nn_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" @@ -294,7 +294,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) endif() -target_link_libraries(raft_nn INTERFACE raft::raft faiss::faiss +target_link_libraries(raft_nn INTERFACE raft::core faiss::faiss $ $) @@ -304,14 +304,14 @@ rapids_cmake_install_lib_dir( lib_dir ) include(GNUInstallDirs) include(CPack) -install(TARGETS raft - DESTINATION ${lib_dir} - EXPORT raft-exports) - install(TARGETS raft_core DESTINATION ${lib_dir} EXPORT raft-core-exports) +install(TARGETS raft + DESTINATION ${lib_dir} + EXPORT raft-exports) + install(TARGETS raft_distance DESTINATION ${lib_dir} EXPORT raft-distance-exports) @@ -332,12 +332,12 @@ 
if(TARGET raft_nn_lib) endif() -install(DIRECTORY include/raft_public - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_public) +install(DIRECTORY include/raft_core + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_core) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft_public/raft.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_public) +install(FILES include/raft_core/raft.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_core) ############################################################################## # - install export ----------------------------------------------------------- @@ -349,12 +349,13 @@ RAFT (Reusable Analytics Functions and other Tools) contains fundamental widely-used algorithms and primitives for data science, graph, and ml. Optional Components: + - core - nn - distance Imported Targets: - - raft::core - raft::raft + - raft::core brought in by the `core` optional component - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index f7937923f5..da733d0ef1 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -18,8 +18,8 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-core-exports - INSTALL_EXPORT_SET raft-core-exports + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 9929fcc25d..04da801b79 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -17,7 +17,7 @@ function(find_and_configure_gtest ) include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest(BUILD_EXPORT_SET raft-core-exports + 
rapids_cpm_gtest(BUILD_EXPORT_SET raft-exports EXCLUDE_FROM_ALL TRUE) if(GTest_ADDED) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 8cdb4ce2ce..4333ba3fcd 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,8 +16,8 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-core-exports - INSTALL_EXPORT_SET raft-core-exports + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports EXCLUDE_FROM_ALL TRUE) endfunction() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index ab0a596bbb..03fafd4577 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -16,8 +16,8 @@ function(find_and_configure_mdspan VERSION) rapids_cpm_find( mdspan ${VERSION} GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-core-exports - INSTALL_EXPORT_SET raft-core-exports + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/rapidsai/mdspan.git diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 1658763e56..3049f4f23e 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,7 +25,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_public/comms/comms.hpp b/cpp/include/raft_core/comms/comms.hpp similarity index 99% rename from cpp/include/raft_public/comms/comms.hpp rename to cpp/include/raft_core/comms/comms.hpp index fa23c0128f..e2ddb6250b 100644 --- a/cpp/include/raft_public/comms/comms.hpp +++ b/cpp/include/raft_core/comms/comms.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_public/cudart_utils.hpp 
b/cpp/include/raft_core/cudart_utils.hpp similarity index 99% rename from cpp/include/raft_public/cudart_utils.hpp rename to cpp/include/raft_core/cudart_utils.hpp index 0bbcaf5e13..72cf5f0317 100644 --- a/cpp/include/raft_public/cudart_utils.hpp +++ b/cpp/include/raft_core/cudart_utils.hpp @@ -19,7 +19,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft_public/error.hpp b/cpp/include/raft_core/error.hpp similarity index 100% rename from cpp/include/raft_public/error.hpp rename to cpp/include/raft_core/error.hpp diff --git a/cpp/include/raft_public/handle.hpp b/cpp/include/raft_core/handle.hpp similarity index 97% rename from cpp/include/raft_public/handle.hpp rename to cpp/include/raft_core/handle.hpp index 448e42504c..dc31f6e732 100644 --- a/cpp/include/raft_public/handle.hpp +++ b/cpp/include/raft_core/handle.hpp @@ -35,13 +35,13 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include +#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include diff --git a/cpp/include/raft_public/interruptible.hpp b/cpp/include/raft_core/interruptible.hpp similarity index 99% rename from cpp/include/raft_public/interruptible.hpp rename to cpp/include/raft_core/interruptible.hpp index 194f6f1fd1..454d8c4a6b 100644 --- a/cpp/include/raft_public/interruptible.hpp +++ b/cpp/include/raft_core/interruptible.hpp @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/include/raft_public/linalg/cublas_macros.hpp b/cpp/include/raft_core/linalg/cublas_macros.hpp similarity index 99% rename from cpp/include/raft_public/linalg/cublas_macros.hpp rename to cpp/include/raft_core/linalg/cublas_macros.hpp index f654e0b27e..5ca93fc185 100644 --- a/cpp/include/raft_public/linalg/cublas_macros.hpp +++ b/cpp/include/raft_core/linalg/cublas_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include 
+#include ///@todo: enable this once we have logger enabled //#include diff --git a/cpp/include/raft_public/linalg/cusolver_macros.hpp b/cpp/include/raft_core/linalg/cusolver_macros.hpp similarity index 99% rename from cpp/include/raft_public/linalg/cusolver_macros.hpp rename to cpp/include/raft_core/linalg/cusolver_macros.hpp index f4e8911983..ae7cad31d0 100644 --- a/cpp/include/raft_public/linalg/cusolver_macros.hpp +++ b/cpp/include/raft_core/linalg/cusolver_macros.hpp @@ -23,7 +23,7 @@ #include ///@todo: enable this once logging is enabled //#include -#include +#include #include #define _CUSOLVER_ERR_TO_STR(err) \ diff --git a/cpp/include/raft_public/raft.hpp b/cpp/include/raft_core/raft.hpp similarity index 100% rename from cpp/include/raft_public/raft.hpp rename to cpp/include/raft_core/raft.hpp diff --git a/cpp/include/raft_public/sparse/cusparse_macros.hpp b/cpp/include/raft_core/sparse/cusparse_macros.hpp similarity index 99% rename from cpp/include/raft_public/sparse/cusparse_macros.hpp rename to cpp/include/raft_core/sparse/cusparse_macros.hpp index 34643129a0..bc0aa374d6 100644 --- a/cpp/include/raft_public/sparse/cusparse_macros.hpp +++ b/cpp/include/raft_core/sparse/cusparse_macros.hpp @@ -20,7 +20,7 @@ #pragma once #include -#include +#include ///@todo: enable this once logging is enabled //#include diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 05bad2844d..71357829ca 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -164,7 +164,6 @@ target_include_directories(test_raft target_link_libraries(test_raft PRIVATE raft::raft - raft::core raft::distance raft::nn NCCL::NCCL diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index 72b0d988f0..d4009da0fb 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -30,7 +30,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef 
extern from "raft_public/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_core/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd index dde9add11d..3eda29c602 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pxd +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -22,7 +22,7 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_public/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_core/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() diff --git a/python/raft/raft/common/handle.pxd b/python/raft/raft/common/handle.pxd index 08cf1de499..1bbb57f1c7 100644 --- a/python/raft/raft/common/handle.pxd +++ b/python/raft/raft/common/handle.pxd @@ -31,7 +31,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_public/handle.hpp" namespace "raft" nogil: +cdef extern from "raft_core/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/raft/raft/common/interruptible.pxd b/python/raft/raft/common/interruptible.pxd index 1ba2df95a4..2b937e7299 100644 --- a/python/raft/raft/common/interruptible.pxd +++ b/python/raft/raft/common/interruptible.pxd @@ -22,11 +22,11 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_public/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft_core/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() -cdef extern from "raft_public/interruptible.hpp" \ +cdef extern from "raft_core/interruptible.hpp" \ namespace "raft::interruptible" nogil: cdef void 
inter_synchronize \ "raft::interruptible::synchronize"(cuda_stream_view stream) except+ diff --git a/python/raft/raft/dask/common/comms_utils.pyx b/python/raft/raft/dask/common/comms_utils.pyx index 5b6bedabfe..7f805a9e9c 100644 --- a/python/raft/raft/dask/common/comms_utils.pyx +++ b/python/raft/raft/dask/common/comms_utils.pyx @@ -31,7 +31,7 @@ cdef extern from "nccl.h": cdef struct ncclComm ctypedef ncclComm *ncclComm_t -cdef extern from "raft_public/handle.hpp" namespace "raft": +cdef extern from "raft_core/handle.hpp" namespace "raft": cdef cppclass handle_t: handle_t() except + From 66fbe3e4f0934863943512eaa84000bb5f7d43cd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Mar 2022 18:39:44 -0400 Subject: [PATCH 100/167] Updates --- build.sh | 29 +++++++++++++++++------ cpp/cmake/thirdparty/get_cuco.cmake | 4 ++-- cpp/cmake/thirdparty/get_libcudacxx.cmake | 6 ++--- cpp/cmake/thirdparty/get_mdspan.cmake | 4 ++-- cpp/cmake/thirdparty/get_rmm.cmake | 10 ++------ cpp/cmake/thirdparty/get_thrust.cmake | 4 ++-- 6 files changed, 32 insertions(+), 25 deletions(-) diff --git a/build.sh b/build.sh index bbe1f3c57e..8ad83d5963 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench -v -g --noinstall --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench uninstall -v -g --install --remove-cmake-deps --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -38,11 +38,11 @@ HELP="$0 [ ...] [ ...] 
--compile-dist - compile shared library for distance component --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft - --noinstall - do not install cmake targets + --install - install cmake targets --clean - --nvtx - Enable nvtx for profiling support - --show_depr_warn - show cmake deprecation warnings - -h - print this text + --nvtx - Enable nvtx for profiling support + --show_depr_warn - show cmake deprecation warnings + -h - print this text default action (no args) is to build both libraft and pyraft targets " @@ -65,9 +65,10 @@ COMPILE_DIST_LIBRARY=OFF ENABLE_NN_DEPENDENCIES=OFF NVTX=OFF CLEAN=0 +UNINSTALL=0 DISABLE_DEPRECATION_WARNINGS=ON CMAKE_TARGET="" -INSTALL_TARGET="install" +INSTALL_TARGET="" # Set defaults for vars that may not have been defined externally # FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check @@ -154,6 +155,9 @@ fi if hasArg clean; then CLEAN=1 fi +if hasArg uninstall; then + UNINSTALL=1 +fi # If clean given, run it prior to any other steps if (( ${CLEAN} == 1 )); then @@ -177,6 +181,17 @@ if (( ${CLEAN} == 1 )); then cd ${REPODIR} fi +if (( ${UNINSTALL} == 1 )); then + rm -rf ${INSTALL_PREFIX}/include/raft* + rm -rf ${INSTALL_PREFIX}/lib/cmake/raft* + rm -rf ${INSTALL_PREFIX}/include/cub + rm -rf ${INSTALL_PREFIX}/include/lib/cmake/cub + rm -rf ${INSTALL_PREFIX}/include/include/cuco + rm -rf ${INSTALL_PREFIX}/include/lib/cmake/cuco + rm -rf ${INSTALL_PREFIX}/include/include/rmm + rm -rf ${INSTALL_PREFIX}/include/cmake/rmm +fi + ################################################################################ # Configure for building all C++ targets @@ -205,7 +220,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \ -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} - if [[ ${CMAKE_TARGET} != "" ]] || [[ ${INSTALL_TARGET} != "" ]]; then + if [[ ${CMAKE_TARGET} != "" ]]; then echo "-- Compiling 
targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}" cmake --build "${LIBRAFT_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} ${INSTALL_TARGET} fi diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index da733d0ef1..3c57fca5b4 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -19,9 +19,7 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports CPM_ARGS - EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 OPTIONS "BUILD_TESTS OFF" @@ -29,6 +27,8 @@ function(find_and_configure_cuco VERSION) "BUILD_EXAMPLES OFF" ) + rapids_export_package(INSTALL cuco raft-exports) + endfunction() # cuCollections doesn't have a version yet diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 4333ba3fcd..29cd764817 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,10 +16,8 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - EXCLUDE_FROM_ALL TRUE) - + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports) + rapids_export_package(INSTALL libcudacxx raft-exports) endfunction() find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 03fafd4577..63a931f302 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -17,14 +17,14 @@ function(find_and_configure_mdspan VERSION) mdspan ${VERSION} GLOBAL_TARGETS std::mdspan BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports CPM_ARGS - EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY 
https://github.com/rapidsai/mdspan.git GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 OPTIONS "MDSPAN_ENABLE_CUDA ON" "MDSPAN_CXX_STANDARD ON" ) + + rapids_export_package(INSTALL mdspan raft-exports) endfunction() find_and_configure_mdspan(0.2.0) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 84c6dc0d7f..6e7a68502c 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -15,15 +15,9 @@ #============================================================================= function(find_and_configure_rmm) - include(${rapids-cmake-dir}/cpm/rmm.cmake) - rapids_cpm_rmm( - GLOBAL_TARGETS rmm::rmm - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - EXCLUDE_FROM_ALL TRUE - ) - + rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports) + rapids_export_package(INSTALL rmm raft-exports) endfunction() find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 099913af29..cf1bc94c73 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -17,8 +17,8 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) rapids_cpm_thrust( NAMESPACE raft ) - rapids_export_package(BUILD thrust raft-core-exports) - rapids_export_package(INSTALL thrust raft-core-exports) + rapids_export_package(BUILD thrust raft-exports) + rapids_export_package(INSTALL thrust raft-exports) endfunction() From 9412e997dcc1688d8f86bda84416479c1989a759 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 19:06:13 -0400 Subject: [PATCH 101/167] Fixing style --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 6d239e5fe7..9b918396f6 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -69,6 +69,7 @@ DISTANCE_TYPES = { SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) + def distance(X, Y, dists, metric="euclidean"): """ Compute pairwise distances between X and Y From 9d3ffc83a1dbafe602fba7463066d56ac9cb6ca7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Mar 2022 19:26:37 -0400 Subject: [PATCH 102/167] Installing targets --- conda/recipes/libraft_core/build.sh | 2 +- conda/recipes/libraft_distance/build.sh | 2 +- conda/recipes/libraft_nn/build.sh | 2 +- conda/recipes/pylibraft/build.sh | 2 +- conda/recipes/pyraft/build.sh | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/recipes/libraft_core/build.sh b/conda/recipes/libraft_core/build.sh index 876f46cdfe..578ffa98c4 100644 --- a/conda/recipes/libraft_core/build.sh +++ b/conda/recipes/libraft_core/build.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. -./build.sh libraft -v --allgpuarch +./build.sh libraft -v --allgpuarch diff --git a/conda/recipes/libraft_distance/build.sh b/conda/recipes/libraft_distance/build.sh index 062a5219db..d0843fdd79 100644 --- a/conda/recipes/libraft_distance/build.sh +++ b/conda/recipes/libraft_distance/build.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. 
-./build.sh libraft -v --allgpuarch --compile-dist +./build.sh libraft --install -v --allgpuarch --compile-dist diff --git a/conda/recipes/libraft_nn/build.sh b/conda/recipes/libraft_nn/build.sh index 4f6ffbca25..9d53362738 100644 --- a/conda/recipes/libraft_nn/build.sh +++ b/conda/recipes/libraft_nn/build.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. -./build.sh libraft -v --allgpuarch --compile-nn +./build.sh libraft --install -v --allgpuarch --compile-nn diff --git a/conda/recipes/pylibraft/build.sh b/conda/recipes/pylibraft/build.sh index 5ac2f5e33c..442428e0ee 100644 --- a/conda/recipes/pylibraft/build.sh +++ b/conda/recipes/pylibraft/build.sh @@ -2,4 +2,4 @@ #!/usr/bin/env bash # This assumes the script is executed from the root of the repo directory -./build.sh pylibraft +./build.sh pylibraft --install diff --git a/conda/recipes/pyraft/build.sh b/conda/recipes/pyraft/build.sh index 538dc60f29..4745f583f3 100644 --- a/conda/recipes/pyraft/build.sh +++ b/conda/recipes/pyraft/build.sh @@ -3,4 +3,4 @@ # Copyright (c) 2022, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory -./build.sh pyraft +./build.sh pyraft --install From 8cb3c3302c3c24c8ce90709e64c334110d5bd741 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 19:54:15 -0400 Subject: [PATCH 103/167] Proper install target --- build.sh | 13 ++++++++----- cpp/cmake/thirdparty/get_cuco.cmake | 2 -- cpp/cmake/thirdparty/get_libcudacxx.cmake | 1 - cpp/cmake/thirdparty/get_mdspan.cmake | 2 -- cpp/cmake/thirdparty/get_rmm.cmake | 1 - cpp/cmake/thirdparty/get_thrust.cmake | 2 -- 6 files changed, 8 insertions(+), 13 deletions(-) diff --git a/build.sh b/build.sh index 8ad83d5963..cf195ce534 100755 --- a/build.sh +++ b/build.sh @@ -84,10 +84,6 @@ function hasArg { (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") } -if hasArg --noinstall; then - INSTALL_TARGET="" -fi - if hasArg -h || hasArg --help; then echo "${HELP}" exit 0 @@ -104,6 +100,9 @@ if (( ${NUMARGS} != 0 )); then fi # Process flags +if hasArg --install; then + INSTALL_TARGET="install" +fi if hasArg -v; then VERBOSE_FLAG="-v" CMAKE_LOG_LEVEL="VERBOSE" @@ -222,7 +221,11 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has if [[ ${CMAKE_TARGET} != "" ]]; then echo "-- Compiling targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}" - cmake --build "${LIBRAFT_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} ${INSTALL_TARGET} + if [[ ${INSTALL_TARGET} != "" ]]; then + cmake --build "${LIBRAFT_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} ${INSTALL_TARGET} + else + cmake --build "${LIBRAFT_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} + fi fi fi diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 3c57fca5b4..f3aaef7f42 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -27,8 +27,6 @@ function(find_and_configure_cuco VERSION) "BUILD_EXAMPLES OFF" ) - rapids_export_package(INSTALL cuco raft-exports) - endfunction() # cuCollections doesn't have a version yet diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake 
b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 29cd764817..a81de940d1 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -17,7 +17,6 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports) - rapids_export_package(INSTALL libcudacxx raft-exports) endfunction() find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 63a931f302..6cd2b5c96c 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -23,8 +23,6 @@ function(find_and_configure_mdspan VERSION) OPTIONS "MDSPAN_ENABLE_CUDA ON" "MDSPAN_CXX_STANDARD ON" ) - - rapids_export_package(INSTALL mdspan raft-exports) endfunction() find_and_configure_mdspan(0.2.0) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 6e7a68502c..ffab703091 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -17,7 +17,6 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports) - rapids_export_package(INSTALL rmm raft-exports) endfunction() find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index cf1bc94c73..fb9632ba5e 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -18,8 +18,6 @@ function(find_and_configure_thrust) rapids_cpm_thrust( NAMESPACE raft ) rapids_export_package(BUILD thrust raft-exports) - rapids_export_package(INSTALL thrust raft-exports) - endfunction() find_and_configure_thrust() From 2fe7d891009ab42ecf5bfcd8053dd08191bdcd73 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 15 Mar 2022 20:48:43 -0400 Subject: [PATCH 104/167] removing cuco from installing headers --- cpp/cmake/thirdparty/get_cuco.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index f3aaef7f42..0fbc2e6202 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -20,6 +20,7 @@ function(find_and_configure_cuco VERSION) GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-exports CPM_ARGS + EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 OPTIONS "BUILD_TESTS OFF" From d06ef66215b4b2c960def8d3fae8899f8eaf8786 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Mar 2022 21:27:32 -0400 Subject: [PATCH 105/167] Changing actual conda package back to `libraft-headers` just to minimize build issues in the meantime. --- README.md | 4 ++-- conda/recipes/libraft_core/meta.yaml | 4 ++-- conda/recipes/libraft_distance/meta.yaml | 4 ++-- conda/recipes/libraft_nn/meta.yaml | 4 ++-- conda/recipes/pylibraft/meta.yaml | 4 ++-- conda/recipes/pyraft/meta.yaml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 69761ab988..ff19ddac6e 100755 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ RAFT can be installed through conda, [Cmake Package Manager (CPM)](https://githu ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-core` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. 
+- `libraft-headers` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives @@ -111,7 +111,7 @@ The easiest way to install RAFT is through conda and several packages are provid Use the following command to install RAFT with conda (use `-c rapidsai-nightly` for more up-to-date but less stable nightly packages) ```bash -conda install -c rapidsai libraft-core libraft-nn libraft-distance pyraft pylibraft +conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft ``` After installing RAFT, `find_package(raft COMPONENTS backend nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. 
diff --git a/conda/recipes/libraft_core/meta.yaml b/conda/recipes/libraft_core/meta.yaml index 6a416b3ea2..aec6fa4351 100644 --- a/conda/recipes/libraft_core/meta.yaml +++ b/conda/recipes/libraft_core/meta.yaml @@ -8,7 +8,7 @@ {% set cuda_major=cuda_version.split('.')[0] %} {% set ucx_py_version=environ.get('UCX_PY_VERSION') %} package: - name: libraft-core + name: libraft-headers version: {{ version }} source: @@ -58,4 +58,4 @@ about: home: http://rapids.ai/ license: Apache-2.0 # license_file: LICENSE - summary: libraft-core library + summary: libraft-headers library diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index 30e8ae54bb..4474629df4 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-core {{ version }} + - libraft-headers {{ version }} - nccl>=2.9.9 - cudatoolkit {{ cuda_version }}.* - ucx-py {{ ucx_py_version }} @@ -47,7 +47,7 @@ requirements: - gmock - librmm {{ minor_version }} run: - - libraft-core {{ version }} + - libraft-headers {{ version }} - nccl>=2.9.9 - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 03a24a025a..9d6732d56b 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -38,7 +38,7 @@ requirements: build: - cmake>=3.20.1 host: - - libraft-core {{ version }} + - libraft-headers {{ version }} - cudatoolkit {{ cuda_version }}.* - lapack - faiss-proc=*=cuda @@ -48,7 +48,7 @@ requirements: - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - libraft-core {{ version }} + - libraft-headers {{ version }} - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - libcusolver>=11.2.1 diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index a56c08ce80..1c49f00b06 100644 --- 
a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -29,13 +29,13 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-core {{ version }} + - libraft-headers {{ version }} - libraft-distance {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 run: - python x.x - - libraft-core {{ version }} + - libraft-headers {{ version }} - libraft-distance {{ version }} - cuda-python >=11.5,<12.0 - joblib >=0.11 diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml index e8b9c60e6e..eae9963204 100644 --- a/conda/recipes/pyraft/meta.yaml +++ b/conda/recipes/pyraft/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - cython>=0.29,<0.30 - rmm {{ minor_version }} - - libraft-core {{ version }} + - libraft-headers {{ version }} - cudatoolkit {{ cuda_version }}.* - cuda-python >=11.5,<12.0 - nccl>=2.9.9 @@ -39,7 +39,7 @@ requirements: run: - python x.x - dask-cuda {{ minor_version }} - - libraft-core {{ version }} + - libraft-headers {{ version }} - nccl>=2.9.9 - rmm {{ minor_version }} - ucx-py {{ ucx_py_version }} From 7fddb0bf52287f0732e95f3566ddc0568796c4ed Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Mar 2022 12:00:47 -0400 Subject: [PATCH 106/167] Backing out recent build changes, removing INSTALL exports everywhere. Only installing / including RAFT headers w/ libraft-headers conda package. 
--- BUILD.md | 72 +- README.md | 2 +- build.sh | 35 +- ci/cpu/build.sh | 8 +- ci/cpu/upload.sh | 2 +- .../build.sh | 2 +- .../meta.yaml | 0 cpp/CMakeLists.txt | 79 +-- cpp/cmake/thirdparty/get_cuco.cmake | 24 +- cpp/cmake/thirdparty/get_faiss.cmake | 3 +- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft_core/comms/comms.hpp | 638 ------------------ cpp/include/raft_core/cudart_utils.hpp | 441 ------------ cpp/include/raft_core/error.hpp | 176 ----- cpp/include/raft_core/handle.hpp | 339 ---------- cpp/include/raft_core/interruptible.hpp | 271 -------- .../raft_core/linalg/cublas_macros.hpp | 119 ---- .../raft_core/linalg/cusolver_macros.hpp | 117 ---- cpp/include/raft_core/raft.hpp | 20 - .../raft_core/sparse/cusparse_macros.hpp | 128 ---- cpp/test/CMakeLists.txt | 1 - python/pylibraft/pylibraft/common/handle.pxd | 2 +- .../pylibraft/common/interruptible.pxd | 2 +- python/raft/raft/common/handle.pxd | 2 +- python/raft/raft/common/interruptible.pxd | 4 +- python/raft/raft/dask/common/comms_utils.pyx | 2 +- 26 files changed, 121 insertions(+), 2370 deletions(-) rename conda/recipes/{libraft_core => libraft_headers}/build.sh (58%) rename conda/recipes/{libraft_core => libraft_headers}/meta.yaml (100%) delete mode 100644 cpp/include/raft_core/comms/comms.hpp delete mode 100644 cpp/include/raft_core/cudart_utils.hpp delete mode 100644 cpp/include/raft_core/error.hpp delete mode 100644 cpp/include/raft_core/handle.hpp delete mode 100644 cpp/include/raft_core/interruptible.hpp delete mode 100644 cpp/include/raft_core/linalg/cublas_macros.hpp delete mode 100644 cpp/include/raft_core/linalg/cusolver_macros.hpp delete mode 100644 cpp/include/raft_core/raft.hpp delete mode 100644 cpp/include/raft_core/sparse/cusparse_macros.hpp diff --git a/BUILD.md b/BUILD.md index 1c2c741f97..ea82276ec1 100644 --- a/BUILD.md +++ b/BUILD.md @@ -2,7 +2,8 @@ - [Building and installing RAFT](#build_install) - [CUDA/GPU Requirements](#cuda_gpu_req) - - [Header-only 
C++](#nstall_header_only_cpp) + - [Build Dependencies](#required_depenencies) + - [Header-only C++](#install_header_only_cpp) - [C++ Shared Libraries](#shared_cpp_libs) - [Googletests](#gtests) - [C++ Using Cmake](#cpp_using_cmake) @@ -16,28 +17,46 @@ ## Building and installing RAFT ### CUDA/GPU Requirements -- CUDA 11.0+ +- CUDA Toolkit 11.0+ - NVIDIA driver 450.80.02+ - Pascal architecture of better (Compute capability >= 6.0) -C++ RAFT is a header-only library but provides the option of building shared libraries with template instantiations for common types to speed up compile times for larger projects. +### Build Dependencies -The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python code and provides options for building and installing the headers, Googletests, benchmarks, and individual shared libraries. +Below are the dependencies for building RAFT from source. Many of these dependencies can be installed with [conda](https://anaconda.org) or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm). 
+ +#### Required +- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) +- [RMM](https://github.com/rapidsai/rmm) +- [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 +- [mdspan](https://github.com/rapidsai/mdspan) + +#### Optional +- [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API +- [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API +- [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `Pyraft` +- [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `Pyraft` +- [Googletest](https://github.com/google/googletest) - Needed to build tests +- [Googlebench](https://github.com/google/benchmark) - Needed to build benchmarks +- [Doxygen](https://github.com/doxygen/doxygen) - Needed to build docs + + +C++ RAFT is a header-only library with the option of building shared libraries with template instantiations for common types to speed up compile times for larger projects. + +The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python code and provides options for building and installing the core headers, tests, benchmarks, and individual shared libraries. ### Header-only C++ -RAFT depends on many different core libraries such as `thrust`, `cub`, `cucollections`, and `rmm`, which will be downloaded automatically by `cmake` even when only installing the headers. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which can also be downloaded in the RAFT build but will need to be told to do so. 
+`build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`. -The following example builds and installs raft in header-only mode: +The following example will download the needed dependencies and install the `raft core` headers in `$INSTALL_PREFIX/include/raft`. The `raft core` headers are a subset of the RAFT headers which are safe to install and expose through public APIs in consuming projects as they require only RMM and the libraries provided by the CUDA toolkit. ```bash ./build.sh libraft ``` ###C++ Shared Libraries (optional) -Shared libraries are provided to speed up compile times for larger libraries which may heavily utilize some of the APIs. These shared libraries can also significantly improve re-compile times while developing against the APIs. - -Build all the shared libraries by passing `--compile-libs` flag to `build.sh`: +Shared libraries can be built to speed up compile times for larger libraries which may heavily utilize some of the APIs. These shared libraries can also significantly improve re-compile times while developing against the APIs. Build all the shared libraries by passing `--compile-libs` flag to `build.sh`: ```bash ./build.sh libraft --compile-libs @@ -48,9 +67,11 @@ Individual shared libraries have their own flags and multiple can be used (thoug ./build.sh libraft --compile-nn --compile-dist ``` -###Googletests +The `--install` flag can be passed to `build.sh` to install the shared libraries. 
-Compile the Googletests using the `tests` target in `build.sh`: +###Tests + +Compile the tests using the `tests` target in `build.sh`: ```bash ./build.sh libraft tests --compile-libs ``` @@ -76,7 +97,7 @@ To run the benchmarks: ### C++ Using Cmake -To install RAFT into a specific location, use `CMAKE_INSTALL_PREFIX`. The snippet below will install it into the current conda environment: +Use `CMAKE_INSTALL_PREFIX` to install RAFT into a specific location. The snippet below will install it into the current conda environment: ```bash cd cpp mkdir build @@ -94,7 +115,10 @@ RAFT's cmake has the following configurable flags available:. | RAFT_COMPILE_LIBRARIES | ON, OFF | OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | | RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library | | RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library | -| RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. | +| RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. 
Needed for `raft::spatial::knn` | +| RAFT_ENABLE_cuco_DEPENDENCY | ON, OFF | ON | Enables the cuCollections dependency used by `raft::sparse::distance` | +| RAFT_ENABLE_nccl_DEPENDENCY | ON, OFF | OFF | Enables NCCL dependency used by `raft::comms` and needed to build `pyraft` | +| RAFT_ENABLE_ucx_DEPENDENCY | ON, OFF | OFF | Enables UCX dependency used by `raft::comms` and needed to build `pyraft` | | RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | | RAFT_STATIC_LINK_LIBRARIES | ON, OFF | ON | Build static link libraries instead of shared libraries | | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | @@ -103,7 +127,7 @@ RAFT's cmake has the following configurable flags available:. | CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc | | CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime | -Shared libraries are provided for the `libraft-nn` and `libraft-distance` components currently. The `libraft-nn` component depends upon [FAISS](https://github.com/facebookresearch/faiss) and the `RAFT_ENABLE_NN_DEPENDENCIES` option will build it from source if it is not already installed. +Currently, shared libraries are provided for the `libraft-nn` and `libraft-distance` components. The `libraft-nn` component depends upon [FAISS](https://github.com/facebookresearch/faiss) and the `RAFT_ENABLE_NN_DEPENDENCIES` option will build it from source if it is not already installed. 
### Python @@ -137,11 +161,25 @@ py.test -s -v raft ### C++ header-only integration using cmake +When the needed [build dependencies](#required_depenencies) are already satisfied, RAFT can be trivially integrated into downstream projects by cloning the repository and adding `cpp/include` from RAFT to the include path: +```cmake +set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") +ExternalProject_Add(raft + GIT_REPOSITORY git@github.com:rapidsai/raft.git + GIT_TAG branch-22.04 + PREFIX ${RAFT_GIT_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "") +set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") +``` + The RAFT headers are broken down into two different include paths so that core headers can be isolated between projects while public API headers can be installed globally, exposed to users through public APIs, and shared across projects. -- `cpp/include/raft_core` contains public API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. +- `cpp/include/raft` contains public API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. - `cpp/include/raft` contains the core of the RAFT header-only library, containing primitives, algorithms, and other tools. -Use `find_package(raft)` and the `raft::raft` if using RAFT to interact only with the public APIs of consuming projects. +If RAFT has already been installed, such as by using the `build.sh` script, +Use `find_package(raft)` and the `raft::raft` target if using RAFT to interact only with the public APIs of consuming projects. Use `find_package(raft COMPONENTS core)` and both the `raft::raft` and `raft::core` targets when building a library that uses headers in `include/raft`. 
@@ -159,7 +197,7 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft_core` headers and `raft::core` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft` headers and `raft::core` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: diff --git a/README.md b/README.md index ff19ddac6e..e91744c835 100755 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Most of the primitives in RAFT accept a `raft::handle_t` object for the manageme The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing pairwise Euclidean distances: ```c++ -#include +#include #include #include #include diff --git a/build.sh b/build.sh index cf195ce534..c5f4bec764 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! 
REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench uninstall -v -g --install --remove-cmake-deps --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench -v -g --install --remove-cmake-deps --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -39,10 +39,10 @@ HELP="$0 [ ...] [ ...] --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft --install - install cmake targets - --clean - --nvtx - Enable nvtx for profiling support - --show_depr_warn - show cmake deprecation warnings - -h - print this text + --clean - perform clean of all build directories + --nvtx - enable nvtx for profiling support + --show_depr_warn - show cmake deprecation warnings + -h - print this text default action (no args) is to build both libraft and pyraft targets " @@ -63,11 +63,14 @@ COMPILE_LIBRARIES=OFF COMPILE_NN_LIBRARY=OFF COMPILE_DIST_LIBRARY=OFF ENABLE_NN_DEPENDENCIES=OFF +ENABLE_ucx_DEPENDENCY=OFF +ENABLE_nccl_DEPENDENCY=OFF + NVTX=OFF CLEAN=0 UNINSTALL=0 DISABLE_DEPRECATION_WARNINGS=ON -CMAKE_TARGET="" +CMAKE_TARGET=";" INSTALL_TARGET="" # Set defaults for vars that may not have been defined externally @@ -180,18 +183,11 @@ if (( ${CLEAN} == 1 )); then cd ${REPODIR} fi -if (( ${UNINSTALL} == 1 )); then - rm -rf ${INSTALL_PREFIX}/include/raft* - rm -rf ${INSTALL_PREFIX}/lib/cmake/raft* - rm -rf ${INSTALL_PREFIX}/include/cub - rm -rf ${INSTALL_PREFIX}/include/lib/cmake/cub - rm -rf ${INSTALL_PREFIX}/include/include/cuco - rm -rf ${INSTALL_PREFIX}/include/lib/cmake/cuco - rm -rf ${INSTALL_PREFIX}/include/include/rmm - rm -rf ${INSTALL_PREFIX}/include/cmake/rmm +# Pyraft requires ucx + nccl +if (( ${NUMARGS} == 0 )) || hasArg pyraft || hasArg docs; 
then + ENABLE_nccl_DEPENDENCY=ON + ENABLE_ucx_DEPENDENCY=ON fi - - ################################################################################ # Configure for building all C++ targets if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench; then @@ -217,7 +213,9 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \ -DRAFT_COMPILE_NN_LIBRARY=${COMPILE_NN_LIBRARY} \ -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \ - -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} + -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} \ + -DRAFT_ENABLE_nccl_DEPENDENCY=${ENABLE_nccl_DEPENDENCY} \ + -DRAFT_ENABLE_ucx_DEPENDENCY=${ENABLE_ucx_DEPENDENCY} if [[ ${CMAKE_TARGET} != "" ]]; then echo "-- Compiling targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}" @@ -251,7 +249,6 @@ if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then fi fi - if hasArg docs; then cmake --build ${LIBRAFT_BUILD_DIR} --target docs_raft cd ${SPHINX_BUILD_DIR} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 0c2584b936..f7f777c791 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -87,14 +87,14 @@ gpuci_mamba_retry install -c conda-forge boa if [ "$BUILD_LIBRAFT" == '1' ]; then gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-client-api" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_core + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance else - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_core + gpuci_conda_retry mambabuild --no-build-id --croot 
${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_headers gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" - mkdir -p ${CONDA_BLD_DIR}/libraft_core/work - mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_core/work + mkdir -p ${CONDA_BLD_DIR}/libraft_headers/work + mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_headers/work gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libraft_nn gpuci_logger "`ls ${CONDA_BLD_DIR}/work`" diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 01ad7aebe7..8206c7984c 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -30,7 +30,7 @@ fi gpuci_logger "Get conda file output locations" -export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_core --output` +export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/pyraft --python=$PYTHON --output` diff --git a/conda/recipes/libraft_core/build.sh b/conda/recipes/libraft_headers/build.sh similarity index 58% rename from conda/recipes/libraft_core/build.sh rename to conda/recipes/libraft_headers/build.sh index 578ffa98c4..f239e545ef 100644 --- a/conda/recipes/libraft_core/build.sh +++ b/conda/recipes/libraft_headers/build.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. 
-./build.sh libraft -v --allgpuarch +./build.sh libraft --install -v --allgpuarch diff --git a/conda/recipes/libraft_core/meta.yaml b/conda/recipes/libraft_headers/meta.yaml similarity index 100% rename from conda/recipes/libraft_core/meta.yaml rename to conda/recipes/libraft_headers/meta.yaml diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ebdcaaf3a3..2f5aca7c50 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -55,6 +55,13 @@ option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiation option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" OFF) option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) + +option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" ON) + +# Currently, UCX and NCCL are only needed to build Pyraft and so a simple find_package() is sufficient +option(RAFT_ENABLE_nccl_DEPENDENCY "Enable NCCL dependency" OFF) +option(RAFT_ENABLE_ucx_DEPENDENCY "Enable ucx dependency" OFF) + include(CMakeDependentOption) cmake_dependent_option(RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARIES OFF) @@ -68,8 +75,6 @@ message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${NVTX}") message(VERBOSE "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") -list(APPEND raft_FIND_COMPONENTS "core") - # Set RMM logging level set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") @@ -101,8 +106,8 @@ endif() # * enable the CMake CUDA language # * set other CUDA compilation flags rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET 
raft-core-exports - INSTALL_EXPORT_SET raft-core-exports) + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## @@ -121,8 +126,6 @@ include(cmake/thirdparty/get_mdspan.cmake) if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) - include(cmake/thirdparty/get_nccl.cmake) - include(cmake/thirdparty/get_ucx.cmake) endif() if(BUILD_BENCH) @@ -150,7 +153,7 @@ target_link_libraries(raft INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> ${RAFT_LINK_LIBS} - cuco::cuco + $<$:cuco::cuco> std::mdspan) target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) @@ -172,25 +175,6 @@ if(${RAFT_STATIC_LINK_LIBRARIES}) set(RAFT_LIB_TYPE STATIC) endif() -############################################################################## -# - raft_core ------------------------------------------------------------- - -add_library(raft_core INTERFACE) -if(TARGET raft_core AND (NOT TARGET raft::core)) - add_library(raft::core ALIAS raft) -endif() - -target_include_directories(raft_core INTERFACE - "$" - "$" - ) -target_link_libraries(raft_core INTERFACE ${RAFT_LINK_LIBS}) - -set_target_properties(raft_core PROPERTIES EXPORT_NAME core) - -target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) -target_compile_features(raft INTERFACE cxx_std_17 $) - ############################################################################## # - raft_distance ------------------------------------------------------------ add_library(raft_distance INTERFACE) @@ -251,7 +235,7 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) endif() target_link_libraries(raft_distance INTERFACE - raft::core + raft::raft $ $ ) @@ -291,10 +275,10 @@ if(RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_NN_LIBRARY) target_compile_definitions(raft_nn_lib INTERFACE "RAFT_NN_COMPILED") - endif() -target_link_libraries(raft_nn INTERFACE raft::core faiss::faiss +target_link_libraries(raft_nn INTERFACE 
+ raft::raft $ $) @@ -304,10 +288,6 @@ rapids_cmake_install_lib_dir( lib_dir ) include(GNUInstallDirs) include(CPack) -install(TARGETS raft_core - DESTINATION ${lib_dir} - EXPORT raft-core-exports) - install(TARGETS raft DESTINATION ${lib_dir} EXPORT raft-exports) @@ -332,12 +312,12 @@ if(TARGET raft_nn_lib) endif() -install(DIRECTORY include/raft_core - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_core) +install(DIRECTORY include/raft + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft_core/raft.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft_core) +install(FILES include/raft.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft) ############################################################################## # - install export ----------------------------------------------------------- @@ -349,13 +329,11 @@ RAFT (Reusable Analytics Functions and other Tools) contains fundamental widely-used algorithms and primitives for data science, graph, and ml. 
Optional Components: - - core - nn - distance Imported Targets: - raft::raft - - raft::core brought in by the `core` optional component - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component @@ -364,20 +342,7 @@ Imported Targets: set(code_string [=[ -if((distance IN_LIST raft_FIND_COMPONENTS OR - nn IN_LIST raft_FIND_COMPONENTS) AND NOT - core IN_LIST raft_FIND_COMPONENTS)) - FATAL_ERROR("core must be included to use components ${raft_FIND_COMPONENTS}") -endif() - -if(core IN_LIST raft_FIND_COMPONENTS) - if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) - endif() -endif() - -if(distance IN_LIST raft_FIND_COMPONENTS OR - core IN_LIST raft_FIND_COMPONENTS) +if(distance IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -396,9 +361,9 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export(INSTALL raft - COMPONENTS core nn distance + COMPONENTS nn distance EXPORT_SET raft-exports - GLOBAL_TARGETS core nn distance + GLOBAL_TARGETS nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) @@ -407,8 +372,8 @@ raft_export(INSTALL raft # - build export ------------------------------------------------------------- raft_export(BUILD raft EXPORT_SET raft-exports - COMPONENTS core nn distance - GLOBAL_TARGETS raft_core raft_distance raft_nn + COMPONENTS nn distance + GLOBAL_TARGETS raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 0fbc2e6202..3a70d34283 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,17 +16,19 @@ function(find_and_configure_cuco VERSION) - rapids_cpm_find(cuco ${VERSION} - GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-exports - CPM_ARGS - EXCLUDE_FROM_ALL TRUE - 
GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - ) + if(RAFT_ENABLE_cuco_DEPENDENCY) + rapids_cpm_find(cuco ${VERSION} + GLOBAL_TARGETS cuco::cuco + BUILD_EXPORT_SET raft-exports + CPM_ARGS + EXCLUDE_FROM_ALL TRUE + GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git + GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + ) + endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 51ed34754b..b3c9abba75 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -69,5 +69,4 @@ endfunction() find_and_configure_faiss(VERSION 1.7.0 PINNED_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30 - BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} - ) + BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC}) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 3049f4f23e..b30a4648a6 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,7 +25,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft_core/comms/comms.hpp b/cpp/include/raft_core/comms/comms.hpp deleted file mode 100644 index e2ddb6250b..0000000000 --- a/cpp/include/raft_core/comms/comms.hpp +++ /dev/null @@ -1,638 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __RAFT_RT_COMMS_H -#define __RAFT_RT_COMMS_H - -#pragma once - -#include -#include -#include - -namespace raft { -namespace comms { - -typedef unsigned int request_t; -enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; -enum class op_t { SUM, PROD, MIN, MAX }; - -/** - * The resulting status of distributed stream synchronization - */ -enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted -}; - -template -constexpr datatype_t - -get_type(); - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::CHAR; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT8; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::INT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::INT64; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT64; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::FLOAT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::FLOAT64; -} - -class comms_iface { - public: - virtual ~comms_iface() {} - - virtual int get_size() const = 0; - - virtual int get_rank() const = 0; - - virtual std::unique_ptr comm_split(int color, int key) const = 0; - - virtual void barrier() const = 0; - - virtual status_t 
sync_stream(cudaStream_t stream) const = 0; - - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - - virtual void waitall(int count, request_t array_of_requests[]) const = 0; - - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use 
device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const = 0; - - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; -}; - -class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. 
- * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place 
the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, 
- get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, - value_t* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - 
std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; -}; - -} // namespace comms -} // namespace raft - -#endif \ No newline at end of file diff --git a/cpp/include/raft_core/cudart_utils.hpp b/cpp/include/raft_core/cudart_utils.hpp deleted file mode 100644 index 72cf5f0317..0000000000 --- a/cpp/include/raft_core/cudart_utils.hpp +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - -#pragma once - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -///@todo: enable once logging has been enabled in raft -//#include "logger.hpp" - -namespace raft { - -/** - * @brief Exception thrown when a CUDA error is encountered. - */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -/** - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an - * exception detailing the CUDA error that occurred - * - */ -#define RAFT_CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "CUDA error encountered at: ", \ - "call='%s', Reason=%s:%s", \ - #call, \ - cudaGetErrorName(status), \ - cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif - -/** - * @brief Debug macro to check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream - * before error checking. In both release and non-release builds, this macro - * checks for any pending CUDA errors from previous calls. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. 
It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - */ -#ifndef NDEBUG -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); -#else -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); -#endif - -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. -// */ -#define RAFT_CUDA_TRY_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(status)); \ - } \ - } while (0) - -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. - * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - -namespace raft { - -/** Helper method to get to know warp size in device code */ -__host__ __device__ constexpr inline int warp_size() { return 32; } - -__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to threads. 
- */ -class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - * @param max_num_blocks_1d maximum number of 1d blocks - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to warps. 
- */ -class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param max_num_blocks_1d maximum number of 1d blocks - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to blocks. 
- */ -class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param max_num_blocks_1d maximium number of 1d blocks - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief Generic copy method for all kinds of transfers - * @tparam Type data type - * @param dst destination pointer - * @param src source pointer - * @param len lenth of the src/dst buffers in terms of number of elements - * @param stream cuda stream - */ -template -void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); -} - -/** - * @defgroup Copy Copy methods - * These are here along with the generic 'copy' method in order to improve - * code readability using explicitly specified function names - * @{ - */ -/** performs a host to device copy */ -template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(d_ptr, h_ptr, len, stream); -} - -/** performs a device to host copy */ -template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(h_ptr, d_ptr, len, stream); -} - -template -void copy_async(Type* d_ptr1, 
const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); -} -/** @} */ - -/** - * @defgroup Debug Utils for debugging host/device buffers - * @{ - */ -template -void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) -{ - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; -} - -template -void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) -{ - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; -} -/** @} */ - -static std::mutex mutex_; -static std::unordered_map allocations; - -template -void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) -{ - size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); - - std::lock_guard _(mutex_); - allocations[ptr] = size; -} - -template -void deallocate(Type*& ptr, rmm::cuda_stream_view stream) -{ - std::lock_guard _(mutex_); - size_t size = allocations[ptr]; - allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); -} - -inline void deallocate_all(rmm::cuda_stream_view stream) -{ - std::lock_guard _(mutex_); - for (auto& alloc : allocations) { - void* ptr = alloc.first; - size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); - } - allocations.clear(); -} - -/** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() -{ - int devId; - 
RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; -} - -/** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; -} - -/** helper method to convert an array on device to a string on host */ -template -std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) -{ - std::stringstream ss; - - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; - - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; - - free(arr_h); - - return ss.str(); -} - -/** this seems to be unused, but may be useful in the future */ -template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) -{ - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; -} - -inline uint32_t curTimeMillis() -{ - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); -} - -/** Helper function to calculate need memory for allocate to store dense matrix. 
- * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } - -/** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ -template -bool is_aligned(Type* ptr, size_t alignment) -{ - return reinterpret_cast(ptr) % alignment == 0; -} - -/** calculate greatest common divisor of two numbers - * @a integer - * @b integer - * @ return gcd of a and b - */ -template -IntType gcd(IntType a, IntType b) -{ - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; -} - -} // namespace raft - -#endif diff --git a/cpp/include/raft_core/error.hpp b/cpp/include/raft_core/error.hpp deleted file mode 100644 index a65b9a8469..0000000000 --- a/cpp/include/raft_core/error.hpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __RAFT_RT_ERROR -#define __RAFT_RT_ERROR - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** base exception class for the whole of raft */ -class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { -#ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; - void* stack[kMaxStackDepth]; // NOLINT - auto depth = backtrace(stack, kMaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" << std::endl; - msg_ += oss.str(); - return; - } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; - } - free(strings); - msg_ += oss.str(); -#endif // __GNUC__ - } -}; - -/** - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS and RAFT_FAIL macros. 
- * - */ -struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -// FIXME: Need to be replaced with RAFT_FAIL -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - int size1 = \ - std::snprintf(nullptr, 0, "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), \ - size1 + 1 /* +1 for '\0' */, \ - "exception occured! file=%s line=%d: ", \ - __FILE__, \ - __LINE__); \ - std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - throw raft::exception(msg); \ - } while (0) - -// FIXME: Need to be replaced with RAFT_EXPECTS -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - -/** - * Macro to append error message to first argument. - * This should only be called in contexts where it is OK to throw exceptions! - */ -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ - do { \ - int size1 = std::snprintf(nullptr, 0, "%s", location_prefix); \ - int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__); \ - int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0 || size3 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix); \ - std::snprintf( \ - buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__, __LINE__); \ - std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - } while (0) - -/** - * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false - * - * @param[in] cond Expression that evaluates to true or false - * @param[in] fmt String literal description of the reason that cond is expected to be true with - * optinal format tagas - * @throw raft::logic_error if the condition evaluates to false. - */ -#define RAFT_EXPECTS(cond, fmt, ...) \ - do { \ - if (!(cond)) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } \ - } while (0) - -/** - * @brief Indicates that an erroneous code path has been taken. - * - * @param[in] fmt String literal description of the reason that this code path is erroneous with - * optinal format tagas - * @throw always throws raft::logic_error - */ -#define RAFT_FAIL(fmt, ...) 
\ - do { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } while (0) - -#endif \ No newline at end of file diff --git a/cpp/include/raft_core/handle.hpp b/cpp/include/raft_core/handle.hpp deleted file mode 100644 index dc31f6e732..0000000000 --- a/cpp/include/raft_core/handle.hpp +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __RAFT_RT_HANDLE -#define __RAFT_RT_HANDLE - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -///@todo: enable once we have migrated cuml-comms layer too -//#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** - * @brief Main handle object that stores all necessary context used for calling - * necessary cuda kernels and/or libraries - */ -class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - - /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - 
RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? 
stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = 
true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } -}; // class handle_t - -/** - * @brief RAII approach to synchronizing across all streams in the handle - */ -class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; -}; // class stream_syncer - -} // namespace raft - -#endif \ No 
newline at end of file diff --git a/cpp/include/raft_core/interruptible.hpp b/cpp/include/raft_core/interruptible.hpp deleted file mode 100644 index 454d8c4a6b..0000000000 --- a/cpp/include/raft_core/interruptible.hpp +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __RAFT_RT_INTERRUPTIBLE_H -#define __RAFT_RT_INTERRUPTIBLE_H - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** - * @brief Exception thrown during `interruptible::synchronize` call when it detects a request - * to cancel the work performed in this CPU thread. - */ -struct interrupted_exception : public raft::exception { - using raft::exception::exception; -}; - -/** - * @brief Cooperative-style interruptible execution. - * - * This class provides facilities for interrupting execution of a C++ thread at designated points - * in code from outside of the thread. In particular, it provides an interruptible version of the - * blocking CUDA synchronization function, that allows dropping a long-running GPU work. - * - * - * **Important:** Although CUDA synchronize calls serve as cancellation points, the interruptible - * machinery has nothing to do with CUDA streams or events. In other words, when you call `cancel`, - * it’s the CPU waiting function what is interrupted, not the GPU stream work. 
This means, when the - * `interrupted_exception` is raised, any unfinished GPU stream work continues to run. It’s the - * responsibility of the developer then to make sure the unfinished stream work does not affect the - * program in an undesirable way. - * - * - * What can happen to CUDA stream when the `synchronize` is cancelled? If you catch the - * `interrupted_exception` immediately, you can safely wait on the stream again. - * Otherwise, some of the allocated resources may be released before the active kernel finishes - * using them, which will result in writing into deallocated or reallocated memory and undefined - * behavior in general. A dead-locked kernel may never finish (or may crash if you’re lucky). In - * practice, the outcome is usually acceptable for the use case of emergency program interruption - * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and - * resuming of the GPU stream work. - */ -class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } - - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. 
- */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } - - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. It's called in the internals of - * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } - - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } - - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. 
- * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. - */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } - - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; - - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; - - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. 
get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. - * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. - auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } - } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; - } - - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ - bool claimed_ = false; - - interruptible() noexcept { yield_no_throw_impl(); } - - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } - - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } - - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } -}; - -} // namespace raft - -#endif \ No newline at end of file diff --git a/cpp/include/raft_core/linalg/cublas_macros.hpp b/cpp/include/raft_core/linalg/cublas_macros.hpp deleted file mode 100644 index 5ca93fc185..0000000000 --- a/cpp/include/raft_core/linalg/cublas_macros.hpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __RAFT_RT_CUBLAS_MACROS_H -#define __RAFT_RT_CUBLAS_MACROS_H - -#pragma once - -#include -#include - -///@todo: enable this once we have logger enabled -//#include - -#include - -#define _CUBLAS_ERR_TO_STR(err) \ - case err: return #err - -namespace raft { - -/** - * @brief Exception thrown when a cuBLAS error is encountered. 
- */ -struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { - -inline const char* cublas_error_to_string(cublasStatus_t err) -{ - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; -} - -} // namespace linalg -} // namespace raft - -#undef _CUBLAS_ERR_TO_STR - -/** - * @brief Error checking macro for cuBLAS runtime API functions. - * - * Invokes a cuBLAS runtime API function call, if the call does not return - * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred - */ -#define RAFT_CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUBLAS_TRY -#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUBLAS_TRY_NO_THROW(call) \ - do { \ - cublasStatus_t const status = call; \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::cublas_error_to_string(status)); \ - } \ - } while (0) - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK -#define CUBLAS_CHECK(call) CUBLAS_TRY(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK_NO_THROW -#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) -#endif - -#endif \ No newline at end of file diff --git a/cpp/include/raft_core/linalg/cusolver_macros.hpp b/cpp/include/raft_core/linalg/cusolver_macros.hpp deleted file mode 100644 index ae7cad31d0..0000000000 --- a/cpp/include/raft_core/linalg/cusolver_macros.hpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __RAFT_RT_CUSOLVER_MACROS_H -#define __RAFT_RT_CUSOLVER_MACROS_H - -#pragma once - -#include -#include -///@todo: enable this once logging is enabled -//#include -#include -#include - -#define _CUSOLVER_ERR_TO_STR(err) \ - case err: return #err; - -namespace raft { - -/** - * @brief Exception thrown when a cuSOLVER error is encountered. 
- */ -struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { - -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; -} - -} // namespace linalg -} // namespace raft - -#undef _CUSOLVER_ERR_TO_STR - -/** - * @brief Error checking macro for cuSOLVER runtime API functions. - * - * Invokes a cuSOLVER runtime API function call, if the call does not return - * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred - */ -#define RAFT_CUSOLVER_TRY(call) \ - do { \ - cusolverStatus_t const status = (call); \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::cusolver_error_to_string(status)); \ - throw raft::cusolver_error(msg); \ - } \ - } while (0) - -// FIXME: remove after consumer rename -#ifndef CUSOLVER_TRY -#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ - do { \ - cusolverStatus_t const status = call; \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::cusolver_error_to_string(status)); \ - } \ - } while (0) - -// FIXME: remove after cuml rename -#ifndef CUSOLVER_CHECK -#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) -#endif - -#ifndef CUSOLVER_CHECK_NO_THROW -#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) -#endif - -#endif \ No newline at end of file diff --git a/cpp/include/raft_core/raft.hpp b/cpp/include/raft_core/raft.hpp deleted file mode 100644 index 82e8514ddb..0000000000 --- a/cpp/include/raft_core/raft.hpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "comms/comms.hpp" -#include "error.hpp" -#include "handle.hpp" -#include "interruptible.hpp" diff --git a/cpp/include/raft_core/sparse/cusparse_macros.hpp b/cpp/include/raft_core/sparse/cusparse_macros.hpp deleted file mode 100644 index bc0aa374d6..0000000000 --- a/cpp/include/raft_core/sparse/cusparse_macros.hpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __RAFT_RT_CUSPARSE_MACROS_H -#define __RAFT_RT_CUSPARSE_MACROS_H - -#pragma once - -#include -#include -///@todo: enable this once logging is enabled -//#include - -#define _CUSPARSE_ERR_TO_STR(err) \ - case err: return #err; - -// Notes: -//(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; -//(2.) to enforce a lower version, -// -//`#define CUDA_ENFORCE_LOWER -// #include ` -// -// (i.e., before including this header) -// -#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) - -namespace raft { - -/** - * @brief Exception thrown when a cuSparse error is encountered. 
- */ -struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) : raft::exception(message) {} - explicit cusparse_error(std::string const& message) : raft::exception(message) {} -}; - -namespace sparse { -namespace detail { - -inline const char* cusparse_error_to_string(cusparseStatus_t err) -{ -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 - return cusparseGetErrorString(err); -#else // CUDART_VERSION - switch (err) { - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: return "CUSPARSE_STATUS_UNKNOWN"; - }; -#endif // CUDART_VERSION -} - -} // namespace detail -} // namespace sparse -} // namespace raft - -#undef _CUSPARSE_ERR_TO_STR - -/** - * @brief Error checking macro for cuSparse runtime API functions. 
- * - * Invokes a cuSparse runtime API function call, if the call does not return - * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred - */ -#define RAFT_CUSPARSE_TRY(call) \ - do { \ - cusparseStatus_t const status = (call); \ - if (CUSPARSE_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::sparse::detail::cusparse_error_to_string(status)); \ - throw raft::cusparse_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_TRY -#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) -#endif - -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_CHECK -#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) -#endif - -//@todo: use logger here once logging is enabled -/** check for cusparse runtime API errors but do not assert */ -#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ - #call, \ - err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ - } while (0) - -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_CHECK_NO_THROW -#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) -#endif - -#endif \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 71357829ca..b3a2013715 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -166,7 +166,6 @@ target_link_libraries(test_raft raft::raft raft::distance raft::nn - NCCL::NCCL faiss::faiss GTest::gtest GTest::gtest_main diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index d4009da0fb..ed8b11dca0 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -30,7 +30,7 @@ cdef extern from 
"raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_core/handle.hpp" namespace "raft" nogil: +cdef extern from "raft/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd index 3eda29c602..cb639c0f72 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pxd +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -22,7 +22,7 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_core/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() diff --git a/python/raft/raft/common/handle.pxd b/python/raft/raft/common/handle.pxd index 1bbb57f1c7..8415b7e3d7 100644 --- a/python/raft/raft/common/handle.pxd +++ b/python/raft/raft/common/handle.pxd @@ -31,7 +31,7 @@ cdef extern from "raft/mr/device/allocator.hpp" \ cdef cppclass allocator: pass -cdef extern from "raft_core/handle.hpp" namespace "raft" nogil: +cdef extern from "raft/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + handle_t(cuda_stream_view stream_view) except + diff --git a/python/raft/raft/common/interruptible.pxd b/python/raft/raft/common/interruptible.pxd index 2b937e7299..a73e8c1ac7 100644 --- a/python/raft/raft/common/interruptible.pxd +++ b/python/raft/raft/common/interruptible.pxd @@ -22,11 +22,11 @@ from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_view -cdef extern from "raft_core/interruptible.hpp" namespace "raft" nogil: +cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() -cdef extern from "raft_core/interruptible.hpp" \ +cdef extern from "raft/interruptible.hpp" \ namespace 
"raft::interruptible" nogil: cdef void inter_synchronize \ "raft::interruptible::synchronize"(cuda_stream_view stream) except+ diff --git a/python/raft/raft/dask/common/comms_utils.pyx b/python/raft/raft/dask/common/comms_utils.pyx index 7f805a9e9c..38c5670372 100644 --- a/python/raft/raft/dask/common/comms_utils.pyx +++ b/python/raft/raft/dask/common/comms_utils.pyx @@ -31,7 +31,7 @@ cdef extern from "nccl.h": cdef struct ncclComm ctypedef ncclComm *ncclComm_t -cdef extern from "raft_core/handle.hpp" namespace "raft": +cdef extern from "raft/handle.hpp" namespace "raft": cdef cppclass handle_t: handle_t() except + From 413a1086af9713abf4add39bc3f0eea29e6c3766 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Mar 2022 12:21:55 -0400 Subject: [PATCH 107/167] More updates --- build.sh | 6 ++++- cpp/CMakeLists.txt | 19 ++++++------- cpp/cmake/modules/raft_export.cmake | 33 ++++++++++++----------- cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 ++- cpp/cmake/thirdparty/get_mdspan.cmake | 1 + 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/build.sh b/build.sh index c5f4bec764..23c5635614 100755 --- a/build.sh +++ b/build.sh @@ -70,7 +70,7 @@ NVTX=OFF CLEAN=0 UNINSTALL=0 DISABLE_DEPRECATION_WARNINGS=ON -CMAKE_TARGET=";" +CMAKE_TARGET="" INSTALL_TARGET="" # Set defaults for vars that may not have been defined externally @@ -161,6 +161,10 @@ if hasArg uninstall; then UNINSTALL=1 fi +if [[ ${CMAKE_TARGET} == "" ]]; then + CMAKE_TARGET="all" +fi + # If clean given, run it prior to any other steps if (( ${CLEAN} == 1 )); then # If the dirs to clean are mounted dirs in a container, the diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f5aca7c50..06cfad1e00 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -134,14 +134,6 @@ endif() ############################################################################## # - raft --------------------------------------------------------------------- -set(RAFT_LINK_LIBS - CUDA::cublas - 
CUDA::curand - CUDA::cusolver - CUDA::cudart - CUDA::cusparse - rmm::rmm) - add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) @@ -152,7 +144,12 @@ target_include_directories(raft INTERFACE target_link_libraries(raft INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> - ${RAFT_LINK_LIBS} + CUDA::cublas + CUDA::curand + CUDA::cusolver + CUDA::cudart + CUDA::cusparse + rmm::rmm $<$:cuco::cuco> std::mdspan) @@ -363,7 +360,7 @@ include(cmake/modules/raft_export.cmake) raft_export(INSTALL raft COMPONENTS nn distance EXPORT_SET raft-exports - GLOBAL_TARGETS nn distance + GLOBAL_TARGETS raft nn distance NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) @@ -373,7 +370,7 @@ raft_export(INSTALL raft raft_export(BUILD raft EXPORT_SET raft-exports COMPONENTS nn distance - GLOBAL_TARGETS raft_distance raft_nn + GLOBAL_TARGETS raft raft_distance raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string) diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index 6a66b5420b..4121088bb3 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -138,6 +138,7 @@ function(raft_export type project_name) # Choose the project version when an explicit version isn't provided set(RAPIDS_VERSION "${PROJECT_VERSION}") endif() + if(rapids_version_set) include("${rapids-cmake-dir}/export/detail/parse_version.cmake") rapids_export_parse_version(${RAPIDS_VERSION} rapids_orig rapids_project_version) @@ -173,18 +174,18 @@ function(raft_export type project_name) set(scratch_dir "${PROJECT_BINARY_DIR}/rapids-cmake/${project_name}/export") - configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" - "${scratch_dir}/${project_name}-config.cmake" - INSTALL_DESTINATION "${install_location}") + configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" + "${scratch_dir}/${project_name}-config.cmake" + INSTALL_DESTINATION "${install_location}") if(rapids_version_set) write_basic_package_version_file( - "${scratch_dir}/${project_name}-config-version.cmake" VERSION ${rapids_project_version} - COMPATIBILITY ${rapids_project_version_compat}) + "${scratch_dir}/${project_name}-config-version.cmake" VERSION ${rapids_project_version} + COMPATIBILITY ${rapids_project_version_compat}) endif() - install(EXPORT ${RAPIDS_EXPORT_SET} FILE ${project_name}-targets.cmake - NAMESPACE ${RAPIDS_PROJECT_VERSION} DESTINATION "${install_location}") + install(EXPORT ${RAPIDS_EXPORT_SET} FILE ${project_name}-targets.cmake + NAMESPACE ${RAPIDS_PROJECT_VERSION} DESTINATION "${install_location}") if(TARGET rapids_export_install_${RAPIDS_EXPORT_SET}) include("${rapids-cmake-dir}/export/write_dependencies.cmake") @@ -206,32 +207,32 @@ function(raft_export type project_name) else() set(install_location "${PROJECT_BINARY_DIR}") configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" - "${install_location}/${project_name}-config.cmake" - INSTALL_DESTINATION "${install_location}") + "${install_location}/${project_name}-config.cmake" + INSTALL_DESTINATION "${install_location}") 
if(rapids_version_set) write_basic_package_version_file( - "${install_location}/${project_name}-config-version.cmake" VERSION ${rapids_project_version} - COMPATIBILITY ${rapids_project_version_compat}) + "${install_location}/${project_name}-config-version.cmake" VERSION ${rapids_project_version} + COMPATIBILITY ${rapids_project_version_compat}) endif() export(EXPORT ${RAPIDS_EXPORT_SET} NAMESPACE ${RAPIDS_PROJECT_VERSION} - FILE "${install_location}/${project_name}-targets.cmake") + FILE "${install_location}/${project_name}-targets.cmake") if(TARGET rapids_export_build_${RAPIDS_EXPORT_SET}) include("${rapids-cmake-dir}/export/write_dependencies.cmake") rapids_export_write_dependencies(BUILD ${RAPIDS_EXPORT_SET} - "${install_location}/${project_name}-dependencies.cmake") + "${install_location}/${project_name}-dependencies.cmake") endif() if(DEFINED RAPIDS_LANGUAGES) include("${rapids-cmake-dir}/export/write_language.cmake") foreach(lang IN LISTS RAPIDS_LANGUAGES) rapids_export_write_language(BUILD ${lang} - "${install_location}/${project_name}-${lang}-language.cmake") + "${install_location}/${project_name}-${lang}-language.cmake") endforeach() endif() endif() -endfunction() +endfunction() \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index a81de940d1..a018341b24 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,7 +16,9 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) + endfunction() find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 6cd2b5c96c..12ac7ab0fd 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -17,6 +17,7 @@ 
function(find_and_configure_mdspan VERSION) mdspan ${VERSION} GLOBAL_TARGETS std::mdspan BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/mdspan.git GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 From 94881c28d0bbfaf2f51dded913b4dd379f882f0d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Mar 2022 12:30:02 -0400 Subject: [PATCH 108/167] More udpates --- build.sh | 3 +-- cpp/cmake/modules/raft_export.cmake | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/build.sh b/build.sh index 23c5635614..ff580705fb 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench -v -g --install --remove-cmake-deps --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --remove-cmake-deps --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -39,7 +39,6 @@ HELP="$0 [ ...] [ ...] --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft --install - install cmake targets - --clean - perform clean of all build directories --nvtx - enable nvtx for profiling support --show_depr_warn - show cmake deprecation warnings -h - print this text diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index 4121088bb3..e89a9c5ee6 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From aa805b0f9f320948b137d20b46aa38b49310804b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 17 Mar 2022 10:17:33 -0400 Subject: [PATCH 109/167] Updating README and BUILD docs to remove raft core. --- BUILD.md | 73 +++++++++++++++++++++++++++++-------------------------- README.md | 19 ++++++--------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/BUILD.md b/BUILD.md index ea82276ec1..bcf7d49c0a 100644 --- a/BUILD.md +++ b/BUILD.md @@ -19,20 +19,20 @@ ### CUDA/GPU Requirements - CUDA Toolkit 11.0+ - NVIDIA driver 450.80.02+ -- Pascal architecture of better (Compute capability >= 6.0) +- Pascal architecture of better (compute capability >= 6.0) ### Build Dependencies -Below are the dependencies for building RAFT from source. Many of these dependencies can be installed with [conda](https://anaconda.org) or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm). +In addition to the libraries included with cudatoolkit 11.0+, there are some other dependencies below for building RAFT from source. Many of the dependencies are optional and depend only on the primitives being used. All of these can be installed with cmake or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm) and many of them can be installed with [conda](https://anaconda.org). 
#### Required - [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - [RMM](https://github.com/rapidsai/rmm) -- [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - [mdspan](https://github.com/rapidsai/mdspan) #### Optional - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API +- [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API - [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `Pyraft` - [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `Pyraft` @@ -40,24 +40,22 @@ Below are the dependencies for building RAFT from source. Many of these dependen - [Googlebench](https://github.com/google/benchmark) - Needed to build benchmarks - [Doxygen](https://github.com/doxygen/doxygen) - Needed to build docs +C++ RAFT is a header-only library but provides the option of building shared libraries with template instantiations for common types to speed up compile times for larger projects. -C++ RAFT is a header-only library with the option of building shared libraries with template instantiations for common types to speed up compile times for larger projects. - -The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python code and provides options for building and installing the core headers, tests, benchmarks, and individual shared libraries. +The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python artifacts and provides options for building and installing the core headers, tests, benchmarks, and individual shared libraries. 
### Header-only C++ -`build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`. +`build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies which are not already installed. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`. -The following example will download the needed dependencies and install the `raft core` headers in `$INSTALL_PREFIX/include/raft`. The `raft core` headers are a subset of the RAFT headers which are safe to install and expose through public APIs in consuming projects as they require only RMM and the libraries provided by the CUDA toolkit. +The following example will download the needed dependencies and install the RAFT headers into `$INSTALL_PREFIX/include/raft`. The `--install` flag can be omitted to just have the build download the needed dependencies. Since RAFT is primarily used at build-time, the dependencies will never be installed by the RAFT build, with the exception of building FAISS statically into the shared libraries. ```bash -./build.sh libraft +./build.sh libraft --install ``` ###C++ Shared Libraries (optional) -Shared libraries can be built to speed up compile times for larger libraries which may heavily utilize some of the APIs. These shared libraries can also significantly improve re-compile times while developing against the APIs. 
Build all the shared libraries by passing `--compile-libs` flag to `build.sh`: - +For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: ```bash ./build.sh libraft --compile-libs ``` @@ -67,11 +65,11 @@ Individual shared libraries have their own flags and multiple can be used (thoug ./build.sh libraft --compile-nn --compile-dist ``` -The `--install` flag can be passed to `build.sh` to install the shared libraries. +Add the `--install` flag to the above example to also install the shared libraries into `$INSTALL_PREFIX/lib`. ###Tests -Compile the tests using the `tests` target in `build.sh`: +Compile the tests using the `tests` target in `build.sh`. By default, the shared libraries are assumed to be already built and on the library path. Add `--compile-libs` to also compile them. ```bash ./build.sh libraft tests --compile-libs ``` @@ -134,31 +132,40 @@ Currently, shared libraries are provided for the `libraft-nn` and `libraft-dista Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. 
The following example will install create and install dependencies for a CUDA 11.5 conda environment: ```bash -conda env create --name raft_env -f conda/environments/raft_dev_cuda11.5.yml -conda activate raft_env +mamba env create --name raft_env -f conda/environments/raft_dev_cuda11.5.yml +mamba activate raft_env ``` -The Python API can be built using the `build.sh` script: +The Python APIs can be built using the `build.sh` script: ```bash -./build.sh pyraft +./build.sh pyraft pylibraft ``` -`setup.py` can also be used to build the Python API manually: +`setup.py` can also be used to build the Python APIs manually: ```bash -cd python +cd python/raft +python setup.py build_ext --inplace +python setup.py install + +cd python/pylibraft python setup.py build_ext --inplace python setup.py install ``` To run the Python tests: ```bash -cd python +cd python/raft py.test -s -v raft + +cd python pylibraft +py.test -s -v pylibraft ``` ## Using RAFT in downstream projects +There are two different strategies for including RAFT in downstream projects, depending on whether or not the required dependencies are already installed and available on the `lib` and `include` paths. + ### C++ header-only integration using cmake When the needed [build dependencies](#required_depenencies) are already satisfied, RAFT can be trivially integrated into downstream projects by cloning the repository and adding `cpp/include` from RAFT to the include path: @@ -171,25 +178,18 @@ ExternalProject_Add(raft CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "") -set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") +set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/raft/cpp/include CACHE STRING "RAFT include variable") ``` -The RAFT headers are broken down into two different include paths so that core headers can be isolated between projects while public API headers can be installed globally, exposed to users through public APIs, and shared across projects. 
-- `cpp/include/raft` contains public API headers that require only rmm and the cudatoolkit libraries. These are safe to expose on public APIs and don't require `nvcc` to compile. -- `cpp/include/raft` contains the core of the RAFT header-only library, containing primitives, algorithms, and other tools. - -If RAFT has already been installed, such as by using the `build.sh` script, -Use `find_package(raft)` and the `raft::raft` target if using RAFT to interact only with the public APIs of consuming projects. - -Use `find_package(raft COMPONENTS core)` and both the `raft::raft` and `raft::core` targets when building a library that uses headers in `include/raft`. +If RAFT has already been installed, such as by using the `build.sh` script, use `find_package(raft)` and the `raft::raft` target if using RAFT to interact only with the public APIs of consuming projects. ### Using pre-compiled shared libraries -Use `find_package(raft COMPONENTS core nn distance)` to enable the shared libraries and pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available in addition to `raft::raft` and `raft::core` for configuring linking paths. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). +Use `find_package(raft COMPONENTS core nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). -The pre-compiled libraries contain template specializations for commonly used types. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. 
By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. +The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. -The following example ignores the pre-compiled templates for the `libraft-distance` API so the symbols from pre-compiled shared library will be used: +The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into pre-compiled shared library will be used instead: ```c++ #include #include @@ -197,7 +197,9 @@ The following example ignores the pre-compiled templates for the `libraft-distan ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target for `include/raft` headers and `raft::core` for the `include/raft` headers. The `COMPILE_LIBRARIES` option enables the building of the shared libraries. +RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). 
+ +The following example is similar to invoking `find_package(raft)` but uses `rapids_cpm_find`, which provides a richer and more flexible configuration landscape by using CPM to fetch any dependencies not already available to the build. The `raft::raft` link target will be made available and it's recommended that it be used as a `PRIVATE` link dependency in downstream projects. The `COMPILE_LIBRARIES` option enables the building the shared libraries. The following `cmake` snippet enables a flexible configuration of RAFT: @@ -227,7 +229,6 @@ function(find_and_configure_raft) # Add components #----------------------------------------------------- - string(APPEND RAFT_COMPONENTS "core") if(PKG_USE_NN_LIBRARY) string(APPEND RAFT_COMPONENTS " nn") endif() @@ -279,6 +280,8 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 ) ``` +If using the nearest neighbors APIs without the shared libraries, set `ENABLE_NN_DEPENDENCIES=ON` and keep `USE_NN_LIBRARY=OFF` + ### Python/Cython Integration -Once installed, RAFT's Python library can be imported and used directly. +Once installed, RAFT's Python library can be added to downstream conda recipes, imported and used directly. diff --git a/README.md b/README.md index e91744c835..ee533aa46c 100755 --- a/README.md +++ b/README.md @@ -24,8 +24,8 @@ The algorithms in RAFT span the following general categories: RAFT provides a header-only C++ library and pre-compiled shared libraries that can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers. RAFT also provides 2 Python libraries: -- `pyraft` - reusable infrastructure for building analytics, such as tools for building multi-node multi-GPU algorithms that leverage [Dask](https://dask.org/). - `pylibraft` - cython wrappers around RAFT algorithms and primitives. +- `pyraft` - reusable infrastructure for building analytics, such as tools for building multi-node multi-GPU algorithms that leverage [Dask](https://dask.org/). 
## Getting started @@ -98,7 +98,7 @@ pairwise_distance(input, input, output, "euclidean") ## Installing -RAFT can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. +RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](BUILD.md) for more a comprehensive guide on building RAFT and using it in downstream projects. ### Conda @@ -114,7 +114,7 @@ Use the following command to install RAFT with conda (use `-c rapidsai-nightly` conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft ``` -After installing RAFT, `find_package(raft COMPONENTS backend nn distance)` can be used in your CUDA/C++ build. Note that the `COMPONENTS` are optional and will depend on the packages installed. +After installing RAFT, `find_package(raft COMPONENTS backend nn distance)` can be used in your CUDA/C++ build. `COMPONENTS` are optional and will depend on the packages installed. ### CPM @@ -127,7 +127,6 @@ After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids set(RAFT_VERSION "22.04") set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") -set(RAFT_COMPONENTS "core") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC @@ -147,7 +146,6 @@ function(find_and_configure_raft) GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git GIT_TAG ${PKG_PINNED_TAG} SOURCE_SUBDIR cpp - FIND_PACKAGE_ARGUMENTS "COMPONENTS ${RAFT_COMPONENTS}" OPTIONS "BUILD_TESTS OFF" "BUILD_BENCH OFF" @@ -172,10 +170,9 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 Several cmake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. 
The `raft::raft` target will always be available. -| Component | Target | Description | Dependencies | +| Component | Target | Description | Base Dependencies | | --- | --- | --- | --- | -| n/a | `raft::core` | Only RAFT core headers. These are very lightweight and safe to expose in public APIs. | Cudatoolkit libraries, RMM | -| core | `raft::raft` | Full RAFT header library | std::mdspan, cuCollections, Thrust, NVTools | +| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit library, RMM, std::mdspan, cuCollections, Thrust, NVTools | | distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::raft | | nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::raft, FAISS | @@ -184,14 +181,14 @@ Several cmake targets can be made available by adding components in the table be The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: 1. Create an environment with the needed dependencies: ``` -conda env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml -conda activate raft_dev +mamba env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml +mamba activate raft_dev ``` ``` ./build.sh pyraft pylibraft libraft tests bench --compile-libs ``` -The [Build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) guide. +The [build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) section of the build instructions. ## Folder Structure and Contents From 110b26f514c3d68c6b27aaaf4064f6544ced68b1 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 17 Mar 2022 10:24:28 -0400 Subject: [PATCH 110/167] More updates to dev guide --- CONTRIBUTING.md | 2 +- DEVELOPER_GUIDE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 828986e190..faf777ba42 100755 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,7 @@ into three categories: ### Your first issue -1. Read the project's [README.md](https://github.com/rapidsai/RAFT/blob/main/README.md) +1. Read the project's [README.md](https://github.com/rapidsai/raft) to learn how to setup the development environment 2. Find an issue to work on. The best way is to look for the [good first issue](https://github.com/rapidsai/RAFT/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) or [help wanted](https://github.com/rapidsai/RAFT/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index a045d13991..1355d83e86 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -4,7 +4,7 @@ Devloping features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts. -The process for working on a CUDA/C++ feature which spans RAFT and one or more consumers can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#building-raft-c-from-source) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. 
The pin should be reverted after the changed is merged to the RAFT project and before it is merged to the dependent project(s) downstream. +The process for working on a CUDA/C++ feature which spans RAFT and one or more consumers can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#install_header_only_cpp) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. The pin should be reverted after the changed is merged to the RAFT project and before it is merged to the dependent project(s) downstream. If building a feature which spans projects and not using the source build in cmake, the RAFT changes (both C++ and Python) will need to be installed into the environment of the consuming project before they can be used. The ideal integration of RAFT into consuming projects will enable both the source build in the consuming project only for this case but also rely on a more stable packaging (such as conda packaging) otherwise. From 66c9f9cb28ebc94a16099e32619394f9d95686c1 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 17 Mar 2022 16:19:28 -0400 Subject: [PATCH 111/167] Couple of small fixes --- README.md | 2 +- build.sh | 2 +- ci/cpu/build.sh | 4 +-- ci/cpu/upload.sh | 10 ++++---- cpp/include/raft/random/rng.cuh | 2 +- cpp/include/raft/random/rng.hpp | 2 +- python/raft/record.txt | 44 --------------------------------- 7 files changed, 11 insertions(+), 55 deletions(-) delete mode 100644 python/raft/record.txt diff --git a/README.md b/README.md index ee533aa46c..69003a82ac 100755 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ Use the following command to install RAFT with conda (use `-c rapidsai-nightly` conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft ``` -After installing RAFT, `find_package(raft COMPONENTS backend nn distance)` can be used in your CUDA/C++ build. `COMPONENTS` are optional and will depend on the packages installed. +After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. `COMPONENTS` are optional and will depend on the packages installed. ### CPM diff --git a/build.sh b/build.sh index ff580705fb..0c3fbaccb6 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --remove-cmake-deps --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" HELP="$0 [ ...] [ ...] 
where is: clean - remove all existing build artifacts and configuration (start over) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f7f777c791..f487e268f8 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,7 +85,7 @@ gpuci_mamba_retry install -c conda-forge boa ############################################################################### if [ "$BUILD_LIBRAFT" == '1' ]; then - gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-client-api" + gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-headers" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_nn @@ -107,7 +107,7 @@ if [ "$BUILD_LIBRAFT" == '1' ]; then mv ${CONDA_BLD_DIR}/work ${CONDA_BLD_DIR}/libraft_distance/work fi else - gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-client-api" + gpuci_logger "SKIPPING build of conda packages for libraft-nn, libraft-distance and libraft-headers" fi if [ "$BUILD_RAFT" == '1' ]; then diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 4cbfe4e8d8..822c15f0e1 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -31,7 +31,7 @@ fi gpuci_logger "Get conda file output locations" -export LIBRAFT_CLIENT_API_FILE=`conda build --croot ${CONDA_BLD_DIR} -c ${CONDA_LOCAL_CHANNEL} conda/recipes/libraft_headers --output` +export LIBRAFT_HEADERS_FILE=`conda build --croot ${CONDA_BLD_DIR} -c ${CONDA_LOCAL_CHANNEL} conda/recipes/libraft_headers --output` export LIBRAFT_NN_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} -c ${CONDA_LOCAL_CHANNEL} conda/recipes/libraft_nn --output` export LIBRAFT_DISTANCE_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} -c ${CONDA_LOCAL_CHANNEL} conda/recipes/libraft_distance --output` export PYRAFT_FILE=`conda build 
--croot ${CONDA_BLD_DIR} -c ${CONDA_LOCAL_CHANNEL} conda/recipes/pyraft --python=$PYTHON --output` @@ -45,10 +45,10 @@ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBRAFT" == "1" && "$UPLOAD_LIBRAFT" == "1" ]]; then - test -e ${LIBRAFT_CLIENT_API_FILE} - echo "Upload libraft-client-api" - echo ${LIBRAFT_CLIENT_API_FILE} - gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBRAFT_CLIENT_API_FILE} --no-progress + test -e ${LIBRAFT_HEADERS_FILE} + echo "Upload libraft-headers" + echo ${LIBRAFT_HEADERS_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBRAFT_HEADERS_FILE} --no-progress test -e ${LIBRAFT_NN_FILE} echo "Upload libraft-nn" diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index cd066c41a9..3e75b2ae74 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -83,7 +83,7 @@ class Rng : public detail::RngImpl { /** * @brief ctor * @param _s 64b seed used to initialize the RNG - * @param _t core device RNG generator type + * @param _t backend device RNG generator type * @note Refer to the `Rng::seed` method for details about seeding the engine */ Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl(_s, _t) {} diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index 450feb563a..2d1af6a97e 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -87,7 +87,7 @@ class Rng : public detail::RngImpl { /** * @brief ctor * @param _s 64b seed used to initialize the RNG - * @param _t core device RNG generator type + * @param _t backend device RNG generator type * @note Refer to the `Rng::seed` method for details about seeding the engine */ Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl(_s, _t) {} diff --git a/python/raft/record.txt b/python/raft/record.txt deleted file mode 100644 index 
61d6e527ff..0000000000 --- a/python/raft/record.txt +++ /dev/null @@ -1,44 +0,0 @@ -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/_version.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/test_comms.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/conftest.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/test_raft.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/test_interruptible.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/include_test/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__init__.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/comms.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/utils.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/ucx.py -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/handle.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/__init__.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/cuda.pxd -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/interruptible.pxd 
-/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/__pycache__/_version.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/test_comms.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/conftest.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/test_raft.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/test/__pycache__/test_interruptible.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/include_test/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/common/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/__init__.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/comms.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/utils.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft/dask/common/__pycache__/ucx.cpython-39.pyc -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/common/cuda.cpython-39-x86_64-linux-gnu.so 
-/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/common/handle.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/common/interruptible.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/dask/common/comms_utils.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/dask/common/nccl.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/jects/raft/python/raft/raft/include_test/raft_include_test.cpython-39-x86_64-linux-gnu.so -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/top_level.txt -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/PKG-INFO -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/SOURCES.txt -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/requires.txt -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/dependency_links.txt -/home/cjnolet/miniconda3/envs/cuml_2204_022222_2/lib/python3.9/site-packages/raft-22.4.0a0+159.g55c3896c4-py3.9.egg-info/not-zip-safe From bdb75bace3563d83ff2d534647300305178d050e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 17 Mar 2022 16:23:49 -0400 Subject: [PATCH 112/167] A few more doc updates I missed --- BUILD.md | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/BUILD.md b/BUILD.md index bcf7d49c0a..e88480f9af 100644 --- a/BUILD.md +++ b/BUILD.md @@ -42,7 +42,7 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth C++ RAFT is a header-only library but provides the option of building shared libraries with template instantiations for common types to speed up compile times for larger projects. -The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python artifacts and provides options for building and installing the core headers, tests, benchmarks, and individual shared libraries. +The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python artifacts and provides options for building and installing the headers, tests, benchmarks, and individual shared libraries. ### Header-only C++ @@ -185,7 +185,7 @@ If RAFT has already been installed, such as by using the `build.sh` script, use ### Using pre-compiled shared libraries -Use `find_package(raft COMPONENTS core nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). +Use `find_package(raft COMPONENTS nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. 
These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. diff --git a/README.md b/README.md index 69003a82ac..83aa42d74d 100755 --- a/README.md +++ b/README.md @@ -103,13 +103,13 @@ RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https: ### Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-headers` contains a subset of CUDA/C++ headers that can be safely included in public APIs because they depend only upon the cudatoolkit libraries and can be safely compiled without `nvcc`. The client APIs are also more stable across versions so they can be safely installed globally in an environment with projects which might have been built with different versions of RAFT. +- `libraft-headers` RAFT headers - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. - `libraft-distance` (optional) contains shared libraries for distance primitives. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives - `pyraft` (optional) contains reusable Python infrastructure and tools to accelerate Python algorithm development. 
-Use the following command to install RAFT with conda (use `-c rapidsai-nightly` for more up-to-date but less stable nightly packages) +Use the following command to install RAFT with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages) ```bash conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft ``` From 0fcff5de3b06eec474cd9c79fd1eca236150de4d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 17 Mar 2022 18:14:28 -0400 Subject: [PATCH 113/167] Review feedback --- ci/gpu/build.sh | 5 ++--- ci/local/README.md | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 919c83ff64..4427362103 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -37,8 +37,6 @@ export SCCACHE_BUCKET="rapids-sccache" export SCCACHE_REGION="us-west-2" export SCCACHE_IDLE_TIMEOUT="32768" -export LIBRAFT_CONDA_PACKAGES="$WORKSPACE/ci/artifacts/raft/cpu/.conda-bld/linux-64" - ################################################################################ # SETUP - Check environment ################################################################################ @@ -77,7 +75,8 @@ set +x # Install pre-built conda packages from previous CI step gpuci_logger "Install libraft conda packages from CPU job" -gpuci_mamba_retry install --use-local "${LIBRAFT_CONDA_PACKAGES}/libraft*.bz2" +export LIBRAFT_CONDA_PACKAGES="$WORKSPACE/ci/artifacts/raft/cpu/.conda-bld/" # notice there is no `linux-64` here +gpuci_mamba_retry install -c "${LIBRAFT_CONDA_PACKAGES}" libraft-headers libraft-distance libraft-nn gpuci_logger "Check compiler versions" python --version diff --git a/ci/local/README.md b/ci/local/README.md index 7126a3973d..bae3b278f0 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -23,7 +23,7 @@ where: ``` Example Usage: -`bash build.sh -r ~/rapids/raft -i gpuci/rapidsai-base:cuda11.5-ubuntu21.04-py3.8` +`bash build.sh -r ~/rapids/raft -i 
gpuci/rapidsai-base:cuda11.5-ubuntu20.04-py3.8` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page. From 4fc2656dfa5e4446c5c093349cef04bd348aed24 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 18 Mar 2022 12:11:16 -0400 Subject: [PATCH 114/167] Updating docs for 22.04 --- cpp/doxygen/Doxyfile.in | 8 +- cpp/include/raft/cache/cache_util.cuh | 2 +- cpp/include/raft/device_atomics.cuh | 2 +- cpp/include/raft/distance/distance.cuh | 27 +- cpp/include/raft/distance/distance.hpp | 446 +---------------- cpp/include/raft/linalg/map.hpp | 38 +- cpp/include/raft/linalg/qr.hpp | 62 +-- cpp/include/raft/matrix/math.hpp | 451 +----------------- cpp/include/raft/random/make_blobs.cuh | 7 +- cpp/include/raft/random/make_blobs.hpp | 160 +------ cpp/include/raft/random/rng.hpp | 363 +------------- cpp/include/raft/sparse/csr.hpp | 4 +- cpp/include/raft/sparse/distance/distance.hpp | 120 +---- .../raft/sparse/hierarchy/single_linkage.hpp | 49 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 16 +- cpp/include/raft/sparse/linalg/symmetrize.hpp | 152 +----- cpp/include/raft/sparse/mst/mst.hpp | 41 +- .../raft/spectral/modularity_maximization.cuh | 28 +- .../raft/spectral/modularity_maximization.hpp | 76 +-- cpp/include/raft/spectral/partition.cuh | 27 +- cpp/include/raft/spectral/partition.hpp | 85 +--- docs/source/cpp_api.rst | 10 +- docs/source/cpp_api/clustering.rst | 25 + docs/source/cpp_api/core.rst | 26 +- docs/source/cpp_api/distributed.rst | 9 + docs/source/cpp_api/linalg.rst | 8 + docs/source/cpp_api/matrix.rst | 8 + docs/source/cpp_api/nn.rst | 14 - docs/source/cpp_api/optimization.rst | 19 + docs/source/cpp_api/random.rst | 12 + docs/source/cpp_api/sparse.rst | 5 +- docs/source/cpp_api/spatial.rst | 9 +- docs/source/cpp_api/stats.rst | 8 + docs/source/cuda_cpp.rst | 2 +- docs/source/index.rst | 4 +- docs/source/python_api.rst | 6 +- 36 files changed, 199 insertions(+), 2130 
deletions(-) create mode 100644 docs/source/cpp_api/clustering.rst create mode 100644 docs/source/cpp_api/matrix.rst delete mode 100644 docs/source/cpp_api/nn.rst create mode 100644 docs/source/cpp_api/optimization.rst create mode 100644 docs/source/cpp_api/random.rst diff --git a/cpp/doxygen/Doxyfile.in b/cpp/doxygen/Doxyfile.in index c83224050e..2ca265c454 100644 --- a/cpp/doxygen/Doxyfile.in +++ b/cpp/doxygen/Doxyfile.in @@ -798,7 +798,7 @@ INPUT_ENCODING = UTF-8 # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, # *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. -FILE_PATTERNS = *.hpp +FILE_PATTERNS = *.hpp *.cuh # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. @@ -814,8 +814,7 @@ RECURSIVE = YES # run. EXCLUDE = @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/linalg/symmetrize.hpp \ # Contains device code - @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/csr.hpp \ # Contains device code - @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/detail/cusparse_wrappers.h + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/csr.hpp # Contains device code # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -832,8 +831,7 @@ EXCLUDE_SYMLINKS = NO # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = */detail/* \ - */specializations/* \ - */spectral/* + */specializations/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index dc9327bb94..66f132d0c8 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -111,7 +111,7 @@ __global__ void store_vecs(const math_t* tile, * @brief Map a key to a cache set. 
* * @param key key to be hashed - * @param n_cache_set number of cache sets + * @param n_cache_sets number of cache sets * @return index of the cache set [0..n_cache_set) */ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index e3b324d030..8169d9f1dc 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -462,7 +462,7 @@ struct typesAtomicCASImpl { * int8_t, int16_t, int32_t, int64_t, float, double * * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed + * @param[in] update_value The value to be computed * @param[in] op The binary operator used for compute * * @returns The old value at `address` diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index e13cfd94f8..e1905f30da 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -25,9 +25,15 @@ #include +/** + * @defgroup pairwise_distance pairwise distance prims + * @{ + */ + namespace raft { namespace distance { + /** * @brief Evaluate pairwise distances with the user epilogue lamba allowed * @tparam DistanceType which distance to evaluate @@ -267,12 +273,11 @@ void distance(raft::handle_t const& handle, } /** - * @defgroup pairwise_distance pairwise distance prims - * @{ * @brief Convenience wrapper around 'distance' prim to convert runtime metric * into compile time for the purpose of dispatch * @tparam Type input/accumulation/output data-type * @tparam Index_ indexing type + * @param handle raft handle for managing expensive resources * @param x first set of points * @param y second set of points * @param dist output distance matrix @@ -282,8 +287,8 @@ void distance(raft::handle_t const& handle, * @param workspace temporary workspace buffer which can get resized as per the * needed workspace size * @param metric 
distance metric - * @param stream cuda stream * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) */ template void pairwise_distance(const raft::handle_t& handle, @@ -363,15 +368,13 @@ void pairwise_distance(const raft::handle_t& handle, default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } -/** @} */ /** - * @defgroup pairwise_distance pairwise distance prims - * @{ * @brief Convenience wrapper around 'distance' prim to convert runtime metric * into compile time for the purpose of dispatch * @tparam Type input/accumulation/output data-type * @tparam Index_ indexing type + * @param handle raft handle for managing expensive resources * @param x first set of points * @param y second set of points * @param dist output distance matrix @@ -379,8 +382,8 @@ void pairwise_distance(const raft::handle_t& handle, * @param n number of points in y * @param k dimensionality * @param metric distance metric - * @param stream cuda stream * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) */ template void pairwise_distance(const raft::handle_t& handle, @@ -400,20 +403,16 @@ void pairwise_distance(const raft::handle_t& handle, } /** - * @defgroup pairwise_distance pairwise distance prims - * @{ * @brief Convenience wrapper around 'distance' prim to convert runtime metric * into compile time for the purpose of dispatch * @tparam Type input/accumulation/output data-type * @tparam Index_ indexing type + * @param handle raft handle for managing expensive resources * @param x first matrix of points (size mxk) * @param y second matrix of points (size nxk) * @param dist output distance matrix (size mxn) - * @param workspace temporary workspace buffer which can get resized as per the - * needed workspace size * @param metric distance metric - * @param stream cuda stream - * @param isRowMajor 
whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) */ template void pairwise_distance(raft::handle_t const& handle, @@ -454,4 +453,6 @@ void pairwise_distance(raft::handle_t const& handle, }; // namespace distance }; // namespace raft +/** @} */ + #endif \ No newline at end of file diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 66b4efcede..783a362797 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -18,450 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __DISTANCE_H -#define __DISTANCE_H - #pragma once -#include -#include -#include -#include - -#include - -namespace raft { -namespace distance { - -/** - * @brief Evaluate pairwise distances with the user epilogue lamba allowed - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - * - * @note fin_op: This is a device lambda which is supposed to operate upon the - * input which is AccType and returns the output in OutType. It's signature is - * as follows:
OutType fin_op(AccType in, int g_idx);
. If one needs - * any other parameters, feel free to pass them via closure. - */ -template -void distance(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - detail::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required - */ -template -void distance(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - cudaStream_t stream, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - detail::distance( - x, y, dist, m, n, k, workspace, worksize, stream, isRowMajor, metric_arg); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param 
m number of points in x - * @param n number of points in y - * @param k dimensionality - * - * @note If the specified distanceType doesn't need the workspace at all, it - * returns 0. - */ -template -size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) -{ - return detail::getWorkspaceSize(x, y, m, n, k); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points (size m*k) - * @param y second set of points (size n*k) - * @return number of bytes needed in workspace - * - * @note If the specified distanceType doesn't need the workspace at all, it - * returns 0. - */ -template -size_t getWorkspaceSize(const raft::device_matrix_view& x, - const raft::device_matrix_view& y) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - - return getWorkspaceSize( - x.data(), y.data(), x.extent(0), y.extent(0), x.extent(1)); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - cudaStream_t stream, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - rmm::device_uvector workspace(0, 
stream); - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, stream); - detail::distance( - x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); -} - -/** - * @brief Evaluate pairwise distances for the simple use case. - * - * Note: Only contiguous row- or column-major layouts supported currently. - * - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points (size n*k) - * @param y second set of points (size m*k) - * @param dist output distance matrix (size n*m) - * @param metric_arg metric argument (used for Minkowski distance) - */ - -template -void distance(raft::handle_t const handle, - raft::device_matrix_view const x, - raft::device_matrix_view const y, - raft::device_matrix_view dist, - InType metric_arg = 2.0f) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - RAFT_EXPECTS(dist.extent(0) == x.extent(0), - "Number of rows in output must be equal to " - "number of rows in X"); - RAFT_EXPECTS(dist.extent(1) == y.extent(0), - "Number of columns in output must be equal to " - "number of rows in Y"); - - RAFT_EXPECTS(x.is_contiguous(), "Input x must be contiguous."); - RAFT_EXPECTS(y.is_contiguous(), "Input y must be contiguous."); - - if (x.stride(0) == 0 && y.stride(0) == 0) { - distance(x.data(), - y.data(), - dist.data(), - x.extent(0), - y.extent(0), - x.extent(1), - handle.get_stream(), - true, - metric_arg); - } else if (x.stride(0) > 0 && y.stride(0) > 0) { - distance(x.data(), - y.data(), - dist.data(), - x.extent(0), - y.extent(0), - x.extent(1), - handle.get_stream(), - false, - metric_arg); - } else { - RAFT_FAIL("x and y must both have the same layout: row-major or column-major."); - } -} - -/** - * @brief Convenience wrapper around 
'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace buffer which can get resized as per the - * needed workspace size - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument - */ -template -void pairwise_distance(const raft::handle_t& handle, - const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - rmm::device_uvector& workspace, - raft::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ - switch (metric) { - case raft::distance::DistanceType::L2Expanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::L2SqrtExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::CosineExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::L1: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::L2Unexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case 
raft::distance::DistanceType::Linf: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::HellingerExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::LpUnexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg); - break; - case raft::distance::DistanceType::Canberra: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::HammingUnexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::JensenShannon: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::RusselRaoExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::KLDivergence: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - case raft::distance::DistanceType::CorrelationExpanded: - detail:: - pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); - break; - default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); - }; -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * 
@param n number of points in y - * @param k dimensionality - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument - */ -template -void pairwise_distance(const raft::handle_t& handle, - const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - raft::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ - rmm::device_uvector workspace(0, handle.get_stream()); - pairwise_distance( - handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param handle raft handle for managing expensive resources - * @param x first matrix of points (size mxk) - * @param y second matrix of points (size nxk) - * @param dist output distance matrix (size mxn) - * @param metric distance metric - * @param metric_arg metric argument - */ -template -void pairwise_distance(raft::handle_t const& handle, - device_matrix_view const& x, - device_matrix_view const& y, - device_matrix_view& dist, - raft::distance::DistanceType metric, - Type metric_arg = 2.0f) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - RAFT_EXPECTS(dist.extent(0) == x.extent(0), - "Number of rows in output must be equal to " - "number of rows in X"); - RAFT_EXPECTS(dist.extent(1) == y.extent(0), - "Number of columns in output must be equal to " - "number of rows in Y"); - - RAFT_EXPECTS(x.is_contiguous(), "Input x must be contiguous."); - RAFT_EXPECTS(y.is_contiguous(), "Input y must be contiguous."); - RAFT_EXPECTS(dist.is_contiguous(), "Output must be contiguous."); - - bool rowmajor = x.stride(0) == 0; - - rmm::device_uvector workspace(0, handle.get_stream()); - - pairwise_distance(handle, - x.data(), - 
y.data(), - dist.data(), - x.extent(0), - y.extent(0), - x.extent(1), - metric, - rowmajor, - metric_arg); -} - -}; // namespace distance -}; // namespace raft - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index d4ee231eb1..a1ce1f3ef9 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -18,42 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __MAP_H -#define __MAP_H - #pragma once -#include "detail/map.cuh" - -namespace raft { -namespace linalg { - -/** - * @brief CUDA version of map - * @tparam InType data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @tparam OutType data-type in which the result will be stored - * @param out the output of the map operation (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ - -template -void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ - detail::mapImpl(out, len, map, stream, in, args...); -} - -} // namespace linalg -}; // namespace raft - -#endif \ No newline at end of file +#include "map.cuh" diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp index da8736b46f..ffb381dca1 100644 --- a/cpp/include/raft/linalg/qr.hpp +++ b/cpp/include/raft/linalg/qr.hpp @@ -18,66 +18,6 @@ * Please use the cuh version instead. 
*/ -#ifndef __QR_H -#define __QR_H - #pragma once -#include "detail/qr.cuh" - -namespace raft { -namespace linalg { - -/** - * @defgroup QRdecomp QR decomposition - * @{ - */ - -/** - * @brief compute QR decomp and return only Q matrix - * @param handle: raft handle - * @param M: input matrix - * @param Q: Q matrix to be returned (on GPU) - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream cuda stream - * @{ - */ -template -void qrGetQ(const raft::handle_t& handle, - const math_t* M, - math_t* Q, - int n_rows, - int n_cols, - cudaStream_t stream) -{ - detail::qrGetQ(handle, M, Q, n_rows, n_cols, stream); -} - -/** - * @brief compute QR decomp and return both Q and R matrices - * @param handle: raft handle - * @param M: input matrix - * @param Q: Q matrix to be returned (on GPU) - * @param R: R matrix to be returned (on GPU) - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream cuda stream - */ -template -void qrGetQR(const raft::handle_t& handle, - math_t* M, - math_t* Q, - math_t* R, - int n_rows, - int n_cols, - cudaStream_t stream) -{ - detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream); -} -/** @} */ - -}; // namespace linalg -}; // namespace raft - -#endif \ No newline at end of file +#include "qr.cuh" \ No newline at end of file diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index ab02c8a85f..e04764f59e 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -18,455 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __MATH_H -#define __MATH_H - #pragma once -#include "detail/math.cuh" - -namespace raft { -namespace matrix { - -/** - * @defgroup MatrixMathOp math operation on the input matrix - * @{ - */ - -/** - * @brief Power of every element in the input matrix - * @param in: input matrix - * @param out: output matrix. 
The result is stored in the out matrix - * @param scalar: every element is multiplied with scalar. - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) -{ - detail::power(in, out, scalar, len, stream); -} - -/** - * @brief Power of every element in the input matrix - * @param inout: input matrix and also the result is stored - * @param scalar: every element is multiplied with scalar. - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) -{ - detail::power(inout, scalar, len, stream); -} - -/** - * @brief Power of every element in the input matrix - * @param inout: input matrix and also the result is stored - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void power(math_t* inout, int len, cudaStream_t stream) -{ - detail::power(inout, len, stream); -} - -/** - * @brief Power of every element in the input matrix - * @param in: input matrix - * @param out: output matrix. The result is stored in the out matrix - * @param len: number elements of input matrix - * @param stream cuda stream - * @{ - */ -template -void power(math_t* in, math_t* out, int len, cudaStream_t stream) -{ - detail::power(in, out, len, stream); -} - -/** - * @brief Square root of every element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param in: input matrix and also the result is stored - * @param out: output matrix. 
The result is stored in the out matrix - * @param scalar: every element is multiplied with scalar - * @param len: number elements of input matrix - * @param stream cuda stream - * @param set_neg_zero whether to set negative numbers to zero - */ -template -void seqRoot(math_t* in, - math_t* out, - math_t scalar, - IdxType len, - cudaStream_t stream, - bool set_neg_zero = false) -{ - detail::seqRoot(in, out, scalar, len, stream, set_neg_zero); -} - -/** - * @brief Square root of every element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param inout: input matrix and also the result is stored - * @param scalar: every element is multiplied with scalar - * @param len: number elements of input matrix - * @param stream cuda stream - * @param set_neg_zero whether to set negative numbers to zero - */ -template -void seqRoot( - math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) -{ - detail::seqRoot(inout, scalar, len, stream, set_neg_zero); -} - -/** - * @brief Square root of every element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param in: input matrix and also the result is stored - * @param out: output matrix. 
The result is stored in the out matrix - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) -{ - detail::seqRoot(in, out, len, stream); -} - -/** - * @brief Square root of every element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param inout: input matrix with in-place results - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) -{ - detail::seqRoot(inout, len, stream); -} - -/** - * @brief sets the small values to zero based on a defined threshold - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param out: output matrix. The result is stored in the out matrix - * @param in: input matrix - * @param len: number elements of input matrix - * @param stream cuda stream - * @param thres threshold to set values to zero - */ -template -void setSmallValuesZero( - math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) -{ - detail::setSmallValuesZero(out, in, len, stream, thres); -} - -/** - * @brief sets the small values to zero based on a defined threshold - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param inout: input matrix and also the result is stored - * @param len: number elements of input matrix - * @param stream cuda stream - * @param thres: threshold - */ -template -void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) -{ - detail::setSmallValuesZero(inout, len, stream, thres); -} - -/** - * @brief Reciprocal of every element in the input matrix - * @tparam math_t data-type upon which the 
math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param in: input matrix and also the result is stored - * @param out: output matrix. The result is stored in the out matrix - * @param scalar: every element is multiplied with scalar - * @param len: number elements of input matrix - * @param stream cuda stream - * @param setzero round down to zero if the input is less the threshold - * @param thres the threshold used to forcibly set inputs to zero - * @{ - */ -template -void reciprocal(math_t* in, - math_t* out, - math_t scalar, - int len, - cudaStream_t stream, - bool setzero = false, - math_t thres = 1e-15) -{ - detail::reciprocal(in, out, scalar, len, stream, setzero, thres); -} - -/** - * @brief Reciprocal of every element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param inout: input matrix with in-place results - * @param scalar: every element is multiplied with scalar - * @param len: number elements of input matrix - * @param stream cuda stream - * @param setzero round down to zero if the input is less the threshold - * @param thres the threshold used to forcibly set inputs to zero - * @{ - */ -template -void reciprocal(math_t* inout, - math_t scalar, - IdxType len, - cudaStream_t stream, - bool setzero = false, - math_t thres = 1e-15) -{ - detail::reciprocal(inout, scalar, len, stream, setzero, thres); -} - -/** - * @brief Reciprocal of every element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param inout: input matrix and also the result is stored - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) -{ - detail::reciprocal(inout, len, stream); -} - -/** - * @brief Reciprocal of every 
element in the input matrix - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param in: input matrix and also the result is stored - * @param out: output matrix. The result is stored in the out matrix - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) -{ - detail::reciprocal(in, out, len, stream); -} - -/** - * @brief set values to scalar in matrix - * @tparam math_t data-type upon which the math operation will be performed - * @param out output matrix. The result is stored in the out matrix - * @param in input matrix - * @param scalar svalar value - * @param len number elements of input matrix - * @param stream cuda stream - */ -template -void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) -{ - detail::setValue(out, in, scalar, len, stream); -} - -/** - * @brief ratio of every element over sum of input vector is calculated - * @tparam math_t data-type upon which the math operation will be performed - * @tparam IdxType Integer type used to for addressing - * @param handle - * @param src: input matrix - * @param dest: output matrix. 
The result is stored in the dest matrix - * @param len: number elements of input matrix - * @param stream cuda stream - */ -template -void ratio( - const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) -{ - detail::ratio(handle, src, dest, len, stream); -} - -/** @} */ - -/** - * @brief Argmax: find the row idx with maximum value for each column - * @param in: input matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param out: output vector of size n_cols - * @param stream: cuda stream - */ -template -void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) -{ - detail::argmax(in, n_rows, n_cols, out, stream); -} - -/** - * @brief sign flip for PCA. This is used to stabilize the sign of column - * major eigen vectors. Flips the sign if the column has negative |max|. - * @param inout: input matrix. Result also stored in this parameter - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream cuda stream - */ -template -void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) -{ - detail::signFlip(inout, n_rows, n_cols, stream); -} - -/** - * @brief multiply each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void matrixVectorBinaryMult(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryMult( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief multiply each row or column of matrix 
with vector, skipping zeros in vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void matrixVectorBinaryMultSkipZero(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryMultSkipZero( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief divide each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void matrixVectorBinaryDiv(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryDiv( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief divide each row or column of matrix with vector, skipping zeros in vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - * @param return_zero result is zero if true and vector value is below threshold, original value if - * false - */ -template -void matrixVectorBinaryDivSkipZero(Type* data, - const Type* vec, - IdxType n_row, - 
IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream, - bool return_zero = false) -{ - detail::matrixVectorBinaryDivSkipZero( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream, return_zero); -} - -/** - * @brief add each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void matrixVectorBinaryAdd(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryAdd( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief subtract each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void matrixVectorBinarySub(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinarySub( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -}; // end namespace matrix -}; // end namespace raft - -#endif \ No newline at end of file +#include "math.cuh" \ No newline at end of file diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh index 088690529a..5da960fc1a 100644 --- a/cpp/include/raft/random/make_blobs.cuh +++ b/cpp/include/raft/random/make_blobs.cuh @@ -99,17 +99,12 @@ void make_blobs(DataT* out, * 
@tparam DataT output data type * @tparam IdxT indexing arithmetic type * + * @param[in] handle raft handle for managing expensive resources * @param[out] out generated data [on device] * [dim = n_rows x n_cols] * @param[out] labels labels for the generated data [on device] * [len = n_rows] - * @param[in] n_rows number of rows in the generated data - * @param[in] n_cols number of columns in the generated data * @param[in] n_clusters number of clusters (or classes) to generate - * @param[in] stream cuda stream to schedule the work on - * @param[in] row_major whether input `centers` and output `out` - * buffers are to be stored in row or column - * major layout * @param[in] centers centers of each of the cluster, pass a nullptr * if you need this also to be generated randomly * [on device] [dim = n_clusters x n_cols] diff --git a/cpp/include/raft/random/make_blobs.hpp b/cpp/include/raft/random/make_blobs.hpp index 02aef809e7..ab04684f75 100644 --- a/cpp/include/raft/random/make_blobs.hpp +++ b/cpp/include/raft/random/make_blobs.hpp @@ -19,164 +19,6 @@ * Please use the cuh version instead. 
*/ -#ifndef __MAKE_BLOBS_H -#define __MAKE_BLOBS_H - #pragma once -#include "detail/make_blobs.cuh" -#include -#include - -namespace raft::random { - -/** - * @brief GPU-equivalent of sklearn.datasets.make_blobs - * - * @tparam DataT output data type - * @tparam IdxT indexing arithmetic type - * - * @param[out] out generated data [on device] - * [dim = n_rows x n_cols] - * @param[out] labels labels for the generated data [on device] - * [len = n_rows] - * @param[in] n_rows number of rows in the generated data - * @param[in] n_cols number of columns in the generated data - * @param[in] n_clusters number of clusters (or classes) to generate - * @param[in] stream cuda stream to schedule the work on - * @param[in] row_major whether input `centers` and output `out` - * buffers are to be stored in row or column - * major layout - * @param[in] centers centers of each of the cluster, pass a nullptr - * if you need this also to be generated randomly - * [on device] [dim = n_clusters x n_cols] - * @param[in] cluster_std standard deviation of each cluster center, - * pass a nullptr if this is to be read from the - * `cluster_std_scalar`. [on device] - * [len = n_clusters] - * @param[in] cluster_std_scalar if 'cluster_std' is nullptr, then use this as - * the std-dev across all dimensions. - * @param[in] shuffle shuffle the generated dataset and labels - * @param[in] center_box_min min value of box from which to pick cluster - * centers. Useful only if 'centers' is nullptr - * @param[in] center_box_max max value of box from which to pick cluster - * centers. 
Useful only if 'centers' is nullptr - * @param[in] seed seed for the RNG - * @param[in] type RNG type - */ -template -void make_blobs(DataT* out, - IdxT* labels, - IdxT n_rows, - IdxT n_cols, - IdxT n_clusters, - cudaStream_t stream, - bool row_major = true, - const DataT* centers = nullptr, - const DataT* cluster_std = nullptr, - const DataT cluster_std_scalar = (DataT)1.0, - bool shuffle = true, - DataT center_box_min = (DataT)-10.0, - DataT center_box_max = (DataT)10.0, - uint64_t seed = 0ULL, - GeneratorType type = GenPhilox) -{ - detail::make_blobs_caller(out, - labels, - n_rows, - n_cols, - n_clusters, - stream, - row_major, - centers, - cluster_std, - cluster_std_scalar, - shuffle, - center_box_min, - center_box_max, - seed, - type); -} - -/** - * @brief GPU-equivalent of sklearn.datasets.make_blobs - * - * @tparam DataT output data type - * @tparam IdxT indexing arithmetic type - * - * @param[in] handle raft handle for managing expensive resources - * @param[out] out generated data [on device] - * [dim = n_rows x n_cols] - * @param[out] labels labels for the generated data [on device] - * [len = n_rows] - * @param[in] n_clusters number of clusters (or classes) to generate - * @param[in] centers centers of each of the cluster, pass a nullptr - * if you need this also to be generated randomly - * [on device] [dim = n_clusters x n_cols] - * @param[in] cluster_std standard deviation of each cluster center, - * pass a nullptr if this is to be read from the - * `cluster_std_scalar`. [on device] - * [len = n_clusters] - * @param[in] cluster_std_scalar if 'cluster_std' is nullptr, then use this as - * the std-dev across all dimensions. - * @param[in] shuffle shuffle the generated dataset and labels - * @param[in] center_box_min min value of box from which to pick cluster - * centers. Useful only if 'centers' is nullptr - * @param[in] center_box_max max value of box from which to pick cluster - * centers. 
Useful only if 'centers' is nullptr - * @param[in] seed seed for the RNG - * @param[in] type RNG type - */ -template -void make_blobs(raft::handle_t const& handle, - raft::device_matrix_view out, - raft::device_vector_view labels, - IdxT n_clusters = 5, - std::optional> centers = std::nullopt, - std::optional> const cluster_std = std::nullopt, - const DataT cluster_std_scalar = (DataT)1.0, - bool shuffle = true, - DataT center_box_min = (DataT)-10.0, - DataT center_box_max = (DataT)10.0, - uint64_t seed = 0ULL, - GeneratorType type = GenPhilox) -{ - if (centers.has_value()) { - RAFT_EXPECTS(centers.value().extent(0) == (std::size_t)n_clusters, - "n_centers must equal size of centers"); - } - - if (cluster_std.has_value()) { - RAFT_EXPECTS(cluster_std.value().extent(0) == (std::size_t)n_clusters, - "n_centers must equal size of cluster_std"); - } - - RAFT_EXPECTS(out.extent(0) == labels.extent(0), - "Number of labels must equal the number of row in output matrix"); - - RAFT_EXPECTS(out.is_contiguous(), "Output must be contiguous."); - - bool row_major = std::is_same::value; - - auto prm_centers = centers.has_value() ? centers.value().data() : nullptr; - auto prm_cluster_std = cluster_std.has_value() ? cluster_std.value().data() : nullptr; - - detail::make_blobs_caller(out.data(), - labels.data(), - (IdxT)out.extent(0), - (IdxT)out.extent(1), - n_clusters, - handle.get_stream(), - row_major, - prm_centers, - prm_cluster_std, - cluster_std_scalar, - shuffle, - center_box_min, - center_box_max, - seed, - type); -} -} // end namespace raft::random - -#endif \ No newline at end of file +#include "make_blobs.cuh" \ No newline at end of file diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index 2d1af6a97e..44f9c955ac 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -18,367 +18,6 @@ * Please use the cuh version instead. 
*/ -#ifndef __RNG_H -#define __RNG_H - #pragma once -#include "detail/rng_impl.cuh" - -namespace raft { -namespace random { - -using detail::RngState; - -using detail::GeneratorType; -using detail::GenPC; -using detail::GenPhilox; - -using detail::PCGenerator; -using detail::PhiloxGenerator; - -using detail::BernoulliDistParams; -using detail::ExponentialDistParams; -using detail::GumbelDistParams; -using detail::InvariantDistParams; -using detail::LaplaceDistParams; -using detail::LogisticDistParams; -using detail::LogNormalDistParams; -using detail::NormalDistParams; -using detail::NormalIntDistParams; -using detail::NormalTableDistParams; -using detail::RayleighDistParams; -using detail::SamplingParams; -using detail::ScaledBernoulliDistParams; -using detail::UniformDistParams; -using detail::UniformIntDistParams; - -// Not strictly needed due to C++ ADL rules -using detail::custom_next; - -/** - * @brief Helper method to compute Box Muller transform - * - * @tparam Type data type - * - * @param[inout] val1 first value - * @param[inout] val2 second value - * @param[in] sigma1 standard deviation of output gaussian for first value - * @param[in] mu1 mean of output gaussian for first value - * @param[in] sigma2 standard deviation of output gaussian for second value - * @param[in] mu2 mean of output gaussian for second value - * @{ - */ -template -DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) -{ - detail::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2); -} - -template -DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) -{ - detail::box_muller_transform(val1, val2, sigma1, mu1); -} -/** @} */ - -class Rng : public detail::RngImpl { - public: - /** - * @brief ctor - * @param _s 64b seed used to initialize the RNG - * @param _t backend device RNG generator type - * @note Refer to the `Rng::seed` method for details about seeding the engine - */ - Rng(uint64_t _s, GeneratorType _t = 
GenPhilox) : detail::RngImpl(_s, _t) {} - - /** - * @brief Generates the 'a' and 'b' parameters for a modulo affine - * transformation equation: `(ax + b) % n` - * - * @tparam IdxT integer type - * - * @param[in] n the modulo range - * @param[out] a slope parameter - * @param[out] b intercept parameter - */ - template - void affine_transform_params(IdxT n, IdxT& a, IdxT& b) - { - detail::RngImpl::affine_transform_params(n, a, b); - } - - /** - * @brief Generate uniformly distributed numbers in the given range - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param start start of the range - * @param end end of the range - * @param stream stream where to launch the kernel - * @{ - */ - template - void uniform(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream) - { - detail::RngImpl::uniform(ptr, len, start, end, stream); - } - - template - void uniformInt(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream) - { - detail::RngImpl::uniformInt(ptr, len, start, end, stream); - } - /** @} */ - - /** - * @brief Generate normal distributed numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param mu mean of the distribution - * @param sigma std-dev of the distribution - * @param stream stream where to launch the kernel - * @{ - */ - template - void normal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream) - { - detail::RngImpl::normal(ptr, len, mu, sigma, stream); - } - - template - void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) - { - detail::RngImpl::normalInt(ptr, len, mu, sigma, stream); - } - /** @} */ - - /** - * @brief Generate normal 
distributed table according to the given set of - * means and scalar standard deviations. - * - * Each row in this table conforms to a normally distributed n-dim vector - * whose mean is the input vector and standard deviation is the corresponding - * vector or scalar. Correlations among the dimensions itself is assumed to - * be absent. - * - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output table (dim = n_rows x n_cols) - * @param n_rows number of rows in the table - * @param n_cols number of columns in the table - * @param mu_vec mean vector (dim = n_cols x 1). - * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). Pass - * a nullptr to use the same scalar 'sigma' across all components - * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr - * @param stream stream where to launch the kernel - */ - template - void normalTable(OutType* ptr, - LenType n_rows, - LenType n_cols, - const OutType* mu_vec, - const OutType* sigma_vec, - OutType sigma, - cudaStream_t stream) - { - detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu_vec, sigma_vec, sigma, stream); - } - - /** - * @brief Fill an array with the given value - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param val value to be filled - * @param stream stream where to launch the kernel - */ - template - void fill(OutType* ptr, LenType len, OutType val, cudaStream_t stream) - { - detail::RngImpl::fill(ptr, len, val, stream); - } - - /** - * @brief Generate bernoulli distributed boolean array - * - * @tparam Type data type in which to compute the probabilities - * @tparam OutType output data type - * @tparam LenType data type used to represent length of the arrays - * - * @param[out] ptr the output array - * @param[in] len the 
number of elements in the output - * @param[in] prob coin-toss probability for heads - * @param[in] stream stream where to launch the kernel - */ - template - void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) - { - detail::RngImpl::bernoulli(ptr, len, prob, stream); - } - - /** - * @brief Generate bernoulli distributed array and applies scale - * @tparam Type data type in which to compute the probabilities - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param prob coin-toss probability for heads - * @param scale scaling factor - * @param stream stream where to launch the kernel - */ - template - void scaled_bernoulli(OutType* ptr, LenType len, OutType prob, OutType scale, cudaStream_t stream) - { - detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream); - } - - /** - * @brief Generate Gumbel distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu mean value - * @param beta scale value - * @param stream stream where to launch the kernel - * @note https://en.wikipedia.org/wiki/Gumbel_distribution - */ - template - void gumbel(OutType* ptr, LenType len, OutType mu, OutType beta, cudaStream_t stream) - { - detail::RngImpl::gumbel(ptr, len, mu, beta, stream); - } - - /** - * @brief Generate lognormal distributed numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param mu mean of the distribution - * @param sigma std-dev of the distribution - * @param stream stream where to launch the kernel - */ - template - void lognormal(OutType* ptr, LenType len, OutType mu, OutType sigma, 
cudaStream_t stream) - { - detail::RngImpl::lognormal(ptr, len, mu, sigma, stream); - } - - /** - * @brief Generate logistic distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu mean value - * @param scale scale value - * @param stream stream where to launch the kernel - */ - template - void logistic(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream) - { - detail::RngImpl::logistic(ptr, len, mu, scale, stream); - } - - /** - * @brief Generate exponentially distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param lambda the lambda - * @param stream stream where to launch the kernel - */ - template - void exponential(OutType* ptr, LenType len, OutType lambda, cudaStream_t stream) - { - detail::RngImpl::exponential(ptr, len, lambda, stream); - } - - /** - * @brief Generate rayleigh distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param sigma the sigma - * @param stream stream where to launch the kernel - */ - template - void rayleigh(OutType* ptr, LenType len, OutType sigma, cudaStream_t stream) - { - detail::RngImpl::rayleigh(ptr, len, sigma, stream); - } - - /** - * @brief Generate laplace distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu the mean - * @param scale the scale - * @param stream stream where 
to launch the kernel - */ - template - void laplace(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream) - { - detail::RngImpl::laplace(ptr, len, mu, scale, stream); - } - - void advance(uint64_t max_streams, uint64_t max_calls_per_subsequence) - { - detail::RngImpl::advance(max_streams, max_calls_per_subsequence); - } - - /** - * @brief Sample the input array without replacement, optionally based on the - * input weight vector for each element in the array - * - * Implementation here is based on the `one-pass sampling` algo described here: - * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf - * - * @note In the sampled array the elements which are picked will always appear - * in the increasing order of their weights as computed using the exponential - * distribution. So, if you're particular about the order (for eg. array - * permutations), then this might not be the right choice! - * - * @tparam DataT data type - * @tparam WeightsT weights type - * @tparam IdxT index type - * @param handle - * @param out output sampled array (of length 'sampledLen') - * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass - * a nullptr if this is not required. - * @param in input array to be sampled (of length 'len') - * @param wts weights array (of length 'len'). 
Pass a nullptr if uniform - * sampling is desired - * @param sampledLen output sampled array length - * @param len input array length - * @param stream cuda stream - */ - template - void sampleWithoutReplacement(const raft::handle_t& handle, - DataT* out, - IdxT* outIdx, - const DataT* in, - const WeightsT* wts, - IdxT sampledLen, - IdxT len, - cudaStream_t stream) - { - detail::RngImpl::sampleWithoutReplacement( - handle, out, outIdx, in, wts, sampledLen, len, stream); - } -}; - -}; // end namespace random -}; // end namespace raft - -#endif \ No newline at end of file +#include "rng.cuh" \ No newline at end of file diff --git a/cpp/include/raft/sparse/csr.hpp b/cpp/include/raft/sparse/csr.hpp index ca0e6537e4..3e0a6392c5 100644 --- a/cpp/include/raft/sparse/csr.hpp +++ b/cpp/include/raft/sparse/csr.hpp @@ -47,7 +47,7 @@ using WeakCCState = detail::WeakCCState; * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> +template void weak_cc_batched(Index_* labels, const Index_* row_ind, const Index_* row_ind_ptr, @@ -129,7 +129,7 @@ void weak_cc_batched(Index_* labels, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> +template void weak_cc(Index_* labels, const Index_* row_ind, const Index_* row_ind_ptr, diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index cba419e53a..1bae6fa533 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -18,124 +18,6 @@ * Please use the cuh version instead. 
*/ -#ifndef __SPARSE_DIST_H -#define __SPARSE_DIST_H - #pragma once -#include -#include - -#include - -#include -#include -#include -#include - -namespace raft { -namespace sparse { -namespace distance { - -static const std::unordered_set supportedDistance{ - raft::distance::DistanceType::L2Expanded, - raft::distance::DistanceType::L2Unexpanded, - raft::distance::DistanceType::L2SqrtExpanded, - raft::distance::DistanceType::L2SqrtUnexpanded, - raft::distance::DistanceType::InnerProduct, - raft::distance::DistanceType::L1, - raft::distance::DistanceType::Canberra, - raft::distance::DistanceType::Linf, - raft::distance::DistanceType::LpUnexpanded, - raft::distance::DistanceType::JaccardExpanded, - raft::distance::DistanceType::CosineExpanded, - raft::distance::DistanceType::HellingerExpanded, - raft::distance::DistanceType::DiceExpanded, - raft::distance::DistanceType::CorrelationExpanded, - raft::distance::DistanceType::RusselRaoExpanded, - raft::distance::DistanceType::HammingUnexpanded, - raft::distance::DistanceType::JensenShannon, - raft::distance::DistanceType::KLDivergence}; - -/** - * Compute pairwise distances between A and B, using the provided - * input configuration and distance function. 
- * - * @tparam value_idx index type - * @tparam value_t value type - * @param[out] out dense output array (size A.nrows * B.nrows) - * @param[in] input_config input argument configuration - * @param[in] metric distance metric to use - * @param[in] metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwiseDistance(value_t* out, - distances_config_t input_config, - raft::distance::DistanceType metric, - float metric_arg) -{ - switch (metric) { - case raft::distance::DistanceType::L2Expanded: - detail::l2_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::L2SqrtExpanded: - detail::l2_sqrt_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::InnerProduct: - detail::ip_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::L2Unexpanded: - detail::l2_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::l2_sqrt_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::L1: - detail::l1_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::LpUnexpanded: - detail::lp_unexpanded_distances_t(input_config, metric_arg).compute(out); - break; - case raft::distance::DistanceType::Linf: - detail::linf_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::Canberra: - detail::canberra_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::JaccardExpanded: - detail::jaccard_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::CosineExpanded: - detail::cosine_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::HellingerExpanded: - detail::hellinger_expanded_distances_t(input_config).compute(out); - break; - case 
raft::distance::DistanceType::DiceExpanded: - detail::dice_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::CorrelationExpanded: - detail::correlation_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::RusselRaoExpanded: - detail::russelrao_expanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::HammingUnexpanded: - detail::hamming_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::JensenShannon: - detail::jensen_shannon_unexpanded_distances_t(input_config).compute(out); - break; - case raft::distance::DistanceType::KLDivergence: - detail::kl_divergence_unexpanded_distances_t(input_config).compute(out); - break; - - default: THROW("Unsupported distance: %d", metric); - } -} - -}; // namespace distance -}; // namespace sparse -}; // namespace raft - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index e7a37b7bf5..7f48f578b7 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -18,53 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __SINGLE_LINKAGE_H -#define __SINGLE_LINKAGE_H - #pragma once -#include -#include - -namespace raft { -namespace hierarchy { - -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. 
- - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control - * of k. The algorithm will set `k = log(n) + c` - * @param[in] n_clusters number of clusters to assign data samples - */ -template -void single_linkage(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) -{ - detail::single_linkage( - handle, X, m, n, metric, out, c, n_clusters); -} -}; // namespace hierarchy -}; // namespace raft - -#endif \ No newline at end of file +#include diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index d41540c0b3..44de653f79 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -57,12 +57,12 @@ void coo_symmetrize(COO* in, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ static void symmetric_find_size(const value_t* __restrict__ data, - const value_idx* __restrict__ indices, +__global__ void symmetric_find_size(const value_t __restrict__ *data, + const value_idx __restrict__ *indices, const value_idx n, const int k, - value_idx* __restrict__ row_sizes, - value_idx* __restrict__ row_sizes2) + value_idx __restrict__ *row_sizes, + value_idx __restrict__ *row_sizes2) { detail::symmetric_find_size(data, indices, n, k, row_sizes, row_sizes2); } @@ -78,10 +78,10 @@ __global__ static void symmetric_find_size(const 
value_t* __restrict__ data, * @param row_sizes2: Input row sum 2 array(n) for faster reduction */ template -__global__ static void reduce_find_size(const value_idx n, +__global__ void reduce_find_size(const value_idx n, const int k, - value_idx* __restrict__ row_sizes, - const value_idx* __restrict__ row_sizes2) + value_idx __restrict__ *row_sizes, + const value_idx __restrict__ *row_sizes2) { detail::reduce_find_size(n, k, row_sizes, row_sizes2); } @@ -103,7 +103,7 @@ __global__ static void reduce_find_size(const value_idx n, * @param k: Number of n_neighbors */ template -__global__ static void symmetric_sum(value_idx* __restrict__ edges, +__global__ void symmetric_sum(value_idx* __restrict__ edges, const value_t* __restrict__ data, const value_idx* __restrict__ indices, value_t* __restrict__ VAL, diff --git a/cpp/include/raft/sparse/linalg/symmetrize.hpp b/cpp/include/raft/sparse/linalg/symmetrize.hpp index 4d8520dabf..df5754536b 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.hpp +++ b/cpp/include/raft/sparse/linalg/symmetrize.hpp @@ -18,156 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __SYMMETRIZE_H -#define __SYMMETRIZE_H - #pragma once -#include -#include - -namespace raft { -namespace sparse { -namespace linalg { - -/** - * @brief takes a COO matrix which may not be symmetric and symmetrizes - * it, running a custom reduction function against the each value - * and its transposed value. - * - * @param in: Input COO matrix - * @param out: Output symmetrized COO matrix - * @param reduction_op: a custom reduction function - * @param stream: cuda stream to use - */ -template -void coo_symmetrize(COO* in, - COO* out, - Lambda reduction_op, // two-argument reducer - cudaStream_t stream) -{ - detail::coo_symmetrize(in, out, reduction_op, stream); -} - -/** - * @brief Find how much space needed in each row. - * We look through all datapoints and increment the count for each row. - * - * TODO: This isn't generalized. 
Remove in place of `symmetrize()` - * @param data: Input knn distances(n, k) - * @param indices: Input knn indices(n, k) - * @param n: Number of rows - * @param k: Number of n_neighbors - * @param row_sizes: Input empty row sum 1 array(n) - * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction - */ -template -__global__ static void symmetric_find_size(const value_t* __restrict__ data, - const value_idx* __restrict__ indices, - const value_idx n, - const int k, - value_idx* __restrict__ row_sizes, - value_idx* __restrict__ row_sizes2) -{ - detail::symmetric_find_size(data, indices, n, k, row_sizes, row_sizes2); -} - -/** - * @brief Reduce sum(row_sizes) + k - * Reduction for symmetric_find_size kernel. Allows algo to be faster. - * - * TODO: This isn't generalized. Remove in place of `symmetrize()` - * @param n: Number of rows - * @param k: Number of n_neighbors - * @param row_sizes: Input row sum 1 array(n) - * @param row_sizes2: Input row sum 2 array(n) for faster reduction - */ -template -__global__ static void reduce_find_size(const value_idx n, - const int k, - value_idx* __restrict__ row_sizes, - const value_idx* __restrict__ row_sizes2) -{ - detail::reduce_find_size(n, k, row_sizes, row_sizes2); -} - -/** - * @brief Perform data + data.T operation. - * Can only run once row_sizes from the CSR matrix of data + data.T has been - * determined. - * - * TODO: This isn't generalized. 
Remove in place of `symmetrize()` - * - * @param edges: Input row sum array(n) after reduction - * @param data: Input knn distances(n, k) - * @param indices: Input knn indices(n, k) - * @param VAL: Output values for data + data.T - * @param COL: Output column indices for data + data.T - * @param ROW: Output row indices for data + data.T - * @param n: Number of rows - * @param k: Number of n_neighbors - */ -template -__global__ static void symmetric_sum(value_idx* __restrict__ edges, - const value_t* __restrict__ data, - const value_idx* __restrict__ indices, - value_t* __restrict__ VAL, - value_idx* __restrict__ COL, - value_idx* __restrict__ ROW, - const value_idx n, - const int k) -{ - detail::symmetric_sum(edges, data, indices, VAL, COL, ROW, n, k); -} - -/** - * @brief Perform data + data.T on raw KNN data. - * The following steps are invoked: - * (1) Find how much space needed in each row - * (2) Compute final space needed (n*k + sum(row_sizes)) == 2*n*k - * (3) Allocate new space - * (4) Prepare edges for each new row - * (5) Perform final data + data.T operation - * (6) Return summed up VAL, COL, ROW - * - * TODO: This isn't generalized. 
Remove in place of `symmetrize()` - * - * @param knn_indices: Input knn distances(n, k) - * @param knn_dists: Input knn indices(n, k) - * @param n: Number of rows - * @param k: Number of n_neighbors - * @param out: Output COO Matrix class - * @param stream: Input cuda stream - */ -template -void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices, - const value_t* __restrict__ knn_dists, - const value_idx n, - const int k, - COO* out, - cudaStream_t stream) -{ - detail::from_knn_symmetrize_matrix(knn_indices, knn_dists, n, k, out, stream); -} - -/** - * Symmetrizes a COO matrix - */ -template -void symmetrize(const raft::handle_t& handle, - const value_idx* rows, - const value_idx* cols, - const value_t* vals, - size_t m, - size_t n, - size_t nnz, - raft::sparse::COO& out) -{ - detail::symmetrize(handle, rows, cols, vals, m, n, nnz, out); -} - -}; // end NAMESPACE linalg -}; // end NAMESPACE sparse -}; // end NAMESPACE raft - -#endif +#include diff --git a/cpp/include/raft/sparse/mst/mst.hpp b/cpp/include/raft/sparse/mst/mst.hpp index ac4cf21b64..6523897d62 100644 --- a/cpp/include/raft/sparse/mst/mst.hpp +++ b/cpp/include/raft/sparse/mst/mst.hpp @@ -19,45 +19,6 @@ * @warning This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ - -#ifndef __MST_H -#define __MST_H - #pragma once -#include "mst_solver.cuh" - -namespace raft { -namespace mst { - -template -raft::Graph_COO mst(const raft::handle_t& handle, - edge_t const* offsets, - vertex_t const* indices, - weight_t const* weights, - vertex_t const v, - edge_t const e, - vertex_t* color, - cudaStream_t stream, - bool symmetrize_output = true, - bool initialize_colors = true, - int iterations = 0) -{ - MST_solver mst_solver(handle, - offsets, - indices, - weights, - v, - e, - color, - stream, - symmetrize_output, - initialize_colors, - iterations); - return mst_solver.solve(); -} - -} // namespace mst -} // namespace raft - -#endif \ No newline at end of file +#include "mst.cuh" \ No newline at end of file diff --git a/cpp/include/raft/spectral/modularity_maximization.cuh b/cpp/include/raft/spectral/modularity_maximization.cuh index c8221e434c..61d85aefaa 100644 --- a/cpp/include/raft/spectral/modularity_maximization.cuh +++ b/cpp/include/raft/spectral/modularity_maximization.cuh @@ -31,24 +31,17 @@ namespace spectral { /** Compute partition for a weighted undirected graph. This * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. 
- * @param clusters (Output, device memory, n entries) Cluster + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param eigen_solver Eigensolver implementation + * @param cluster_solver Cluster solver implementation + * @param clusters (Output, device memory, n entries) Partition * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return error flag. + * @param eigVals Output eigenvalue array pointer on device + * @param eigVecs Output eigenvector array pointer on device + * @return statistics: number of eigensolver iterations, . */ template std::tuple modularity_maximization( @@ -70,7 +63,8 @@ std::tuple modularity_maximization( /// Compute modularity /** This function determines the modularity based on a graph and cluster assignments - * @param G Weighted graph in CSR format + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format * @param nClusters Number of clusters. * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index d1c3ea00f7..a09ceaa933 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -18,80 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __MODULARITY_MAXIMIZATION_H -#define __MODULARITY_MAXIMIZATION_H - #pragma once -#include - -#include - -namespace raft { -namespace spectral { - -// ========================================================= -// Spectral modularity_maximization -// ========================================================= - -/** Compute partition for a weighted undirected graph. 
This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param clusters (Output, device memory, n entries) Cluster - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return error flag. - */ -template -std::tuple modularity_maximization( - handle_t const& handle, - matrix::sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - return raft::spectral::detail:: - modularity_maximization( - handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); -} -//=================================================== -// Analysis of graph partition -// ========================================================= - -/// Compute modularity -/** This function determines the modularity based on a graph and cluster assignments - * @param G Weighted graph in CSR format - * @param nClusters Number of clusters. - * @param clusters (Input, device memory, n entries) Cluster assignments. 
- * @param modularity On exit, modularity - */ -template -void analyzeModularity(handle_t const& handle, - matrix::sparse_matrix_t const& csr_m, - vertex_t nClusters, - vertex_t const* __restrict__ clusters, - weight_t& modularity) -{ - raft::spectral::detail::analyzeModularity( - handle, csr_m, nClusters, clusters, modularity); -} - -} // namespace spectral -} // namespace raft - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/spectral/partition.cuh b/cpp/include/raft/spectral/partition.cuh index 9ccc21c868..2d21f2223c 100644 --- a/cpp/include/raft/spectral/partition.cuh +++ b/cpp/include/raft/spectral/partition.cuh @@ -33,23 +33,16 @@ namespace spectral { /// Compute spectral graph partition /** Compute partition for a weighted undirected graph. This * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param eigen_solver Eigensolver implementation + * @param cluster_solver Cluster solver implementation * @param clusters (Output, device memory, n entries) Partition * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. 
+ * @param eigVals Output eigenvalue array pointer on device + * @param eigVecs Output eigenvector array pointer on device * @return statistics: number of eigensolver iterations, . */ template @@ -73,16 +66,16 @@ std::tuple partition( /// Compute cost function for partition /** This function determines the edges cut by a partition and a cost * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) * Graph is assumed to be weighted and undirected. * - * @param G Weighted graph in CSR format + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format * @param nClusters Number of partitions. * @param clusters (Input, device memory, n entries) Partition * assignments. * @param edgeCut On exit, weight of edges cut by partition. * @param cost On exit, partition cost function. - * @return error flag. */ template void analyzePartition(handle_t const& handle, diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index fde2e6572b..78fea157ae 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -18,89 +18,6 @@ * Please use the cuh version instead. */ -#ifndef __PARTITION_H -#define __PARTITION_H - #pragma once -#include - -#include - -namespace raft { -namespace spectral { - -// ========================================================= -// Spectral partitioner -// ========================================================= - -/// Compute spectral graph partition -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. 
- * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param clusters (Output, device memory, n entries) Partition - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return statistics: number of eigensolver iterations, . - */ -template -std::tuple partition( - handle_t const& handle, - matrix::sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - return raft::spectral::detail::partition( - handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); -} - -// ========================================================= -// Analysis of graph partition -// ========================================================= - -/// Compute cost function for partition -/** This function determines the edges cut by a partition and a cost - * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param clusters (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - * @return error flag. 
- */ -template -void analyzePartition(handle_t const& handle, - matrix::sparse_matrix_t const& csr_m, - vertex_t nClusters, - const vertex_t* __restrict__ clusters, - weight_t& edgeCut, - weight_t& cost) -{ - raft::spectral::detail::analyzePartition( - handle, csr_m, nClusters, clusters, edgeCut, cost); -} - -} // namespace spectral -} // namespace raft - -#endif \ No newline at end of file +#include diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index 6d951587d9..efdec05ea4 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -9,6 +9,12 @@ RAFT C++ API Reference :maxdepth: 4 cpp_api/core.rst + cpp_api/clustering.rst + cpp_api/distributed.rst + cpp_api/linalg.rst + cpp_api/matrix.rst + cpp_api/optimization.rst + cpp_api/random.rst cpp_api/spatial.rst - cpp_api/nn.rst - cpp_api/sparse.rst \ No newline at end of file + cpp_api/sparse.rst + cpp_api/stats.rst \ No newline at end of file diff --git a/docs/source/cpp_api/clustering.rst b/docs/source/cpp_api/clustering.rst new file mode 100644 index 0000000000..cebef84535 --- /dev/null +++ b/docs/source/cpp_api/clustering.rst @@ -0,0 +1,25 @@ +Clustering +========== + +This page provides C++ class references for the publicly-exposed elements of the clustering package. + +K-Means +####### + +.. doxygennamespace:: raft::cluster + :project: RAFT + :members: + +Spectral +######## + +.. doxygennamespace:: raft::spectral + :project: RAFT + :members: + +Hierarchical +############ + +.. doxygennamespace:: raft::hierarchical + :project: RAFT + :members: diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index bae39e3282..1f48b5aacc 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -14,8 +14,32 @@ handle_t interruptible -######## +############# .. doxygenclass:: raft::interruptible :project: RAFT :members: + + +mdarray +####### + +.. doxygenclass:: raft::mdarray + :project: RAFT + :members: + + +span +#### + +.. 
doxygenclass:: raft::span + :project: RAFT + :members: + + +logger +###### + +.. doxygenclass:: raft::logger + :project: RAFT + :members: diff --git a/docs/source/cpp_api/distributed.rst b/docs/source/cpp_api/distributed.rst index e69de29bb2..94e49385d2 100644 --- a/docs/source/cpp_api/distributed.rst +++ b/docs/source/cpp_api/distributed.rst @@ -0,0 +1,9 @@ +Multi-node Multi-GPU Communicator +================================= + +RAFT's distributed package contains a communicator, which provides an MPI-like facade for building algorithms that can +scale to multiple GPUs across multiple physical machines. + +.. doxygennamespace:: raft::comms + :project: RAFT + :members: diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst index e69de29bb2..f9986fd2ce 100644 --- a/docs/source/cpp_api/linalg.rst +++ b/docs/source/cpp_api/linalg.rst @@ -0,0 +1,8 @@ +Linear Algebra +============== + +This page provides C++ class references for the publicly-exposed elements of the (dense) linear algebra package. + +.. doxygennamespace:: raft::linalg + :project: RAFT + :members: diff --git a/docs/source/cpp_api/matrix.rst b/docs/source/cpp_api/matrix.rst new file mode 100644 index 0000000000..65534aa6ee --- /dev/null +++ b/docs/source/cpp_api/matrix.rst @@ -0,0 +1,8 @@ +Matrix +====== + +This page provides C++ class references for the publicly-exposed elements of the matrix package. + +.. doxygennamespace:: raft::matrix + :project: RAFT + :members: diff --git a/docs/source/cpp_api/nn.rst b/docs/source/cpp_api/nn.rst deleted file mode 100644 index 79d8dd1ad3..0000000000 --- a/docs/source/cpp_api/nn.rst +++ /dev/null @@ -1,14 +0,0 @@ -Nearest Neighbors -================= - -This page provides C++ class references for the publicly-exposed elements of the nearest neighbors package. - - - -nearest neighbors -################# - -.. 
doxygennamespace:: raft::spatial::knn - :project: RAFT - :members: - diff --git a/docs/source/cpp_api/optimization.rst b/docs/source/cpp_api/optimization.rst new file mode 100644 index 0000000000..b5076f0a66 --- /dev/null +++ b/docs/source/cpp_api/optimization.rst @@ -0,0 +1,19 @@ +Optimization +============ + +This page provides C++ class references for the publicly-exposed elements of the optimization package. + + +Linear Assignment Problem +######################### + +.. doxygennamespace:: raft::lap + :project: RAFT + :members: + +Minimum Spanning Tree +##################### + +.. doxygennamespace:: raft::mst + :project: RAFT + :members: diff --git a/docs/source/cpp_api/random.rst b/docs/source/cpp_api/random.rst new file mode 100644 index 0000000000..8635855484 --- /dev/null +++ b/docs/source/cpp_api/random.rst @@ -0,0 +1,12 @@ +Random +====== + +This page provides C++ class references for the publicly-exposed elements of the random package. + +.. doxygennamespace:: raft::random + :project: RAFT + :members: + +.. doxygenclass:: raft::random::Rng + :project: RAFT + :members: diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst index 91e553426b..58d54d14c7 100644 --- a/docs/source/cpp_api/sparse.rst +++ b/docs/source/cpp_api/sparse.rst @@ -5,10 +5,9 @@ This page provides C++ class references for the publicly-exposed elements of the -raft::sparse -############ +Sparse Primitives +################# .. doxygennamespace:: raft::sparse :project: RAFT :members: - diff --git a/docs/source/cpp_api/spatial.rst b/docs/source/cpp_api/spatial.rst index 410267e528..fd6f64166a 100644 --- a/docs/source/cpp_api/spatial.rst +++ b/docs/source/cpp_api/spatial.rst @@ -3,11 +3,16 @@ Spatial This page provides C++ class references for the publicly-exposed elements of the spatial package. - - distance ######## .. doxygennamespace:: raft::distance :project: RAFT + +nearest neighbors +################# + +.. 
doxygennamespace:: raft::spatial::knn + :project: RAFT + :members: diff --git a/docs/source/cpp_api/stats.rst b/docs/source/cpp_api/stats.rst index e69de29bb2..8ad8b8a604 100644 --- a/docs/source/cpp_api/stats.rst +++ b/docs/source/cpp_api/stats.rst @@ -0,0 +1,8 @@ +Stats +===== + +This page provides C++ class references for the publicly-exposed elements of the stats package. + +.. doxygennamespace:: raft::stats + :project: RAFT + :members: diff --git a/docs/source/cuda_cpp.rst b/docs/source/cuda_cpp.rst index 3737875a27..30e8903f29 100644 --- a/docs/source/cuda_cpp.rst +++ b/docs/source/cuda_cpp.rst @@ -8,4 +8,4 @@ RAFT is header-only but provides optional shared libraries to speed up compile t .. toctree:: :maxdepth: 4 - cpp_api.rst + cpp_api.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 85798a9b47..ebeebc93af 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,15 +2,13 @@ Welcome to RAFT's documentation! ================================= -RAFT (RAPIDS Analytics Framework Toolkit) is a Python and CUDA/C++ library containing building-blocks, mathematical primitives, and utilities for accelerating the composition of RAPIDS analytics. - +RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. .. toctree:: :maxdepth: 2 :caption: Contents: - raft_intro.rst cpp_api.rst python_api.rst diff --git a/docs/source/python_api.rst b/docs/source/python_api.rst index fb8be78c7a..7b2cdcc5a5 100644 --- a/docs/source/python_api.rst +++ b/docs/source/python_api.rst @@ -1,6 +1,6 @@ -~~~~~~~~~~~~~~~~~~~ -RAFT API Reference -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~ +PyRAFT API Reference +~~~~~~~~~~~~~~~~~~~~ .. role:: py(code) :language: python From 6fd944f50cbad0bd25ad1a11e22524e9b8f199af Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 18 Mar 2022 12:26:07 -0400 Subject: [PATCH 115/167] Removing allocator --- python/pylibraft/pylibraft/common/handle.pxd | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index ed8b11dca0..d5d5cd9306 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -25,11 +25,6 @@ from rmm._lib.cuda_stream_pool cimport cuda_stream_pool from libcpp.memory cimport shared_ptr from libcpp.memory cimport unique_ptr -cdef extern from "raft/mr/device/allocator.hpp" \ - namespace "raft::mr::device" nogil: - cdef cppclass allocator: - pass - cdef extern from "raft/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + From 682cd2f168530a6ac0b08e2140b5be718b04df08 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 18 Mar 2022 12:54:22 -0400 Subject: [PATCH 116/167] removing device allocator --- python/pylibraft/pylibraft/common/handle.pxd | 2 -- python/pylibraft/pylibraft/common/handle.pyx | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index d5d5cd9306..71dd8d516f 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -31,8 +31,6 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: handle_t(cuda_stream_view stream_view) except + handle_t(cuda_stream_view stream_view, shared_ptr[cuda_stream_pool] stream_pool) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + cuda_stream_view get_stream() except + void sync_stream() except + diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index f4db60f794..b92fdab822 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ 
b/python/pylibraft/pylibraft/common/handle.pyx @@ -18,7 +18,7 @@ # distutils: language = c++ # cython: embedsignature = True # cython: language_level = 3 - +I' # import raft from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread @@ -75,6 +75,7 @@ cdef class Handle: """ self.c_obj.get()[0].sync_stream() + def getHandle(self): return self.c_obj.get() From 1dfe2fcc63ce5e0dd8b01485c5c08cf2d9bc4324 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 18 Mar 2022 13:06:27 -0400 Subject: [PATCH 117/167] fixing style --- python/pylibraft/pylibraft/common/handle.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index b92fdab822..f4db60f794 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -18,7 +18,7 @@ # distutils: language = c++ # cython: embedsignature = True # cython: language_level = 3 -I' + # import raft from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread @@ -75,7 +75,6 @@ cdef class Handle: """ self.c_obj.get()[0].sync_stream() - def getHandle(self): return self.c_obj.get() From a4cd2cf1a7c97a11f7af557a0bee506cdf8385fe Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 18 Mar 2022 17:17:07 -0400 Subject: [PATCH 118/167] Breaking out sparse primitives. 
--- .../raft/linalg/cholesky_r1_update.cuh | 8 ++-- .../raft/linalg/cholesky_r1_update.hpp | 8 ++-- docs/source/cpp_api.rst | 1 - docs/source/cpp_api/clustering.rst | 2 +- docs/source/cpp_api/core.rst | 9 +++++ docs/source/cpp_api/distributed.rst | 9 ----- docs/source/cpp_api/sparse.rst | 38 +++++++++++++++++-- docs/source/cpp_api/spatial.rst | 4 +- docs/source/index.rst | 2 - docs/source/python.rst | 2 - 10 files changed, 55 insertions(+), 28 deletions(-) delete mode 100644 docs/source/cpp_api/distributed.rst diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index 7d22d6bcf7..d8e838a634 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -62,7 +62,7 @@ namespace linalg { * // Initialize arrays * int ld_L = n_rows; * rmm::device_uvector L(ld_L * n_rows, stream); - * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, + * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, * &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); * rmm::device_uvector workspace(n_bytes, stream); @@ -74,7 +74,7 @@ namespace linalg { * RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, * L + n - 1, ld_L, stream)); * // Update Cholesky factorization - * MLCommon::LinAlg::choleskyRank1Update( + * raft::linalg::choleskyRank1Update( * handle, L, rank, ld_L, workspace, &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); * } @@ -86,7 +86,7 @@ namespace linalg { * // Initialize arrays * int ld_U = n_rows; * rmm::device_uvector U(ld_U * n_rows, stream); - * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, + * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, * &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); * rmm::device_uvector workspace(stream, n_bytes, stream); @@ -98,7 +98,7 @@ namespace linalg { * raft::copy(U + ld_U * (n-1), A_new, n-1, stream); * // * // Update Cholesky factorization - * 
MLCommon::LinAlg::choleskyRank1Update( + * raft::linalg::choleskyRank1Update( * handle, U, n, ld_U, workspace, &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); * } diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index b55f5d06da..6504ace7f8 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -67,7 +67,7 @@ namespace linalg { * // Initialize arrays * int ld_L = n_rows; * rmm::device_uvector L(ld_L * n_rows, stream); - * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, + * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, * &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); * rmm::device_uvector workspace(n_bytes, stream); @@ -79,7 +79,7 @@ namespace linalg { * RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, * L + n - 1, ld_L, stream)); * // Update Cholesky factorization - * MLCommon::LinAlg::choleskyRank1Update( + * raft::linalg::choleskyRank1Update( * handle, L, rank, ld_L, workspace, &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); * } @@ -91,7 +91,7 @@ namespace linalg { * // Initialize arrays * int ld_U = n_rows; * rmm::device_uvector U(ld_U * n_rows, stream); - * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, + * raft::linalg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, * &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); * rmm::device_uvector workspace(stream, n_bytes, stream); @@ -103,7 +103,7 @@ namespace linalg { * raft::copy(U + ld_U * (n-1), A_new, n-1, stream); * // * // Update Cholesky factorization - * MLCommon::LinAlg::choleskyRank1Update( + * raft::linalg::choleskyRank1Update( * handle, U, n, ld_U, workspace, &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); * } diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index efdec05ea4..db139031a2 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -10,7 +10,6 @@ RAFT C++ API 
Reference cpp_api/core.rst cpp_api/clustering.rst - cpp_api/distributed.rst cpp_api/linalg.rst cpp_api/matrix.rst cpp_api/optimization.rst diff --git a/docs/source/cpp_api/clustering.rst b/docs/source/cpp_api/clustering.rst index cebef84535..715275b59a 100644 --- a/docs/source/cpp_api/clustering.rst +++ b/docs/source/cpp_api/clustering.rst @@ -20,6 +20,6 @@ Spectral Hierarchical ############ -.. doxygennamespace:: raft::hierarchical +.. doxygennamespace:: raft::hierarchy :project: RAFT :members: diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index 1f48b5aacc..ef6270556e 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -43,3 +43,12 @@ logger .. doxygenclass:: raft::logger :project: RAFT :members: + + +Multi-node Multi-GPU +#################### + +.. doxygennamespace:: raft::comms + :project: RAFT + :members: + diff --git a/docs/source/cpp_api/distributed.rst b/docs/source/cpp_api/distributed.rst deleted file mode 100644 index 94e49385d2..0000000000 --- a/docs/source/cpp_api/distributed.rst +++ /dev/null @@ -1,9 +0,0 @@ -Multi-node Multi-GPU Communicator -================================= - -RAFT's distributed package contains a communicator, which provides an MPI-like facade for building algorithms that can -scale to multiple GPUs across multiple physical machines. - -.. doxygennamespace:: raft::comms - :project: RAFT - :members: diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst index 58d54d14c7..fe9b38c1c0 100644 --- a/docs/source/cpp_api/sparse.rst +++ b/docs/source/cpp_api/sparse.rst @@ -5,9 +5,41 @@ This page provides C++ class references for the publicly-exposed elements of the -Sparse Primitives -################# +Conversion +########## -.. doxygennamespace:: raft::sparse +.. doxygennamespace:: raft::sparse::convert + :project: RAFT + :members: + +Distance +######## + +.. 
doxygennamespace:: raft::sparse::distance + :project: RAFT + :members: + +Linear Algebra +############## + +.. doxygennamespace:: raft::sparse::linalg + :project: RAFT + :members: + +Misc Operations +############### + +.. doxygennamespace:: raft::sparse::op + :project: RAFT + :members: + +Selection +######### + +.. doxygennamespace:: raft::sparse::selection + :project: RAFT + :members: + +.. doxygennamespace:: raft::sparse::linkage :project: RAFT :members: diff --git a/docs/source/cpp_api/spatial.rst b/docs/source/cpp_api/spatial.rst index fd6f64166a..5065fa5af0 100644 --- a/docs/source/cpp_api/spatial.rst +++ b/docs/source/cpp_api/spatial.rst @@ -3,14 +3,14 @@ Spatial This page provides C++ class references for the publicly-exposed elements of the spatial package. -distance +Distance ######## .. doxygennamespace:: raft::distance :project: RAFT -nearest neighbors +Nearest Neighbors ################# .. doxygennamespace:: raft::spatial::knn diff --git a/docs/source/index.rst b/docs/source/index.rst index ebeebc93af..0ce3ff1ccc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,10 +1,8 @@ Welcome to RAFT's documentation! ================================= - RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. - .. toctree:: :maxdepth: 2 :caption: Contents: diff --git a/docs/source/python.rst b/docs/source/python.rst index 3909403ff0..4600a3c31b 100644 --- a/docs/source/python.rst +++ b/docs/source/python.rst @@ -1,8 +1,6 @@ Python API ========== - - .. toctree:: :maxdepth: 2 :caption: Contents: From 437127e94ab8d0bc9531bbd956d50707eb245fa6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 11:17:19 -0400 Subject: [PATCH 119/167] More docs updates. 
--- cpp/include/raft.hpp | 2 +- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/cudart_utils.h | 2 +- cpp/include/raft/distance/distance.hpp | 2 +- cpp/include/raft/distance/fused_l2_nn.hpp | 2 +- cpp/include/raft/distance/specializations.hpp | 2 +- cpp/include/raft/error.hpp | 2 +- cpp/include/raft/handle.hpp | 2 +- cpp/include/raft/interruptible.hpp | 2 +- cpp/include/raft/lap/lap.hpp | 4 ++-- cpp/include/raft/linalg/add.cuh | 8 ++++++++ cpp/include/raft/linalg/add.hpp | 2 +- cpp/include/raft/linalg/axpy.hpp | 2 +- cpp/include/raft/linalg/binary_op.hpp | 2 +- cpp/include/raft/linalg/cholesky_r1_update.hpp | 2 +- cpp/include/raft/linalg/coalesced_reduction.hpp | 2 +- cpp/include/raft/linalg/contractions.hpp | 2 +- cpp/include/raft/linalg/cublas_macros.h | 2 +- cpp/include/raft/linalg/cusolver_macros.h | 2 +- cpp/include/raft/linalg/divide.hpp | 2 +- cpp/include/raft/linalg/eig.hpp | 2 +- cpp/include/raft/linalg/eltwise.hpp | 2 +- cpp/include/raft/linalg/gemm.hpp | 2 +- cpp/include/raft/linalg/gemv.hpp | 2 +- cpp/include/raft/linalg/init.hpp | 2 +- cpp/include/raft/linalg/lanczos.hpp | 2 +- cpp/include/raft/linalg/lstsq.hpp | 2 +- cpp/include/raft/linalg/map.hpp | 2 +- cpp/include/raft/linalg/map_then_reduce.hpp | 2 +- cpp/include/raft/linalg/matrix_vector_op.hpp | 2 +- cpp/include/raft/linalg/mean_squared_error.hpp | 2 +- cpp/include/raft/linalg/multiply.hpp | 2 +- cpp/include/raft/linalg/norm.hpp | 2 +- cpp/include/raft/linalg/power.hpp | 2 +- cpp/include/raft/linalg/qr.hpp | 2 +- cpp/include/raft/linalg/reduce.hpp | 2 +- cpp/include/raft/linalg/reduce_cols_by_key.hpp | 2 +- cpp/include/raft/linalg/reduce_rows_by_key.hpp | 2 +- cpp/include/raft/linalg/rsvd.hpp | 2 +- cpp/include/raft/linalg/sqrt.hpp | 2 +- cpp/include/raft/linalg/strided_reduction.hpp | 2 +- cpp/include/raft/linalg/subtract.hpp | 2 +- cpp/include/raft/linalg/svd.hpp | 2 +- cpp/include/raft/linalg/ternary_op.hpp | 2 +- cpp/include/raft/linalg/transpose.hpp | 2 +- 
cpp/include/raft/linalg/unary_op.hpp | 2 +- cpp/include/raft/matrix/col_wise_sort.hpp | 2 +- cpp/include/raft/matrix/math.hpp | 2 +- cpp/include/raft/matrix/matrix.hpp | 2 +- cpp/include/raft/random/make_blobs.hpp | 2 +- cpp/include/raft/random/make_regression.hpp | 2 +- cpp/include/raft/random/multi_variable_gaussian.hpp | 2 +- cpp/include/raft/random/permute.hpp | 2 +- cpp/include/raft/random/rng.hpp | 2 +- cpp/include/raft/sparse/convert/coo.hpp | 2 +- cpp/include/raft/sparse/convert/csr.hpp | 2 +- cpp/include/raft/sparse/convert/dense.hpp | 2 +- cpp/include/raft/sparse/distance/distance.hpp | 2 +- cpp/include/raft/sparse/hierarchy/single_linkage.hpp | 2 +- cpp/include/raft/sparse/linalg/add.hpp | 2 +- cpp/include/raft/sparse/linalg/degree.hpp | 2 +- cpp/include/raft/sparse/linalg/norm.hpp | 2 +- cpp/include/raft/sparse/linalg/spectral.hpp | 2 +- cpp/include/raft/sparse/linalg/symmetrize.hpp | 2 +- cpp/include/raft/sparse/linalg/transpose.hpp | 2 +- cpp/include/raft/sparse/mst/mst.hpp | 2 +- cpp/include/raft/sparse/op/filter.hpp | 2 +- cpp/include/raft/sparse/op/reduce.hpp | 2 +- cpp/include/raft/sparse/op/row_op.hpp | 2 +- cpp/include/raft/sparse/op/slice.hpp | 2 +- cpp/include/raft/sparse/op/sort.hpp | 2 +- cpp/include/raft/sparse/selection/connect_components.hpp | 2 +- cpp/include/raft/sparse/selection/knn.hpp | 2 +- cpp/include/raft/sparse/selection/knn_graph.hpp | 2 +- cpp/include/raft/spatial/knn/ann.hpp | 2 +- cpp/include/raft/spatial/knn/ball_cover.hpp | 2 +- cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp | 2 +- cpp/include/raft/spatial/knn/knn.hpp | 2 +- cpp/include/raft/spatial/knn/specializations.hpp | 2 +- cpp/include/raft/spectral/cluster_solvers.hpp | 2 +- cpp/include/raft/spectral/eigen_solvers.hpp | 2 +- cpp/include/raft/spectral/modularity_maximization.hpp | 2 +- cpp/include/raft/spectral/partition.hpp | 2 +- cpp/include/raft/stats/accuracy.hpp | 2 +- cpp/include/raft/stats/adjusted_rand_index.hpp | 2 +- 
cpp/include/raft/stats/completeness_score.hpp | 2 +- cpp/include/raft/stats/contingency_matrix.hpp | 2 +- cpp/include/raft/stats/cov.hpp | 2 +- cpp/include/raft/stats/dispersion.hpp | 2 +- cpp/include/raft/stats/entropy.hpp | 2 +- cpp/include/raft/stats/histogram.hpp | 2 +- cpp/include/raft/stats/homogeneity_score.hpp | 2 +- cpp/include/raft/stats/information_criterion.hpp | 2 +- cpp/include/raft/stats/kl_divergence.hpp | 2 +- cpp/include/raft/stats/mean.hpp | 2 +- cpp/include/raft/stats/mean_center.hpp | 2 +- cpp/include/raft/stats/meanvar.hpp | 2 +- cpp/include/raft/stats/minmax.hpp | 2 +- cpp/include/raft/stats/mutual_info_score.hpp | 2 +- cpp/include/raft/stats/r2_score.hpp | 2 +- cpp/include/raft/stats/rand_index.hpp | 2 +- cpp/include/raft/stats/regression_metrics.hpp | 2 +- cpp/include/raft/stats/silhouette_score.hpp | 2 +- cpp/include/raft/stats/specializations.hpp | 2 +- cpp/include/raft/stats/stddev.hpp | 2 +- cpp/include/raft/stats/sum.hpp | 2 +- cpp/include/raft/stats/trustworthiness_score.hpp | 2 +- cpp/include/raft/stats/v_measure.hpp | 2 +- cpp/include/raft/stats/weighted_mean.hpp | 2 +- docs/source/cpp_api/optimization.rst | 2 +- docs/source/cpp_api/sparse.rst | 2 +- 111 files changed, 119 insertions(+), 111 deletions(-) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index fff4d09ffe..b1b8255b7e 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. */ #include "raft/handle.hpp" #include "raft/mdarray.hpp" diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index b30a4648a6..9fb2b5a2c6 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. 
* Please use raft_runtime/comms.hpp instead. */ diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 3a18d7e420..4ba1e18768 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use raft_runtime/cudart_utils.hpp instead. */ diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 783a362797..e5d39be86b 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp index 1cb3ee39eb..768e33b3a7 100644 --- a/cpp/include/raft/distance/fused_l2_nn.hpp +++ b/cpp/include/raft/distance/fused_l2_nn.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/distance/specializations.hpp b/cpp/include/raft/distance/specializations.hpp index db426c30d2..641968d9f1 100644 --- a/cpp/include/raft/distance/specializations.hpp +++ b/cpp/include/raft/distance/specializations.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 5e1aa3af28..1650840aa2 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the include/raft_runtime/error.hpp instead. */ diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 158816f762..74bf570b26 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the include/raft_runtime/handle.hpp instead. */ diff --git a/cpp/include/raft/interruptible.hpp b/cpp/include/raft/interruptible.hpp index 6764065363..19e361490c 100644 --- a/cpp/include/raft/interruptible.hpp +++ b/cpp/include/raft/interruptible.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the include/raft_runtime/interruptible.hpp instead. */ diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp index 238af9545d..2a4f10f000 100644 --- a/cpp/include/raft/lap/lap.hpp +++ b/cpp/include/raft/lap/lap.hpp @@ -12,7 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and - * limitations under the License. + * limitations under the License.+ * * CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm * Authors: Ketan Date and Rakesh Nagi @@ -24,7 +24,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. 
* Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 92152a8c03..e25c9df9ef 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -16,6 +16,11 @@ #ifndef __ADD_H #define __ADD_H +/** + * @defgroup arithmetic Dense matrix arithmetic + * @{ + */ + #pragma once #include "detail/add.cuh" @@ -26,6 +31,7 @@ namespace linalg { using detail::adds_scalar; /** + * @ingroup arithmetic * @brief Elementwise scalar add operation on the input buffer * * @tparam InT input data-type. Also the data-type upon which the math ops @@ -87,4 +93,6 @@ void addDevScalar(math_t* outDev, }; // end namespace linalg }; // end namespace raft +/** @} */ + #endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index 32c7f68459..a80398fcad 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/axpy.hpp b/cpp/include/raft/linalg/axpy.hpp index 921ed3f89b..c227ba66c8 100644 --- a/cpp/include/raft/linalg/axpy.hpp +++ b/cpp/include/raft/linalg/axpy.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index 468c278909..9983e8ab50 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. 
+ * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index 6504ace7f8..1158ad3aa4 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index 4b9e5d262f..48f8798a03 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp index 84c86b93a4..256593d9ae 100644 --- a/cpp/include/raft/linalg/contractions.hpp +++ b/cpp/include/raft/linalg/contractions.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h index 0281c5c667..44a609a48c 100644 --- a/cpp/include/raft/linalg/cublas_macros.h +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. 
* Please use raft_runtime/cublas_macros.hpp instead. */ diff --git a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h index df27f7ce26..4010125ccf 100644 --- a/cpp/include/raft/linalg/cusolver_macros.h +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use raft_runtime/cusolver_macros.hpp instead. */ diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp index 88b919b92a..8d1bd37186 100644 --- a/cpp/include/raft/linalg/divide.hpp +++ b/cpp/include/raft/linalg/divide.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp index 9417b6fb3f..032c4e97f9 100644 --- a/cpp/include/raft/linalg/eig.hpp +++ b/cpp/include/raft/linalg/eig.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 0ebefc7c25..62624f6eeb 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp index 736590938b..56621e4f8b 100644 --- a/cpp/include/raft/linalg/gemm.hpp +++ b/cpp/include/raft/linalg/gemm.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index d6e0e0326b..3b6b60263b 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp index af3486f278..db7b0f9cfe 100644 --- a/cpp/include/raft/linalg/init.hpp +++ b/cpp/include/raft/linalg/init.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 7663af3cb2..75e3d11444 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index 008fcab653..f90cd00ea3 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index a1ce1f3ef9..8321dcebe4 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index c4b136d1b8..235485926b 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index c041d4c263..574d4aee63 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index 95428d47e0..7a7f03ee18 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp index 260fb25018..eb933cd607 100644 --- a/cpp/include/raft/linalg/multiply.hpp +++ b/cpp/include/raft/linalg/multiply.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp index 7be524f6de..958784d67e 100644 --- a/cpp/include/raft/linalg/norm.hpp +++ b/cpp/include/raft/linalg/norm.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/power.hpp b/cpp/include/raft/linalg/power.hpp index 124ee8513a..d1506ff7a9 100644 --- a/cpp/include/raft/linalg/power.hpp +++ b/cpp/include/raft/linalg/power.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp index ffb381dca1..f0194ddbf9 100644 --- a/cpp/include/raft/linalg/qr.hpp +++ b/cpp/include/raft/linalg/qr.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp index b9f057771a..b9cc2c6e9d 100644 --- a/cpp/include/raft/linalg/reduce.hpp +++ b/cpp/include/raft/linalg/reduce.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.hpp b/cpp/include/raft/linalg/reduce_cols_by_key.hpp index a338d8572b..c24baa60de 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.hpp +++ b/cpp/include/raft/linalg/reduce_cols_by_key.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.hpp b/cpp/include/raft/linalg/reduce_rows_by_key.hpp index 70ce9eaa4f..d18a00aa1d 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.hpp +++ b/cpp/include/raft/linalg/reduce_rows_by_key.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/linalg/rsvd.hpp b/cpp/include/raft/linalg/rsvd.hpp index 2dd5faa332..ac6e13b555 100644 --- a/cpp/include/raft/linalg/rsvd.hpp +++ b/cpp/include/raft/linalg/rsvd.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/sqrt.hpp b/cpp/include/raft/linalg/sqrt.hpp index 9856173248..9c66ee2d14 100644 --- a/cpp/include/raft/linalg/sqrt.hpp +++ b/cpp/include/raft/linalg/sqrt.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp index 534f8edcf7..3b1597dfc3 100644 --- a/cpp/include/raft/linalg/strided_reduction.hpp +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index 2420ce69e2..accf57a939 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index 765f364d5b..01788a4188 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/ternary_op.hpp b/cpp/include/raft/linalg/ternary_op.hpp index 1e8892211c..bce9eacb11 100644 --- a/cpp/include/raft/linalg/ternary_op.hpp +++ b/cpp/include/raft/linalg/ternary_op.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp index 765d523b16..caa6bafedf 100644 --- a/cpp/include/raft/linalg/transpose.hpp +++ b/cpp/include/raft/linalg/transpose.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index 12d841340b..ca1e3f9875 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/matrix/col_wise_sort.hpp b/cpp/include/raft/matrix/col_wise_sort.hpp index f259bc71a8..83a8738219 100644 --- a/cpp/include/raft/matrix/col_wise_sort.hpp +++ b/cpp/include/raft/matrix/col_wise_sort.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index e04764f59e..6ed9a0d358 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index cf5f5d1f25..7409140d7c 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/random/make_blobs.hpp b/cpp/include/raft/random/make_blobs.hpp index ab04684f75..372839b500 100644 --- a/cpp/include/raft/random/make_blobs.hpp +++ b/cpp/include/raft/random/make_blobs.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/random/make_regression.hpp b/cpp/include/raft/random/make_regression.hpp index c050a447ed..4f6b2717f6 100644 --- a/cpp/include/raft/random/make_regression.hpp +++ b/cpp/include/raft/random/make_regression.hpp @@ -19,7 +19,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/random/multi_variable_gaussian.hpp b/cpp/include/raft/random/multi_variable_gaussian.hpp index fd1de4aadd..6b85ec6a14 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.hpp +++ b/cpp/include/raft/random/multi_variable_gaussian.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/random/permute.hpp b/cpp/include/raft/random/permute.hpp index 3507d66cc3..26e22e403b 100644 --- a/cpp/include/raft/random/permute.hpp +++ b/cpp/include/raft/random/permute.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index 44f9c955ac..519325f6d3 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/sparse/convert/coo.hpp b/cpp/include/raft/sparse/convert/coo.hpp index 009a19a563..697452db09 100644 --- a/cpp/include/raft/sparse/convert/coo.hpp +++ b/cpp/include/raft/sparse/convert/coo.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/convert/csr.hpp b/cpp/include/raft/sparse/convert/csr.hpp index 6a9a99d014..cd5d89bf71 100644 --- a/cpp/include/raft/sparse/convert/csr.hpp +++ b/cpp/include/raft/sparse/convert/csr.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/convert/dense.hpp b/cpp/include/raft/sparse/convert/dense.hpp index 1bdfa26732..f8338536c8 100644 --- a/cpp/include/raft/sparse/convert/dense.hpp +++ b/cpp/include/raft/sparse/convert/dense.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 1bae6fa533..86d4db81d2 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 7f48f578b7..80c3c3c521 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/linalg/add.hpp b/cpp/include/raft/sparse/linalg/add.hpp index 33259cb39f..39ab2d6450 100644 --- a/cpp/include/raft/sparse/linalg/add.hpp +++ b/cpp/include/raft/sparse/linalg/add.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/linalg/degree.hpp b/cpp/include/raft/sparse/linalg/degree.hpp index 0c6af596ce..7cece7908e 100644 --- a/cpp/include/raft/sparse/linalg/degree.hpp +++ b/cpp/include/raft/sparse/linalg/degree.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/linalg/norm.hpp b/cpp/include/raft/sparse/linalg/norm.hpp index 196951bac7..1f054e63ab 100644 --- a/cpp/include/raft/sparse/linalg/norm.hpp +++ b/cpp/include/raft/sparse/linalg/norm.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/sparse/linalg/spectral.hpp b/cpp/include/raft/sparse/linalg/spectral.hpp index 9daa6e07b0..ff400f1f0f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.hpp +++ b/cpp/include/raft/sparse/linalg/spectral.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/linalg/symmetrize.hpp b/cpp/include/raft/sparse/linalg/symmetrize.hpp index df5754536b..6e1d3b4fa1 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.hpp +++ b/cpp/include/raft/sparse/linalg/symmetrize.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/linalg/transpose.hpp b/cpp/include/raft/sparse/linalg/transpose.hpp index 0aea254803..c709c20473 100644 --- a/cpp/include/raft/sparse/linalg/transpose.hpp +++ b/cpp/include/raft/sparse/linalg/transpose.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/mst/mst.hpp b/cpp/include/raft/sparse/mst/mst.hpp index 6523897d62..5a66e8c815 100644 --- a/cpp/include/raft/sparse/mst/mst.hpp +++ b/cpp/include/raft/sparse/mst/mst.hpp @@ -16,7 +16,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ #pragma once diff --git a/cpp/include/raft/sparse/op/filter.hpp b/cpp/include/raft/sparse/op/filter.hpp index b67084f18a..3821d963b0 100644 --- a/cpp/include/raft/sparse/op/filter.hpp +++ b/cpp/include/raft/sparse/op/filter.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/op/reduce.hpp b/cpp/include/raft/sparse/op/reduce.hpp index a7e771d157..bb7560fa3d 100644 --- a/cpp/include/raft/sparse/op/reduce.hpp +++ b/cpp/include/raft/sparse/op/reduce.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/op/row_op.hpp b/cpp/include/raft/sparse/op/row_op.hpp index b3eafafa66..ac12432e92 100644 --- a/cpp/include/raft/sparse/op/row_op.hpp +++ b/cpp/include/raft/sparse/op/row_op.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/op/slice.hpp b/cpp/include/raft/sparse/op/slice.hpp index b4e0622ced..75b7e478e5 100644 --- a/cpp/include/raft/sparse/op/slice.hpp +++ b/cpp/include/raft/sparse/op/slice.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/sparse/op/sort.hpp b/cpp/include/raft/sparse/op/sort.hpp index 12a4a77ca9..cd363582fb 100644 --- a/cpp/include/raft/sparse/op/sort.hpp +++ b/cpp/include/raft/sparse/op/sort.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/selection/connect_components.hpp b/cpp/include/raft/sparse/selection/connect_components.hpp index 83d8fce8ba..25d71367db 100644 --- a/cpp/include/raft/sparse/selection/connect_components.hpp +++ b/cpp/include/raft/sparse/selection/connect_components.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index 4158bd40c2..bd6dd39fdf 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index eb035390ce..be47a6a9ef 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index bb11a2b11b..b6d3ca2976 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index 26c2c1fb2e..a7c483493e 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp index b3ba0fc442..7674ac0d46 100644 --- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp +++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index da18e891d4..c7b21f16ad 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/spatial/knn/specializations.hpp b/cpp/include/raft/spatial/knn/specializations.hpp index 538e1b1380..13721a975f 100644 --- a/cpp/include/raft/spatial/knn/specializations.hpp +++ b/cpp/include/raft/spatial/knn/specializations.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index c6b166bb4f..9cb773cce2 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index d55ddf952a..e6b37f29ec 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index a09ceaa933..1bc003e711 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 78fea157ae..27f204d055 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/accuracy.hpp b/cpp/include/raft/stats/accuracy.hpp index eefe96b2d1..8cbb0f719e 100644 --- a/cpp/include/raft/stats/accuracy.hpp +++ b/cpp/include/raft/stats/accuracy.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/adjusted_rand_index.hpp b/cpp/include/raft/stats/adjusted_rand_index.hpp index cbf6112000..bc836eed86 100644 --- a/cpp/include/raft/stats/adjusted_rand_index.hpp +++ b/cpp/include/raft/stats/adjusted_rand_index.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/completeness_score.hpp b/cpp/include/raft/stats/completeness_score.hpp index 01ed0d66b9..0dd97e9782 100644 --- a/cpp/include/raft/stats/completeness_score.hpp +++ b/cpp/include/raft/stats/completeness_score.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/stats/contingency_matrix.hpp b/cpp/include/raft/stats/contingency_matrix.hpp index 6fa4a314f9..70800be1e6 100644 --- a/cpp/include/raft/stats/contingency_matrix.hpp +++ b/cpp/include/raft/stats/contingency_matrix.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp index 27b4ede876..a584dedc95 100644 --- a/cpp/include/raft/stats/cov.hpp +++ b/cpp/include/raft/stats/cov.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/dispersion.hpp b/cpp/include/raft/stats/dispersion.hpp index 5958551e87..7fabf07992 100644 --- a/cpp/include/raft/stats/dispersion.hpp +++ b/cpp/include/raft/stats/dispersion.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/entropy.hpp b/cpp/include/raft/stats/entropy.hpp index eb1fee2949..37dc2b700c 100644 --- a/cpp/include/raft/stats/entropy.hpp +++ b/cpp/include/raft/stats/entropy.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp index 828719236b..627026c219 100644 --- a/cpp/include/raft/stats/histogram.hpp +++ b/cpp/include/raft/stats/histogram.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/homogeneity_score.hpp b/cpp/include/raft/stats/homogeneity_score.hpp index 49baea0c19..4e119f2bc7 100644 --- a/cpp/include/raft/stats/homogeneity_score.hpp +++ b/cpp/include/raft/stats/homogeneity_score.hpp @@ -15,7 +15,7 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/information_criterion.hpp b/cpp/include/raft/stats/information_criterion.hpp index f6dd69aa08..3a39e56c41 100644 --- a/cpp/include/raft/stats/information_criterion.hpp +++ b/cpp/include/raft/stats/information_criterion.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/kl_divergence.hpp b/cpp/include/raft/stats/kl_divergence.hpp index 9d7c0b1e46..59db77246f 100644 --- a/cpp/include/raft/stats/kl_divergence.hpp +++ b/cpp/include/raft/stats/kl_divergence.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp index add9e47569..2767b632e6 100644 --- a/cpp/include/raft/stats/mean.hpp +++ b/cpp/include/raft/stats/mean.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp index 69ce79338b..e219891cab 100644 --- a/cpp/include/raft/stats/mean_center.hpp +++ b/cpp/include/raft/stats/mean_center.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/meanvar.hpp b/cpp/include/raft/stats/meanvar.hpp index a6809170e7..d7ef935fbc 100644 --- a/cpp/include/raft/stats/meanvar.hpp +++ b/cpp/include/raft/stats/meanvar.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp index 669b3c5837..97f06129fa 100644 --- a/cpp/include/raft/stats/minmax.hpp +++ b/cpp/include/raft/stats/minmax.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/stats/mutual_info_score.hpp b/cpp/include/raft/stats/mutual_info_score.hpp index c900f9ce5b..a080211c36 100644 --- a/cpp/include/raft/stats/mutual_info_score.hpp +++ b/cpp/include/raft/stats/mutual_info_score.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/r2_score.hpp b/cpp/include/raft/stats/r2_score.hpp index 4e126d903b..c88a1822ec 100644 --- a/cpp/include/raft/stats/r2_score.hpp +++ b/cpp/include/raft/stats/r2_score.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/rand_index.hpp b/cpp/include/raft/stats/rand_index.hpp index c94e4fa8db..e8c3089371 100644 --- a/cpp/include/raft/stats/rand_index.hpp +++ b/cpp/include/raft/stats/rand_index.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/regression_metrics.hpp b/cpp/include/raft/stats/regression_metrics.hpp index b8868bdb33..f65ad524ef 100644 --- a/cpp/include/raft/stats/regression_metrics.hpp +++ b/cpp/include/raft/stats/regression_metrics.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/stats/silhouette_score.hpp b/cpp/include/raft/stats/silhouette_score.hpp index 7506d9a733..e6c84855c6 100644 --- a/cpp/include/raft/stats/silhouette_score.hpp +++ b/cpp/include/raft/stats/silhouette_score.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/specializations.hpp b/cpp/include/raft/stats/specializations.hpp index 87301deccc..3929b3124c 100644 --- a/cpp/include/raft/stats/specializations.hpp +++ b/cpp/include/raft/stats/specializations.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp index e038fecc02..f496b1fd30 100644 --- a/cpp/include/raft/stats/stddev.hpp +++ b/cpp/include/raft/stats/stddev.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp index c2b93b79db..e1c8c67777 100644 --- a/cpp/include/raft/stats/sum.hpp +++ b/cpp/include/raft/stats/sum.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. 
*/ diff --git a/cpp/include/raft/stats/trustworthiness_score.hpp b/cpp/include/raft/stats/trustworthiness_score.hpp index 81ca4eb5b7..81edf2ea04 100644 --- a/cpp/include/raft/stats/trustworthiness_score.hpp +++ b/cpp/include/raft/stats/trustworthiness_score.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/v_measure.hpp b/cpp/include/raft/stats/v_measure.hpp index 925171c2d2..a137af844d 100644 --- a/cpp/include/raft/stats/v_measure.hpp +++ b/cpp/include/raft/stats/v_measure.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp index 4f53067e65..5b3f4678d8 100644 --- a/cpp/include/raft/stats/weighted_mean.hpp +++ b/cpp/include/raft/stats/weighted_mean.hpp @@ -14,7 +14,7 @@ * limitations under the License. */ /** - * @warning This file is deprecated and will be removed in release 22.06. + * This file is deprecated and will be removed in release 22.06. * Please use the cuh version instead. */ diff --git a/docs/source/cpp_api/optimization.rst b/docs/source/cpp_api/optimization.rst index b5076f0a66..75cec2494e 100644 --- a/docs/source/cpp_api/optimization.rst +++ b/docs/source/cpp_api/optimization.rst @@ -7,7 +7,7 @@ This page provides C++ class references for the publicly-exposed elements of the Linear Assignment Problem ######################### -.. doxygennamespace:: raft::lap +.. 
doxygenclass:: raft::lap::LinearAssignmentProblem :project: RAFT :members: diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst index fe9b38c1c0..c0ea61c6f7 100644 --- a/docs/source/cpp_api/sparse.rst +++ b/docs/source/cpp_api/sparse.rst @@ -40,6 +40,6 @@ Selection :project: RAFT :members: -.. doxygennamespace:: raft::sparse::linkage +.. doxygennamespace:: raft::linkage :project: RAFT :members: From ef4a9a85b229e43678655cd01206020ac277bd29 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 13:58:24 -0400 Subject: [PATCH 120/167] Updates based on review feedback --- BUILD.md | 18 ++-- DEVELOPER_GUIDE.md | 2 +- README.md | 12 +-- conda/recipes/pylibraft/meta.yaml | 1 - python/pylibraft/pylibraft/common/cuda.pyx | 84 ----------------- python/pylibraft/pylibraft/common/handle.pyx | 90 ------------------- .../pylibraft/common/interruptible.pxd | 34 ------- .../pylibraft/common/interruptible.pyx | 84 ----------------- .../pylibraft/distance/pairwise_distance.pyx | 13 +-- .../pylibraft/pylibraft/test/test_distance.py | 4 +- python/pylibraft/setup.cfg | 4 +- 11 files changed, 27 insertions(+), 319 deletions(-) delete mode 100644 python/pylibraft/pylibraft/common/cuda.pyx delete mode 100644 python/pylibraft/pylibraft/common/handle.pyx delete mode 100644 python/pylibraft/pylibraft/common/interruptible.pxd delete mode 100644 python/pylibraft/pylibraft/common/interruptible.pyx diff --git a/BUILD.md b/BUILD.md index e88480f9af..ef2d1a2bda 100644 --- a/BUILD.md +++ b/BUILD.md @@ -27,13 +27,13 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth #### Required - [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) -- [RMM](https://github.com/rapidsai/rmm) +- [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version. 
- [mdspan](https://github.com/rapidsai/mdspan) #### Optional - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API - [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 -- [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API +- [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API and needed to build tests. - [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `Pyraft` - [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `Pyraft` - [Googletest](https://github.com/google/googletest) - Needed to build tests @@ -53,7 +53,7 @@ The following example will download the needed dependencies and install the RAFT ./build.sh libraft --install ``` -###C++ Shared Libraries (optional) +### C++ Shared Libraries (optional) For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: ```bash @@ -67,7 +67,7 @@ Individual shared libraries have their own flags and multiple can be used (thoug Add the `--install` flag to the above example to also install the shared libraries into `$INSTALL_PREFIX/lib`. -###Tests +### Tests Compile the tests using the `tests` target in `build.sh`. By default, the shared libraries are assumed to be already built and on the library path. Add `--compile-libs` to also compile them. 
```bash @@ -80,7 +80,7 @@ To run C++ tests: ./cpp/build/test_raft ``` -###Benchmarks +### Benchmarks Compile the benchmarks using the `bench` target in `build.sh`: ```bash @@ -132,8 +132,8 @@ Currently, shared libraries are provided for the `libraft-nn` and `libraft-dista Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. The following example will install create and install dependencies for a CUDA 11.5 conda environment: ```bash -mamba env create --name raft_env -f conda/environments/raft_dev_cuda11.5.yml -mamba activate raft_env +mamba env create --name raft_env_name -f conda/environments/raft_dev_cuda11.5.yml +mamba activate raft_env_name ``` The Python APIs can be built using the `build.sh` script: @@ -187,7 +187,7 @@ If RAFT has already been installed, such as by using the `build.sh` script, use Use `find_package(raft COMPONENTS nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). -The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `spectializations.hpp` and located in the base directory for the packages that contain specializations. +The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. 
In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `specializations.hpp` and located in the base directory for the packages that contain specializations. The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into pre-compiled shared library will be used instead: ```c++ @@ -197,7 +197,7 @@ The following example tells the compiler to ignore the pre-compiled templates fo ### Building RAFT C++ from source in cmake -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). +RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to invoking `find_package(raft)` but uses `rapids_cpm_find`, which provides a richer and more flexible configuration landscape by using CPM to fetch any dependencies not already available to the build. The `raft::raft` link target will be made available and it's recommended that it be used as a `PRIVATE` link dependency in downstream projects. The `COMPILE_LIBRARIES` option enables the building the shared libraries. diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 1355d83e86..5c1e122525 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -4,7 +4,7 @@ Devloping features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts. 
-The process for working on a CUDA/C++ feature which spans RAFT and one or more consumers can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#install_header_only_cpp) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. The pin should be reverted after the changed is merged to the RAFT project and before it is merged to the dependent project(s) downstream. +The process for working on a CUDA/C++ feature which spans RAFT and one or more consumers can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#install_header_only_cpp) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. The pin should be reverted after the changed is merged to the RAFT project and before it is merged to the dependent project(s) downstream. If building a feature which spans projects and not using the source build in cmake, the RAFT changes (both C++ and Python) will need to be installed into the environment of the consuming project before they can be used. 
The ideal integration of RAFT into consuming projects will enable both the source build in the consuming project only for this case but also rely on a more stable packaging (such as conda packaging) otherwise. diff --git a/README.md b/README.md index 83aa42d74d..8e82780734 100755 --- a/README.md +++ b/README.md @@ -81,7 +81,8 @@ raft::distance::pairwise_distance(handle, input.view(), input.view(), output.vie The `pylibraft` package contains a Python API for RAFT algorithms and primitives. The package is currently limited to pairwise distances, and we will continue adding more. -The example below demonstrates computing the pairwise Euclidean distances between cupy arrays. +The example below demonstrates computing the pairwise Euclidean distances between cupy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. + ```python import cupy as cp @@ -90,10 +91,11 @@ from pylibraft.distance import pairwise_distance n_samples = 5000 n_features = 50 -input = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) +in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) +in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) output = cp.empty((n_samples, n_samples), dtype=cp.float32) -pairwise_distance(input, input, output, "euclidean") +pairwise_distance(in1, in2, output, metric="euclidean") ``` ## Installing @@ -118,7 +120,7 @@ After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used i ### CPM -RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS cmake provides a convenience layer around CPM. +RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. 
RAPIDS CMake provides a convenience layer around CPM. After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids-cmake in your project, you can begin using RAFT by placing the code snippet below in a file named `get_raft.cmake` and including it in your cmake build with `include(get_raft.cmake)`. This will make available several targets to add to configure the link libraries for your artifacts. @@ -168,7 +170,7 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 ) ``` -Several cmake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. +Several CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. | Component | Target | Description | Base Dependencies | | --- | --- | --- | --- | diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 1c49f00b06..eaca379c4e 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -38,7 +38,6 @@ requirements: - libraft-headers {{ version }} - libraft-distance {{ version }} - cuda-python >=11.5,<12.0 - - joblib >=0.11 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} tests: # [linux64] diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx deleted file mode 100644 index eb48f64cf1..0000000000 --- a/python/pylibraft/pylibraft/common/cuda.pyx +++ /dev/null @@ -1,84 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cuda.ccudart cimport( - cudaStream_t, - cudaError_t, - cudaSuccess, - cudaStreamCreate, - cudaStreamDestroy, - cudaStreamSynchronize, - cudaGetLastError, - cudaGetErrorString, - cudaGetErrorName -) - - -class CudaRuntimeError(RuntimeError): - def __init__(self, extraMsg=None): - cdef cudaError_t e = cudaGetLastError() - cdef bytes errMsg = cudaGetErrorString(e) - cdef bytes errName = cudaGetErrorName(e) - msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode()) - if extraMsg is not None: - msg += " extraMsg='%s'" % extraMsg - super(CudaRuntimeError, self).__init__(msg) - - -cdef class Stream: - """ - Stream represents a thin-wrapper around cudaStream_t and its operations. - - Examples - -------- - - .. code-block:: python - - from raft.common.cuda import Stream - stream = Stream() - stream.sync() - del stream # optional! - """ - def __cinit__(self): - cdef cudaStream_t stream - cdef cudaError_t e = cudaStreamCreate(&stream) - if e != cudaSuccess: - raise CudaRuntimeError("Stream create") - self.s = stream - - def __dealloc__(self): - self.sync() - cdef cudaError_t e = cudaStreamDestroy(self.s) - if e != cudaSuccess: - raise CudaRuntimeError("Stream destroy") - - def sync(self): - """ - Synchronize on the cudastream owned by this object. 
Note that this - could raise exception due to issues with previous asynchronous - launches - """ - cdef cudaError_t e = cudaStreamSynchronize(self.s) - if e != cudaSuccess: - raise CudaRuntimeError("Stream sync") - - cdef cudaStream_t getStream(self): - return self.s diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx deleted file mode 100644 index f4db60f794..0000000000 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ /dev/null @@ -1,90 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -# import raft -from libcpp.memory cimport shared_ptr -from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread -from rmm._lib.cuda_stream_view cimport cuda_stream_view - -from .cuda cimport Stream -from .cuda import CudaRuntimeError - - -cdef class Handle: - """ - Handle is a lightweight python wrapper around the corresponding C++ class - of handle_t exposed by RAFT's C++ interface. Refer to the header file - raft/handle.hpp for interface level details of this struct - - Examples - -------- - - .. 
code-block:: python - - from raft.common import Stream, Handle - stream = Stream() - handle = Handle(stream) - - # call algos here - - # final sync of all work launched in the stream of this handle - # this is same as `raft.cuda.Stream.sync()` call, but safer in case - # the default stream inside the `handle_t` is being used - handle.sync() - del handle # optional! - """ - - def __cinit__(self, stream: Stream = None, n_streams=0): - self.n_streams = n_streams - if n_streams > 0: - self.stream_pool.reset(new cuda_stream_pool(n_streams)) - - cdef cuda_stream_view c_stream - if stream is None: - # this constructor will construct a "main" handle on - # per-thread default stream, which is non-blocking - self.c_obj.reset(new handle_t(cuda_stream_per_thread, - self.stream_pool)) - else: - # this constructor constructs a handle on user stream - c_stream = cuda_stream_view(stream.getStream()) - self.c_obj.reset(new handle_t(c_stream, - self.stream_pool)) - - def sync(self): - """ - Issues a sync on the stream set for this handle. - """ - self.c_obj.get()[0].sync_stream() - - def getHandle(self): - return self.c_obj.get() - - def __getstate__(self): - return self.n_streams - - def __setstate__(self, state): - self.n_streams = state - if self.n_streams > 0: - self.stream_pool.reset(new cuda_stream_pool(self.n_streams)) - - self.c_obj.reset(new handle_t(cuda_stream_per_thread, - self.stream_pool)) diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd deleted file mode 100644 index cb639c0f72..0000000000 --- a/python/pylibraft/pylibraft/common/interruptible.pxd +++ /dev/null @@ -1,34 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from libcpp.memory cimport shared_ptr -from rmm._lib.cuda_stream_view cimport cuda_stream_view - -cdef extern from "raft/interruptible.hpp" namespace "raft" nogil: - cdef cppclass interruptible: - void cancel() - -cdef extern from "raft/interruptible.hpp" \ - namespace "raft::interruptible" nogil: - cdef void inter_synchronize \ - "raft::interruptible::synchronize"(cuda_stream_view stream) except+ - cdef void inter_yield "raft::interruptible::yield"() except+ - cdef shared_ptr[interruptible] get_token() except+ diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx deleted file mode 100644 index 4dd337649b..0000000000 --- a/python/pylibraft/pylibraft/common/interruptible.pyx +++ /dev/null @@ -1,84 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -import contextlib -import signal -from cython.operator cimport dereference - -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from cuda.ccudart cimport cudaStream_t -from .cuda cimport Stream - - -@contextlib.contextmanager -def cuda_interruptible(): - ''' - Temporarily install a keyboard interrupt handler (Ctrl+C) - that cancels the enclosed interruptible C++ thread. - - Use this on a long-running C++ function imported via cython: - - .. code-block:: python - - with cuda_interruptible(): - my_long_running_function(...) - - It's also recommended to release the GIL during the call, to - make sure the handler has a chance to run: - - .. code-block:: python - - with cuda_interruptible(): - with nogil: - my_long_running_function(...) - - ''' - cdef shared_ptr[interruptible] token = get_token() - - def newhr(*args, **kwargs): - with nogil: - dereference(token).cancel() - - oldhr = signal.signal(signal.SIGINT, newhr) - try: - yield - finally: - signal.signal(signal.SIGINT, oldhr) - - -def synchronize(stream: Stream): - ''' - Same as cudaStreamSynchronize, but can be interrupted - if called within a `with cuda_interruptible()` block. - ''' - cdef cuda_stream_view c_stream = cuda_stream_view(stream.getStream()) - with nogil: - inter_synchronize(c_stream) - - -def cuda_yield(): - ''' - Check for an asynchronously received interrupted_exception. - Raises the exception if a user pressed Ctrl+C within a - `with cuda_interruptible()` block before. 
- ''' - with nogil: - inter_yield() diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 9b918396f6..3a04a802b5 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -77,8 +77,8 @@ def distance(X, Y, dists, metric="euclidean"): Parameters ---------- - X : CUDA array interface matrix shape (m, k) - Y : CUDA array interface matrix shape (n, k) + X : CUDA array interface compliant matrix shape (m, k) + Y : CUDA array interface compliant matrix shape (n, k) dists : Writable CUDA array interface matrix shape (m, n) metric : string denoting the metric type """ @@ -92,13 +92,12 @@ def distance(X, Y, dists, metric="euclidean"): n = y_cai["shape"][0] k = x_cai["shape"][1] - x_ptr = x_cai["data"][0] + x_ptr = x_cai["data"][0]1 y_ptr = y_cai["data"][0] d_ptr = dists_cai["data"][0] cdef handle_t *h = new handle_t() - # TODO: Support single and double precision x_dt = np.dtype(x_cai["typestr"]) y_dt = np.dtype(y_cai["typestr"]) d_dt = np.dtype(dists_cai["typestr"]) @@ -120,7 +119,8 @@ def distance(X, Y, dists, metric="euclidean"): n, k, distance_type, - True, 0.0) + True, + 0.0) elif x_dt == np.float64: pairwise_distance(deref(h), x_ptr, @@ -130,6 +130,7 @@ def distance(X, Y, dists, metric="euclidean"): n, k, distance_type, - True, 0.0) + True, + 0.0) else: raise ValueError("dtype %s not supported" % x_dt) diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index aaea7ea80e..594f6e2f66 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -45,8 +45,8 @@ def copy_to_host(self): .reshape(self.ndarray_.shape) -@pytest.mark.parametrize("n_rows", [10, 100]) -@pytest.mark.parametrize("n_cols", [10, 100]) +@pytest.mark.parametrize("n_rows", [100]) +@pytest.mark.parametrize("n_cols", [100]) 
@pytest.mark.parametrize("metric", ["euclidean", "cityblock", "chebyshev", "canberra", "correlation", "hamming", "jensenshannon", "russellrao"]) diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg index 478822b727..e1f4865ac9 100644 --- a/python/pylibraft/setup.cfg +++ b/python/pylibraft/setup.cfg @@ -52,7 +52,5 @@ skip= [options] packages = find: -install_requires = - numpy - numba>=0.49 +install_requires = numpy python_requires = >=3.7,<3.9 From 322d65974aedd6479d82a7053b5b42063e52f728 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 14:23:48 -0400 Subject: [PATCH 121/167] A couple additional updates to README.md --- README.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8e82780734..8cf1721373 100755 --- a/README.md +++ b/README.md @@ -111,9 +111,9 @@ The easiest way to install RAFT is through conda and several packages are provid - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives - `pyraft` (optional) contains reusable Python infrastructure and tools to accelerate Python algorithm development. -Use the following command to install RAFT with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages) +Use the following command to install RAFT with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. ```bash -conda install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft +mamba install -c rapidsai libraft-headers libraft-nn libraft-distance pyraft pylibraft ``` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ build. `COMPONENTS` are optional and will depend on the packages installed. 
@@ -131,8 +131,7 @@ set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") function(find_and_configure_raft) - set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC - COMPILE_LIBRARIES ENABLE_NN_DEPENDENCIES) + set(oneValueArgs VERSION FORK PINNED_TAG COMPILE_LIBRARIES) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -151,8 +150,6 @@ function(find_and_configure_raft) OPTIONS "BUILD_TESTS OFF" "BUILD_BENCH OFF" - "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" - "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" ) @@ -165,8 +162,6 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 FORK ${RAFT_FORK} PINNED_TAG ${RAFT_PINNED_TAG} COMPILE_LIBRARIES NO - ENABLE_NN_DEPENDENCIES NO - USE_FAISS_STATIC NO ) ``` @@ -183,8 +178,8 @@ Several CMake targets can be made available by adding components in the table be The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: 1. Create an environment with the needed dependencies: ``` -mamba env create --name raft_dev -f conda/environments/raft_dev_cuda11.5.yml -mamba activate raft_dev +mamba env create --name raft_dev_env -f conda/environments/raft_dev_cuda11.5.yml +mamba activate raft_dev_env ``` ``` ./build.sh pyraft pylibraft libraft tests bench --compile-libs From 42f56c94a749a26012fdfb604652dbaeb5c01b22 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 21 Mar 2022 14:26:42 -0400 Subject: [PATCH 122/167] Fixing typo --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 3a04a802b5..713a1d57d4 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -92,7 +92,7 @@ def distance(X, Y, dists, metric="euclidean"): n = y_cai["shape"][0] k = x_cai["shape"][1] - x_ptr = x_cai["data"][0]1 + x_ptr = x_cai["data"][0] y_ptr = y_cai["data"][0] d_ptr = dists_cai["data"][0] From eda6fcf9f600dc44409d01708f42108503dc5a6b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 14:53:56 -0400 Subject: [PATCH 123/167] slight re-brand --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8cf1721373..9a07e568e0 100755 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -#
 RAFT: RAPIDS Analytics Framework Toolkit
+#
 RAFT: Reusable Accelerated Functions and Tools
-RAFT (Reusable Algorithms, Functions, and other Tools) contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics in the [RAPIDS](https://rapids.ai) ecosystem. +RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. -By taking a primitives-based approach to algorithm development, RAFT +By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time - reduces the maintenance burden by maximizing reuse across projects, and - centralizes the core computations, allowing future optimizations to benefit all algorithms that use them. -The algorithms in RAFT span the following general categories: +The accelerated functions in RAFT span the following general categories: ##### | Category | Examples | | --- | --- | @@ -24,8 +24,8 @@ The algorithms in RAFT span the following general categories: RAFT provides a header-only C++ library and pre-compiled shared libraries that can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers. RAFT also provides 2 Python libraries: -- `pylibraft` - cython wrappers around RAFT algorithms and primitives. -- `pyraft` - reusable infrastructure for building analytics, such as tools for building multi-node multi-GPU algorithms that leverage [Dask](https://dask.org/). +- `pylibraft` - low-level Python wrappers around RAFT algorithms and primitives. +- `pyraft` - reusable infrastructure for building analytics, including tools for building both single-GPU and multi-node multi-GPU algorithms. ## Getting started From 0cf80b3dac47dac2a64072f36e94d7d66b217f23 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 21 Mar 2022 15:02:47 -0400 Subject: [PATCH 124/167] One more change --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9a07e568e0..4a04b49331 100755 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ RAFT contains fundamental widely-used algorithms and primitives for data science By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time - reduces the maintenance burden by maximizing reuse across projects, and -- centralizes the core computations, allowing future optimizations to benefit all algorithms that use them. +- centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them. The accelerated functions in RAFT span the following general categories: ##### @@ -29,9 +29,9 @@ RAFT also provides 2 Python libraries: ## Getting started -### Rapids Memory Manager (RMM) +### RAPIDS Memory Manager (RMM) -RAFT relies heavily on RMM which, like other projects in the RAPIDS ecosystem, eases the burden of configuring different allocation strategies globally across the libraries that use it. +RAFT relies heavily on RMM which eases the burden of configuring different allocation strategies globally across the libraries that use it. ### Multi-dimensional Arrays From 1ef83ff142d1c70f398f125bda7c2780229b7af2 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 15:12:03 -0400 Subject: [PATCH 125/167] More readme updates --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 4a04b49331..4f34bbc6b0 100755 --- a/README.md +++ b/README.md @@ -7,12 +7,11 @@ By taking a primitives-based approach to algorithm development, RAFT - reduces the maintenance burden by maximizing reuse across projects, and - centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them. 
-The accelerated functions in RAFT span the following general categories: +While not exhaustive, the following general categories help summarize the accelerated functions in RAFT: ##### | Category | Examples | | --- | --- | | **Data Formats** | sparse & dense, conversions, data generation | -| **Data Generation** | sparse, spatial, machine learning datasets | | **Dense Linear Algebra** | matrix arithmetic, norms, factorization, least squares, svd & eigenvalue problems | | **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction | | **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, labeling | From efa5e5a39873bd1897b8c7a724a543bf651d01df Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 17:07:09 -0400 Subject: [PATCH 126/167] Updates --- python/pylibraft/pylibraft/common/__init__.py | 1 + python/pylibraft/pylibraft/common/handle.pxd | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/python/pylibraft/pylibraft/common/__init__.py b/python/pylibraft/pylibraft/common/__init__.py index 273b4497cc..527e644b8f 100644 --- a/python/pylibraft/pylibraft/common/__init__.py +++ b/python/pylibraft/pylibraft/common/__init__.py @@ -12,3 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# + diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index 71dd8d516f..bc248a335b 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -33,8 +33,3 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: shared_ptr[cuda_stream_pool] stream_pool) except + cuda_stream_view get_stream() except + void sync_stream() except + - -cdef class Handle: - cdef unique_ptr[handle_t] c_obj - cdef shared_ptr[cuda_stream_pool] stream_pool - cdef int n_streams From f209b1c2ccbe999b4e0e32e3216b85bf551d8c64 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 18:14:32 -0400 Subject: [PATCH 127/167] Somehow missed a review comment --- ci/cpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f487e268f8..71228cb846 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -84,7 +84,7 @@ gpuci_mamba_retry install -c conda-forge boa # BUILD - Conda package builds ############################################################################### -if [ "$BUILD_LIBRAFT" == '1' ]; then +if [ "$BUILD_LIBRAFT" == "1" ]; then gpuci_logger "Building conda packages for libraft-nn, libraft-distance, and libraft-headers" if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libraft_headers From 187dfa4185c6c90451a947c1ace779fd1f1eb6ad Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 21 Mar 2022 18:38:12 -0400 Subject: [PATCH 128/167] Removing versions.json --- cpp/cmake/versions.json | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 cpp/cmake/versions.json diff --git a/cpp/cmake/versions.json b/cpp/cmake/versions.json deleted file mode 100644 index eac6e5b448..0000000000 --- a/cpp/cmake/versions.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "packages" : { - "Thrust" : { - "version" : "1.12.0", - "git_url" : "https://github.com/NVIDIA/thrust.git", - "git_tag" : "${version}", - "git_shallow" : true, - "exclude_from_all" : true - } - } -} \ No newline at end of file From 648f1c9fbc4d6cf9cff43fd4c0f1de192542ad99 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 21 Mar 2022 20:17:09 -0400 Subject: [PATCH 129/167] Fixing style --- cpp/include/raft/cache/cache_util.cuh | 2 +- cpp/include/raft/device_atomics.cuh | 2 +- cpp/include/raft/distance/distance.cuh | 1 - cpp/include/raft/sparse/csr.hpp | 2 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 32 +++++++++---------- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index 66f132d0c8..3e2222eff1 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index 8169d9f1dc..d1ca239170 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index e1905f30da..d8e60550ca 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -33,7 +33,6 @@ namespace raft { namespace distance { - /** * @brief Evaluate pairwise distances with the user epilogue lamba allowed * @tparam DistanceType which distance to evaluate diff --git a/cpp/include/raft/sparse/csr.hpp b/cpp/include/raft/sparse/csr.hpp index 3e0a6392c5..49fe980646 100644 --- a/cpp/include/raft/sparse/csr.hpp +++ b/cpp/include/raft/sparse/csr.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 44de653f79..a01145376a 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -57,12 +57,12 @@ void coo_symmetrize(COO* in, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ void symmetric_find_size(const value_t __restrict__ *data, - const value_idx __restrict__ *indices, - const value_idx n, - const int k, - value_idx __restrict__ *row_sizes, - value_idx __restrict__ *row_sizes2) +__global__ void symmetric_find_size(const value_t __restrict__* data, + const value_idx __restrict__* indices, + const value_idx n, + const int k, + value_idx __restrict__* row_sizes, + value_idx __restrict__* row_sizes2) { detail::symmetric_find_size(data, indices, n, k, row_sizes, row_sizes2); } @@ -79,9 +79,9 @@ __global__ void symmetric_find_size(const value_t __restrict__ *data, */ template __global__ 
void reduce_find_size(const value_idx n, - const int k, - value_idx __restrict__ *row_sizes, - const value_idx __restrict__ *row_sizes2) + const int k, + value_idx __restrict__* row_sizes, + const value_idx __restrict__* row_sizes2) { detail::reduce_find_size(n, k, row_sizes, row_sizes2); } @@ -104,13 +104,13 @@ __global__ void reduce_find_size(const value_idx n, */ template __global__ void symmetric_sum(value_idx* __restrict__ edges, - const value_t* __restrict__ data, - const value_idx* __restrict__ indices, - value_t* __restrict__ VAL, - value_idx* __restrict__ COL, - value_idx* __restrict__ ROW, - const value_idx n, - const int k) + const value_t* __restrict__ data, + const value_idx* __restrict__ indices, + value_t* __restrict__ VAL, + value_idx* __restrict__ COL, + value_idx* __restrict__ ROW, + const value_idx n, + const int k) { detail::symmetric_sum(edges, data, indices, VAL, COL, ROW, n, k); } From 6f6d9cc538a2c98473475a419c6ce5f48c07b277 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 22 Mar 2022 11:57:21 -0400 Subject: [PATCH 130/167] Adding INSTALL_EXPORT_SET for cuco, rmm, thrust --- cpp/cmake/thirdparty/get_cuco.cmake | 1 + cpp/cmake/thirdparty/get_rmm.cmake | 3 ++- cpp/cmake/thirdparty/get_thrust.cmake | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 3a70d34283..dc0bf6a029 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -20,6 +20,7 @@ function(find_and_configure_cuco VERSION) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index ffab703091..5a7d54ea4a 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -16,7 +16,8 @@ function(find_and_configure_rmm) include(${rapids-cmake-dir}/cpm/rmm.cmake) - rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports) + rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) endfunction() find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index a0ea09483b..b142f1ac0d 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -18,6 +18,7 @@ function(find_and_configure_thrust) rapids_cpm_thrust( NAMESPACE raft ) rapids_export_package(BUILD Thrust raft-exports) + rapids_export_package(INSTALL Thrust raft-exports) endfunction() find_and_configure_thrust() From 2ff3536767fc36861d473db655750b7eb6449157 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 22 Mar 2022 13:02:08 -0400 Subject: [PATCH 131/167] Calling single thrust function instead of two --- cpp/cmake/thirdparty/get_thrust.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index b142f1ac0d..03dfecde6a 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -16,9 +16,9 @@ function(find_and_configure_thrust) include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_thrust( NAMESPACE raft ) - rapids_export_package(BUILD Thrust raft-exports) - rapids_export_package(INSTALL Thrust raft-exports) + rapids_cpm_thrust( NAMESPACE raft + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) endfunction() find_and_configure_thrust() From 2834b62e808087c4d012c22be75a18cbeebb819a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 11:37:06 -0400 Subject: [PATCH 132/167] Adding docs for pylibraft --- README.md | 2 +- cpp/doxygen/main_page.md | 26 +++++++++------ docs/source/conf.py | 1 + docs/source/index.rst | 4 +-- docs/source/pylibraft_api.rst | 13 ++++++++ .../source/{python_api.rst => pyraft_api.rst} | 11 ++++--- docs/source/python.rst | 7 ++-- .../pylibraft/distance/distance_type.pxd | 4 +++ .../pylibraft/distance/pairwise_distance.pyx | 32 +++++++++++++++++-- 9 files changed, 79 insertions(+), 21 deletions(-) create mode 100644 docs/source/pylibraft_api.rst rename docs/source/{python_api.rst => pyraft_api.rst} (51%) diff --git a/README.md b/README.md index 4f34bbc6b0..f73d474efc 100755 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ While not exhaustive, the following general categories help summarize the accele | **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction | | **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, labeling | | **Basic Clustering** | spectral clustering, hierarchical clustering, 
k-means | -| **Optimization** | combinatorial optimization, iterative solvers | +| **Solvers** | combinatorial optimization, iterative solvers | | **Statistics** | sampling, moments and summary statistics, metrics | | **Distributed Tools** | multi-node multi-gpu infrastructure | diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 070a8f1f1d..adc6d75f1a 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -1,14 +1,22 @@ # libraft -RAFT (RAPIDS Analytics Framework Toolkit) is a library containing building-blocks for rapid composition of RAPIDS Analytics. These building-blocks include shared representations, mathematical computational primitives, and utilities that accelerate building analytics and data science algorithms in the RAPIDS ecosystem. Both the C++ and Python components can be included in consuming libraries, providing building-blocks for both dense and sparse matrix formats in the following general categories: +RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. + +By taking a primitives-based approach to algorithm development, RAFT +- accelerates algorithm construction time +- reduces the maintenance burden by maximizing reuse across projects, and +- centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them. 
+ +While not exhaustive, the following general categories help summarize the accelerated functions in RAFT: ##### -| Category | Description / Examples | +| Category | Examples | | --- | --- | -| **Data Formats** | tensor representations and conversions for both sparse and dense formats | -| **Data Generation** | graph, spatial, and machine learning dataset generation | -| **Dense Operations** | linear algebra, statistics | -| **Spatial** | pairwise distances, nearest neighbors, neighborhood / proximity graph construction | -| **Sparse/Graph Operations** | linear algebra, statistics, slicing, msf, spectral embedding/clustering, slhc, vertex degree | -| **Solvers** | eigenvalue decomposition, least squares, lanczos | -| **Tools** | multi-node multi-gpu communicator, utilities | +| **Data Formats** | sparse & dense, conversions, data generation | +| **Dense Linear Algebra** | matrix arithmetic, norms, factorization, least squares, svd & eigenvalue problems | +| **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction | +| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, labeling | +| **Basic Clustering** | spectral clustering, hierarchical clustering, k-means | +| **Solvers** | combinatorial optimization, iterative solvers | +| **Statistics** | sampling, moments and summary statistics, metrics | +| **Distributed Tools** | multi-node multi-gpu infrastructure | diff --git a/docs/source/conf.py b/docs/source/conf.py index 6fd7e3d702..bb9f0c1a84 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,6 +25,7 @@ # absolute, like shown here. 
sys.path.insert(0, os.path.abspath('sphinxext')) sys.path.insert(0, os.path.abspath('../../python/raft')) +sys.path.insert(0, os.path.abspath('../../python/pylibraft')) from github_link import make_linkcode_resolve # noqa diff --git a/docs/source/index.rst b/docs/source/index.rst index 0ce3ff1ccc..d047543c13 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,8 +8,8 @@ RAFT contains fundamental widely-used algorithms and primitives for data science :caption: Contents: cpp_api.rst - python_api.rst - + pyraft_api.rst + pylibraft_api.rst Indices and tables diff --git a/docs/source/pylibraft_api.rst b/docs/source/pylibraft_api.rst new file mode 100644 index 0000000000..4df0d9d01c --- /dev/null +++ b/docs/source/pylibraft_api.rst @@ -0,0 +1,13 @@ +~~~~~~~~~~~~~~~~~~~~~~~ +PyLibRAFT API Reference +~~~~~~~~~~~~~~~~~~~~~~~ + +.. role:: py(code) + :language: python + :class: highlight + + +Pairwise Distances +================== + +.. autofunction:: pylibraft.distance.pairwise_distance \ No newline at end of file diff --git a/docs/source/python_api.rst b/docs/source/pyraft_api.rst similarity index 51% rename from docs/source/python_api.rst rename to docs/source/pyraft_api.rst index 7b2cdcc5a5..c763c9a0f7 100644 --- a/docs/source/python_api.rst +++ b/docs/source/pyraft_api.rst @@ -7,11 +7,14 @@ PyRAFT API Reference :class: highlight -Multi-Node Multi-GPU Infrastructure -=================================== +RAFT Handle +----------- -Dask-based Communicator ------------------------ +.. autoclass:: raft.common.handle.Handle + :members: + +Dask-based Multi-Node Multi-GPU Communicator +-------------------------------------------- .. autoclass:: raft.dask.common.Comms :members: diff --git a/docs/source/python.rst b/docs/source/python.rst index 4600a3c31b..0ae9f88398 100644 --- a/docs/source/python.rst +++ b/docs/source/python.rst @@ -1,9 +1,10 @@ -Python API -========== +RAFT Python APIs +================ .. 
toctree:: :maxdepth: 2 :caption: Contents: - python_api.rst + pyraft_api.rst + pylibraft_api.rst diff --git a/python/pylibraft/pylibraft/distance/distance_type.pxd b/python/pylibraft/pylibraft/distance/distance_type.pxd index 2c01e42e53..ab865670bb 100644 --- a/python/pylibraft/pylibraft/distance/distance_type.pxd +++ b/python/pylibraft/pylibraft/distance/distance_type.pxd @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance": diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 713a1d57d4..ffb2f11110 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -13,6 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + import numpy as np from libc.stdint cimport uintptr_t @@ -74,14 +79,37 @@ def distance(X, Y, dists, metric="euclidean"): """ Compute pairwise distances between X and Y + Valid values for metric: + ["euclidean", "l2", "l1", "cityblock", "inner_product", + "chebyshev", "canberra", "lp", "hellinger", "jensenshannon", + "kl_divergence", "russellrao"] + Parameters ---------- X : CUDA array interface compliant matrix shape (m, k) Y : CUDA array interface compliant matrix shape (n, k) dists : Writable CUDA array interface matrix shape (m, n) - metric : string denoting the metric type - """ + metric : string denoting the metric type (default="euclidean") + + Examples + -------- + + .. 
code-block:: python + + import cupy as cp + + from pylibraft.distance import pairwise_distance + + n_samples = 5000 + n_features = 50 + + in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) + in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) + output = cp.empty((n_samples, n_samples), dtype=cp.float32) + + pairwise_distance(in1, in2, output, metric="euclidean") + """ # TODO: Validate inputs, shapes, etc... x_cai = X.__cuda_array_interface__ From b1e1ff88042590e771e60809de42aeb21760b23e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 11:55:45 -0400 Subject: [PATCH 133/167] pushing cuco dependency down to distance component --- cpp/CMakeLists.txt | 4 ++++ cpp/cmake/thirdparty/get_cuco.cmake | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e48508099f..99b14a4534 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -160,7 +160,10 @@ target_link_libraries(raft INTERFACE CUDA::cudart CUDA::cusparse rmm::rmm +<<<<<<< Updated upstream $<$:cuco::cuco> +======= +>>>>>>> Stashed changes std::mdspan) target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) @@ -243,6 +246,7 @@ endif() target_link_libraries(raft_distance INTERFACE raft::raft + $<$:cuco::cuco> $ $ ) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 3f2434d9be..81172a2d8e 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -19,8 +19,8 @@ function(find_and_configure_cuco VERSION) if(RAFT_ENABLE_cuco_DEPENDENCY) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + BUILD_EXPORT_SET raft-distance-exports + INSTALL_EXPORT_SET raft-distance-exports CPM_ARGS GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 From a7f859ea2a086d2b317df28c8bef5c348b76e232 Mon 
Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 12:47:49 -0400 Subject: [PATCH 134/167] Updating style --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index ffb2f11110..e667015ac8 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -104,8 +104,10 @@ def distance(X, Y, dists, metric="euclidean"): n_samples = 5000 n_features = 50 - in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) - in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) + in1 = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + in2 = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) output = cp.empty((n_samples, n_samples), dtype=cp.float32) pairwise_distance(in1, in2, output, metric="euclidean") From 454fe3ccac8a9a14da6781545dbb0dc8331ca536 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 12:55:51 -0400 Subject: [PATCH 135/167] Enabling cuco --- cpp/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 99b14a4534..4fba1b258e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -160,10 +160,6 @@ target_link_libraries(raft INTERFACE CUDA::cudart CUDA::cusparse rmm::rmm -<<<<<<< Updated upstream - $<$:cuco::cuco> -======= ->>>>>>> Stashed changes std::mdspan) target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) @@ -246,7 +242,7 @@ endif() target_link_libraries(raft_distance INTERFACE raft::raft - $<$:cuco::cuco> + $<$:cuco::cuco> $ $ ) From c22437b12d718a3769dd95011e31585176730c8a Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 23 Mar 2022 14:02:06 -0400 Subject: [PATCH 136/167] Removing faiss dependfency from tests. Skipping cuco dependency if not in components --- cpp/CMakeLists.txt | 9 ++++++++- cpp/cmake/thirdparty/get_cuco.cmake | 3 ++- cpp/test/CMakeLists.txt | 1 - 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4fba1b258e..2516118104 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -64,7 +64,10 @@ option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared li option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) -option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" ON) +option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" OFF) +if(raft IN_LIST raft_FIND_COMPONENTS) + set(RAFT_ENABLE_cuco_DEPENDENCY ON) +endif() # Currently, UCX and NCCL are only needed to build Pyraft and so a simple find_package() is sufficient option(RAFT_ENABLE_nccl_DEPENDENCY "Enable NCCL dependency" OFF) @@ -75,6 +78,7 @@ option(RAFT_EXCLUDE_FAISS_FROM_ALL "Exclude FAISS targets from RAFT's 'all' targ include(CMakeDependentOption) cmake_dependent_option(RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARIES OFF) +message(VERBOSE "RAFT: Building optional components: ${raft_FIND_COMPONENTS}") message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}") message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_BENCH}") message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") @@ -151,6 +155,9 @@ target_include_directories(raft INTERFACE "$" "$") +# Keep RAFT as lightweight as possible. +# Only CUDA libs, rmm, and mdspan should +# be used in global target. 
target_link_libraries(raft INTERFACE raft::Thrust $<$:CUDA::nvToolsExt> diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 81172a2d8e..ac2181ee53 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,7 +16,8 @@ function(find_and_configure_cuco VERSION) - if(RAFT_ENABLE_cuco_DEPENDENCY) + if(RAFT_ENABLE_cuco_DEPENDENCY OR RAFT_COMPILE_LIBRARIES OR + RAFT_COMPILE_DIST_LIBRARY) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-distance-exports diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index c03e5d6bcd..354b5e8fc4 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -164,7 +164,6 @@ target_link_libraries(test_raft raft::raft raft::distance raft::nn - faiss::faiss GTest::gtest GTest::gtest_main Threads::Threads From 3550d5022b1e6d970f287a159588eb0e7daa1dd3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 16:21:14 -0400 Subject: [PATCH 137/167] Passing faiss::faiss through raft::nn when dependencies are enabled --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2516118104..038fd644a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -293,6 +293,7 @@ endif() target_link_libraries(raft_nn INTERFACE raft::raft + $<$:faiss::faiss> $ $) From 828167905cf02be38a29a4787a3cece083f6469c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 21:27:21 -0400 Subject: [PATCH 138/167] Review feedback --- cpp/doxygen/main_page.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index adc6d75f1a..ff0c7820c7 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -3,7 +3,7 @@ RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. 
The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. By taking a primitives-based approach to algorithm development, RAFT -- accelerates algorithm construction time +- accelerates algorithm construction time, - reduces the maintenance burden by maximizing reuse across projects, and - centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them. From 0ed30332ca237d35dca6e57e350ade278a61d4a5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 23 Mar 2022 22:56:08 -0400 Subject: [PATCH 139/167] Making mdspan and thrust optional and turning off compile libraries by default. --- cpp/CMakeLists.txt | 13 ++++++++----- cpp/cmake/thirdparty/get_mdspan.cmake | 24 +++++++++++++----------- cpp/cmake/thirdparty/get_thrust.cmake | 10 ++++++---- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e48508099f..4ec2ad5b23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -48,7 +48,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) ############################################################################## # - User Options ------------------------------------------------------------ -option(BUILD_TESTS "Build raft unit-tests" ON) +option(BUILD_TESTS "Build raft unit-tests" OFF) option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF) @@ -59,12 +59,14 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) option(NVTX "Enable nvtx markers" OFF) option(RAFT_STATIC_LINK_LIBRARIES "Statically link compiled libraft libraries") -option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ON) +option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" OFF) option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest 
neighbors shared library instantiations" OFF) option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" ON) +option(RAFT_ENABLE_mdspan_DEPENDENCY "Enable mdspan dependency" ON) +option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) # Currently, UCX and NCCL are only needed to build Pyraft and so a simple find_package() is sufficient option(RAFT_ENABLE_nccl_DEPENDENCY "Enable NCCL dependency" OFF) @@ -152,7 +154,7 @@ target_include_directories(raft INTERFACE "$") target_link_libraries(raft INTERFACE - raft::Thrust + $<$:raft::Thrust> $<$:CUDA::nvToolsExt> CUDA::cublas CUDA::curand @@ -161,7 +163,8 @@ target_link_libraries(raft INTERFACE CUDA::cusparse rmm::rmm $<$:cuco::cuco> - std::mdspan) + $<$:std::mdspan> +) target_compile_definitions(raft INTERFACE $<$:NVTX_ENABLED>) target_compile_features(raft INTERFACE cxx_std_17 $) @@ -349,7 +352,7 @@ Imported Targets: set(code_string [=[ -if(NOT TARGET raft::Thrust) +if(RAFT_ENABLE_thrust_DEPENDENCY AND NOT TARGET raft::Thrust) thrust_create_target(raft::Thrust FROM_OPTIONS) endif() diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index 12ac7ab0fd..cc99de9f2c 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -13,17 +13,19 @@ # ============================================================================= function(find_and_configure_mdspan VERSION) - rapids_cpm_find( - mdspan ${VERSION} - GLOBAL_TARGETS std::mdspan - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/mdspan.git - GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 - OPTIONS "MDSPAN_ENABLE_CUDA ON" - "MDSPAN_CXX_STANDARD ON" - ) + if(RAFT_ENABLE_mdspan_DEPENDENCY) + 
rapids_cpm_find( + mdspan ${VERSION} + GLOBAL_TARGETS std::mdspan + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/rapidsai/mdspan.git + GIT_TAG b3042485358d2ee168ae2b486c98c2c61ec5aec1 + OPTIONS "MDSPAN_ENABLE_CUDA ON" + "MDSPAN_CXX_STANDARD ON" + ) + endif() endfunction() find_and_configure_mdspan(0.2.0) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 03dfecde6a..c533b04256 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -14,11 +14,13 @@ # Use CPM to find or clone thrust function(find_and_configure_thrust) - include(${rapids-cmake-dir}/cpm/thrust.cmake) + if(RAFT_ENABLE_thrust_DEPENDENCY) + include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_thrust( NAMESPACE raft - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) + rapids_cpm_thrust( NAMESPACE raft + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) + endif() endfunction() find_and_configure_thrust() From 3dbe7d4f776fe50af54cc9d787ea28a6f41a0eff Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 23 Mar 2022 22:59:53 -0400 Subject: [PATCH 140/167] Turning on compiling tests by default but not compiling shared libs --- cpp/CMakeLists.txt | 2 +- cpp/test/spatial/fused_l2_knn.cu | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4ec2ad5b23..eba801eaaf 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -48,7 +48,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) ############################################################################## # - User Options ------------------------------------------------------------ -option(BUILD_TESTS "Build raft unit-tests" OFF) +option(BUILD_TESTS "Build raft unit-tests" ON) option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF) diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 2ec4e86d1f..cfaa959d8a 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -25,6 +25,10 @@ #include #include +#if defined RAFT_NN_COMPILED +#include +#endif + #include #include From 3afbd024d15fb03f7c34d346e8e24d361e05078f Mon Sep 17 00:00:00 2001 From: "Corey J.
Nolet" Date: Wed, 23 Mar 2022 23:02:11 -0400 Subject: [PATCH 141/167] Fixing flag --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eba801eaaf..5fb1a75a97 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -154,7 +154,6 @@ target_include_directories(raft INTERFACE "$") target_link_libraries(raft INTERFACE - $<$:raft::Thrust> $<$:CUDA::nvToolsExt> CUDA::cublas CUDA::curand @@ -162,6 +161,7 @@ target_link_libraries(raft INTERFACE CUDA::cudart CUDA::cusparse rmm::rmm + $<$:raft::Thrust> $<$:cuco::cuco> $<$:std::mdspan> ) From b9a91a6d824a45512c4fdb7dca86df2146aa5784 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 14:18:06 -0400 Subject: [PATCH 142/167] Installing version string --- cpp/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 860c5b7197..3f83d0f03b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= +set(RAPIDS_VERSION "22.04") +set(RAFT_VERSION "${RAPIDS_VERSION}.00") cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/RAPIDS.cmake @@ -26,7 +28,7 @@ include(rapids-find) rapids_cuda_init_architectures(RAFT) -project(RAFT VERSION 22.04.00 LANGUAGES CXX CUDA) +project(RAFT VERSION ${RAFT_VERSION} LANGUAGES CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. 
Setting this flag ensures @@ -338,6 +340,9 @@ install(DIRECTORY include/raft install(FILES include/raft.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/raft/version_config.hpp + DESTINATION include/raft) + ############################################################################## # - install export ----------------------------------------------------------- set(doc_string From ed933ae4e2905b0f93a211da5de3ee53ff12898b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 14:48:33 -0400 Subject: [PATCH 143/167] Proper enabling/disable rules. Adding docs --- BUILD.md | 32 ++++++++++++++--------- build.sh | 4 +++ cpp/CMakeLists.txt | 6 ++++- cpp/cmake/thirdparty/get_cuco.cmake | 3 +-- cpp/cmake/thirdparty/get_libcudacxx.cmake | 9 ++++--- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/BUILD.md b/BUILD.md index ef2d1a2bda..8e02bb2d5d 100644 --- a/BUILD.md +++ b/BUILD.md @@ -26,12 +26,12 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some other dependencies below for building RAFT from source. Many of the dependencies are optional and depend only on the primitives being used. All of these can be installed with cmake or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm) and many of them can be installed with [conda](https://anaconda.org). #### Required -- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version. -- [mdspan](https://github.com/rapidsai/mdspan) #### Optional -- [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API +- [mdspan](https://github.com/rapidsai/mdspan) - On by default but can be disabled. 
+- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - On by default but can be disabled. +- [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API. - [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API and needed to build tests. - [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `Pyraft` @@ -110,11 +110,14 @@ RAFT's cmake has the following configurable flags available:. | --- | --- | --- | --- | | BUILD_TESTS | ON, OFF | ON | Compile Googletests | | BUILD_BENCH | ON, OFF | ON | Compile benchmarks | +| raft_FIND_COMPONENTS | nn distance | | Configures the optional components as a space-separated list | | RAFT_COMPILE_LIBRARIES | ON, OFF | OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | -| RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library | -| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library | +| RAFT_COMPILE_NN_LIBRARY | ON, OFF | OFF | Compiles the `libraft-nn` shared library | +| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. Needed for `raft::spatial::knn` | -| RAFT_ENABLE_cuco_DEPENDENCY | ON, OFF | ON | Enables the cuCollections dependency used by `raft::sparse::distance` | +| RAFT_ENABLE_cuco_DEPENDENCY | ON, OFF | OFF | Enables the cuCollections dependency used by `raft::sparse::distance`. This is turned on automatically when enabling the distance component | +| RAFT_ENABLE_thrust_DEPENDENCY | ON, OFF | ON | Enables the Thrust dependency. 
This can be disabled when using many simple utilities or to override with a different Thrust version. | +| RAFT_ENABLE_mdspan_DEPENDENCY | ON, OFF | ON | Enables the std::mdspan dependency. This can be disabled when using many simple utilities. | | RAFT_ENABLE_nccl_DEPENDENCY | ON, OFF | OFF | Enables NCCL dependency used by `raft::comms` and needed to build `pyraft` | | RAFT_ENABLE_ucx_DEPENDENCY | ON, OFF | OFF | Enables UCX dependency used by `raft::comms` and needed to build `pyraft` | | RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | @@ -212,7 +215,8 @@ set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC COMPILE_LIBRARIES ENABLE_NN_DEPENDENCIES CLONE_ON_PIN - USE_NN_LIBRARY USE_DISTANCE_LIBRARY) + USE_NN_LIBRARY USE_DISTANCE_LIBRARY + ENABLE_thrust_DEPENDENCY ENABLE_mdspan_DEPENDENCY) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -256,6 +260,8 @@ function(find_and_configure_raft) "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" + "RAFT_ENABLE_thrust_DEPENDENCY ${PKG_ENABLE_thrust_DEPENDENCY}" + "RAFT_ENABLE_mdspan_DEPENDENCY ${PKG_ENABLE_mdspan_DEPENDENCY}" ) endfunction() @@ -272,11 +278,13 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 # even if it's already installed. 
CLONE_ON_PIN ON - COMPILE_LIBRARIES NO - USE_NN_LIBRARY NO - USE_DISTANCE_LIBRARY NO - ENABLE_NN_DEPENDENCIES NO # This builds FAISS if not installed - USE_FAISS_STATIC NO + COMPILE_LIBRARIES NO + USE_NN_LIBRARY NO + USE_DISTANCE_LIBRARY NO + ENABLE_NN_DEPENDENCIES NO # This builds FAISS if not installed + USE_FAISS_STATIC NO + ENABLE_thrust_DEPENDENCY YES + ENABLE_mdspan_DEPENDENCY YES ) ``` diff --git a/build.sh b/build.sh index 0c3fbaccb6..b03db15379 100755 --- a/build.sh +++ b/build.sh @@ -62,6 +62,10 @@ COMPILE_LIBRARIES=OFF COMPILE_NN_LIBRARY=OFF COMPILE_DIST_LIBRARY=OFF ENABLE_NN_DEPENDENCIES=OFF + +ENABLE_thrust_DEPENDENCY +ENABLE_mdspan_DEPENDENCY + ENABLE_ucx_DEPENDENCY=OFF ENABLE_nccl_DEPENDENCY=OFF diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3f83d0f03b..04e67471c4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -69,10 +69,14 @@ option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" OFF) option(RAFT_ENABLE_mdspan_DEPENDENCY "Enable mdspan dependency" ON) option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) -if(raft IN_LIST raft_FIND_COMPONENTS) +if(distance IN_LIST raft_FIND_COMPONENTS OR RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) set(RAFT_ENABLE_cuco_DEPENDENCY ON) endif() +if(BUILD_TESTS) + set(RAFT_ENABLE_thrust_DEPENDENCY ON) +endif() + # Currently, UCX and NCCL are only needed to build Pyraft and so a simple find_package() is sufficient option(RAFT_ENABLE_nccl_DEPENDENCY "Enable NCCL dependency" OFF) option(RAFT_ENABLE_ucx_DEPENDENCY "Enable ucx dependency" OFF) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index ac2181ee53..81172a2d8e 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,8 +16,7 @@ function(find_and_configure_cuco VERSION) - if(RAFT_ENABLE_cuco_DEPENDENCY OR RAFT_COMPILE_LIBRARIES OR - 
RAFT_COMPILE_DIST_LIBRARY) + if(RAFT_ENABLE_cuco_DEPENDENCY) rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-distance-exports diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index a018341b24..f2c4d0e700 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -14,10 +14,13 @@ # This function finds libcudacxx and sets any additional necessary environment variables. function(find_and_configure_libcudacxx) - include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) + if(RAFT_ENABLE_cuco_DEPENDENCY) + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) + endif() endfunction() From 2f5df6ed55f310a9cd4999a85dc873760f1dc9da Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 14:57:39 -0400 Subject: [PATCH 144/167] Adding option to build.sh to disable thrust during header-only mode --- build.sh | 13 +++++++++---- cpp/CMakeLists.txt | 3 ++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/build.sh b/build.sh index b03db15379..4a228ef316 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss --no-thrust" HELP="$0 [ ...] [ ...]
where is: clean - remove all existing build artifacts and configuration (start over) @@ -63,8 +63,7 @@ COMPILE_NN_LIBRARY=OFF COMPILE_DIST_LIBRARY=OFF ENABLE_NN_DEPENDENCIES=OFF -ENABLE_thrust_DEPENDENCY -ENABLE_mdspan_DEPENDENCY +ENABLE_thrust_DEPENDENCY=ON ENABLE_ucx_DEPENDENCY=OFF ENABLE_nccl_DEPENDENCY=OFF @@ -109,6 +108,11 @@ fi if hasArg --install; then INSTALL_TARGET="install" fi + +if hasArg --no-thrust; then + ENABLE_thrust_DEPENDENCY=OFF +fi + if hasArg -v; then VERBOSE_FLAG="-v" CMAKE_LOG_LEVEL="VERBOSE" @@ -222,7 +226,8 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \ -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} \ -DRAFT_ENABLE_nccl_DEPENDENCY=${ENABLE_nccl_DEPENDENCY} \ - -DRAFT_ENABLE_ucx_DEPENDENCY=${ENABLE_ucx_DEPENDENCY} + -DRAFT_ENABLE_ucx_DEPENDENCY=${ENABLE_ucx_DEPENDENCY} \ + -DRAFT_ENABLE_thrust_DEPENDENCY=${ENABLE_thrust_DEPENDENCY} if [[ ${CMAKE_TARGET} != "" ]]; then echo "-- Compiling targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 04e67471c4..0f5ae0fa40 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -73,7 +73,8 @@ if(distance IN_LIST raft_FIND_COMPONENTS OR RAFT_COMPILE_LIBRARIES OR RAFT_COMPI set(RAFT_ENABLE_cuco_DEPENDENCY ON) endif() -if(BUILD_TESTS) +if(BUILD_TESTS AND NOT RAFT_ENABLE_thrust_DEPENDENCY) + message(VERBOSE "RAFT: BUILD_TESTS is enabled, overriding RAFT_ENABLE_thrust_DEPENDENCY") set(RAFT_ENABLE_thrust_DEPENDENCY ON) endif() From a74866bf1934063433b519c6d2ff92b02afe4cab Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 14:58:47 -0400 Subject: [PATCH 145/167] more docs --- build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sh b/build.sh index 4a228ef316..14f1e42a88 100755 --- a/build.sh +++ b/build.sh @@ -36,6 +36,7 @@ HELP="$0 [ ...] [ ...] 
--compile-libs - compile shared libraries for all components --compile-nn - compile shared library for nn component --compile-dist - compile shared library for distance component + --no-thrust - disable thrust dependency. can be useful for header-only install --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft --install - install cmake targets From c4ad1e63e8d891318e14dde45f4158e0ebf1663b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 15:01:16 -0400 Subject: [PATCH 146/167] Removing cuco from raft global links --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0f5ae0fa40..23862aaac1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -176,7 +176,6 @@ target_link_libraries(raft INTERFACE CUDA::cusparse rmm::rmm $<$:raft::Thrust> - $<$:cuco::cuco> $<$:std::mdspan> ) From a94cc62dbdfcbd1e468c3656a1d2138492668ed5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 15:03:29 -0400 Subject: [PATCH 147/167] adding no-thrust example to build instructions --- BUILD.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/BUILD.md b/BUILD.md index 8e02bb2d5d..8e5e617131 100644 --- a/BUILD.md +++ b/BUILD.md @@ -53,6 +53,11 @@ The following example will download the needed dependencies and install the RAFT ./build.sh libraft --install ``` +The `--no-thrust` flag can be used to install the headers with minimal dependencies: +```bash +./build.sh libraft --install --no-thrust +``` + ### C++ Shared Libraries (optional) For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs.
Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: From 70eec1e2e3dbb7164e070b8c31cd6068646195e5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 15:22:35 -0400 Subject: [PATCH 148/167] Moving public things over to core/ package --- cpp/include/raft/common/logger.hpp | 282 +------- cpp/include/raft/common/nvtx.hpp | 138 +--- cpp/include/raft/comms/comms.hpp | 626 +---------------- cpp/include/raft/core/comms.hpp | 633 +++++++++++++++++ cpp/include/raft/core/cublas_macros.hpp | 126 ++++ cpp/include/raft/core/cudart_utils.hpp | 409 +++++++++++ cpp/include/raft/core/cusolver_macros.hpp | 122 ++++ cpp/include/raft/core/cusparse_macros.hpp | 123 ++++ cpp/include/raft/core/error.hpp | 181 +++++ cpp/include/raft/core/handle.hpp | 344 +++++++++ cpp/include/raft/core/interruptible.hpp | 276 ++++++++ cpp/include/raft/core/logger.hpp | 298 ++++++++ cpp/include/raft/core/mdarray.hpp | 650 ++++++++++++++++++ cpp/include/raft/core/nvtx.hpp | 155 +++++ cpp/include/raft/core/span.hpp | 282 ++++++++ cpp/include/raft/cudart_utils.h | 392 +---------- cpp/include/raft/error.hpp | 164 +---- cpp/include/raft/handle.hpp | 327 +-------- cpp/include/raft/interruptible.hpp | 259 +------ cpp/include/raft/linalg/cublas_macros.h | 108 +-- cpp/include/raft/linalg/cusolver_macros.h | 104 +-- cpp/include/raft/mdarray.hpp | 635 +---------------- cpp/include/raft/span.hpp | 267 +------ .../raft/sparse/detail/cusparse_macros.h | 108 +-- 24 files changed, 3634 insertions(+), 3375 deletions(-) create mode 100644 cpp/include/raft/core/comms.hpp create mode 100644 cpp/include/raft/core/cublas_macros.hpp create mode 100644 cpp/include/raft/core/cudart_utils.hpp create mode 100644 cpp/include/raft/core/cusolver_macros.hpp create mode 100644 cpp/include/raft/core/cusparse_macros.hpp create mode 100644 cpp/include/raft/core/error.hpp create mode 100644 cpp/include/raft/core/handle.hpp create mode 100644 cpp/include/raft/core/interruptible.hpp 
create mode 100644 cpp/include/raft/core/logger.hpp create mode 100644 cpp/include/raft/core/mdarray.hpp create mode 100644 cpp/include/raft/core/nvtx.hpp create mode 100644 cpp/include/raft/core/span.hpp diff --git a/cpp/include/raft/common/logger.hpp b/cpp/include/raft/common/logger.hpp index 9066e103d0..0a4c7044bc 100644 --- a/cpp/include/raft/common/logger.hpp +++ b/cpp/include/raft/common/logger.hpp @@ -15,284 +15,4 @@ */ #pragma once -#include - -#include - -#include -#include -#include -#include -#include - -#include - -#define SPDLOG_HEADER_ONLY -#include -#include // NOLINT -#include // NOLINT - -/** - * @defgroup logging levels used in raft - * - * @note exactly match the corresponding ones (but reverse in terms of value) - * in spdlog for wrapping purposes - * - * @{ - */ -#define RAFT_LEVEL_TRACE 6 -#define RAFT_LEVEL_DEBUG 5 -#define RAFT_LEVEL_INFO 4 -#define RAFT_LEVEL_WARN 3 -#define RAFT_LEVEL_ERROR 2 -#define RAFT_LEVEL_CRITICAL 1 -#define RAFT_LEVEL_OFF 0 -/** @} */ - -#if !defined(RAFT_ACTIVE_LEVEL) -#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_DEBUG -#endif - -namespace raft { - -static const std::string RAFT_NAME = "raft"; -static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); - -/** - * @defgroup CStringFormat Expand a C-style format string - * - * @brief Expands C-style formatted string into std::string - * - * @param[in] fmt format string - * @param[in] vl respective values for each of format modifiers in the string - * - * @return the expanded `std::string` - * - * @{ - */ -std::string format(const char* fmt, va_list& vl) -{ - char buf[4096]; - vsnprintf(buf, sizeof(buf), fmt, vl); - return std::string(buf); -} - -std::string format(const char* fmt, ...) 
-{ - va_list vl; - va_start(vl, fmt); - std::string str = format(fmt, vl); - va_end(vl); - return str; -} -/** @} */ - -int convert_level_to_spdlog(int level) -{ - level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); - return RAFT_LEVEL_TRACE - level; -} - -/** - * @brief The main Logging class for raft library. - * - * This class acts as a thin wrapper over the underlying `spdlog` interface. The - * design is done in this way in order to avoid us having to also ship `spdlog` - * header files in our installation. - * - * @todo This currently only supports logging to stdout. Need to add support in - * future to add custom loggers as well [Issue #2046] - */ -class logger { - public: - // @todo setting the logger once per process with - logger(std::string const& name_ = "") - : sink{std::make_shared()}, - spdlogger{std::make_shared(name_, sink)}, - cur_pattern() - { - set_pattern(default_log_pattern); - set_level(RAFT_LEVEL_INFO); - } - /** - * @brief Singleton method to get the underlying logger object - * - * @return the singleton logger object - */ - static logger& get(std::string const& name = "") - { - if (log_map.find(name) == log_map.end()) { - log_map[name] = std::make_shared(name); - } - return *log_map[name]; - } - - /** - * @brief Set the logging level. - * - * Only messages with level equal or above this will be printed - * - * @param[in] level logging level - * - * @note The log level will actually be set only if the input is within the - * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll - * be ignored. See documentation of decisiontree for how this gets used - */ - void set_level(int level) - { - level = convert_level_to_spdlog(level); - spdlogger->set_level(static_cast(level)); - } - - /** - * @brief Set the logging pattern - * - * @param[in] pattern the pattern to be set. 
Refer this link - * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting - * to know the right syntax of this pattern - */ - void set_pattern(const std::string& pattern) - { - cur_pattern = pattern; - spdlogger->set_pattern(pattern); - } - - /** - * @brief Register a callback function to be run in place of usual log call - * - * @param[in] callback the function to be run on all logged messages - */ - void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } - - /** - * @brief Register a flush function compatible with the registered callback - * - * @param[in] flush the function to use when flushing logs - */ - void set_flush(void (*flush)()) { sink->set_flush(flush); } - - /** - * @brief Tells whether messages will be logged for the given log level - * - * @param[in] level log level to be checked for - * @return true if messages will be logged for this level, else false - */ - bool should_log_for(int level) const - { - level = convert_level_to_spdlog(level); - auto level_e = static_cast(level); - return spdlogger->should_log(level_e); - } - - /** - * @brief Query for the current log level - * - * @return the current log level - */ - int get_level() const - { - auto level_e = spdlogger->level(); - return RAFT_LEVEL_TRACE - static_cast(level_e); - } - - /** - * @brief Get the current logging pattern - * @return the pattern - */ - std::string get_pattern() const { return cur_pattern; } - - /** - * @brief Main logging method - * - * @param[in] level logging level of this message - * @param[in] fmt C-like format string, followed by respective params - */ - void log(int level, const char* fmt, ...) 
- { - level = convert_level_to_spdlog(level); - auto level_e = static_cast(level); - // explicit check to make sure that we only expand messages when required - if (spdlogger->should_log(level_e)) { - va_list vl; - va_start(vl, fmt); - auto msg = format(fmt, vl); - va_end(vl); - spdlogger->log(level_e, msg); - } - } - - /** - * @brief Flush logs by calling flush on underlying logger - */ - void flush() { spdlogger->flush(); } - - ~logger() {} - - private: - logger(); - - static inline std::unordered_map> log_map; - std::shared_ptr sink; - std::shared_ptr spdlogger; - std::string cur_pattern; - int cur_level; -}; // class logger - -}; // namespace raft - -/** - * @defgroup loggerMacros Helper macros for dealing with logging - * @{ - */ -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) -#define RAFT_LOG_TRACE(fmt, ...) \ - do { \ - std::stringstream ss; \ - ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ - ss << raft::detail::format(fmt, ##__VA_ARGS__); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_TRACE(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) -#define RAFT_LOG_DEBUG(fmt, ...) \ - do { \ - std::stringstream ss; \ - ss << raft::format("%s:%d ", __FILE__, __LINE__); \ - ss << raft::format(fmt, ##__VA_ARGS__); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_DEBUG(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) -#define RAFT_LOG_INFO(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_INFO(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) -#define RAFT_LOG_WARN(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_WARN(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) -#define RAFT_LOG_ERROR(fmt, ...) 
\ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_ERROR(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) -#define RAFT_LOG_CRITICAL(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_CRITICAL(fmt, ...) void(0) -#endif -/** @} */ +#include \ No newline at end of file diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp index 918d5e10d8..6125c937ea 100644 --- a/cpp/include/raft/common/nvtx.hpp +++ b/cpp/include/raft/common/nvtx.hpp @@ -16,140 +16,4 @@ #pragma once -#include "detail/nvtx.hpp" -#include - -/** - * \section Usage - * - * To add NVTX ranges to your code, use the `nvtx::range` RAII object. A - * range begins when the object is created, and ends when the object is - * destroyed. - * - * The example below creates nested NVTX ranges. The range `fun_scope` spans - * the whole function, while the range `epoch_scope` spans an iteration - * (and appears 5 times in the timeline). - * \code{.cpp} - * #include - * void some_function(int k){ - * // Begins a NVTX range with the messsage "some_function_{k}" - * // The range ends when some_function() returns - * common::nvtx::range fun_scope( r{"some_function_%d", k}; - * - * for(int i = 0; i < 5; i++){ - * common::nvtx::range epoch_scope{"epoch-%d", i}; - * // some logic inside the loop - * } - * } - * \endcode - * - * \section Domains - * - * All NVTX ranges are assigned to domains. A domain defines a named timeline in - * the Nsight Systems view. By default, we put all ranges into a domain `domain::app` - * named "application". This is controlled by the template parameter `Domain`. - * - * The example below defines a domain and uses it in a function. 
- * \code{.cpp} - * #include - * - * struct my_app_domain { - * static constexpr char const* name{"my application"}; - * } - * - * void some_function(int k){ - * // This NVTX range appears in the timeline named "my application" in Nsight Systems. - * common::nvtx::range fun_scope( r{"some_function_%d", k}; - * // some logic inside the loop - * } - * \endcode - */ -namespace raft::common::nvtx { - -namespace domain { - -/** @brief The default NVTX domain. */ -struct app { - static constexpr char const* name{"application"}; -}; - -/** @brief This NVTX domain is supposed to be used within raft. */ -struct raft { - static constexpr char const* name{"raft"}; -}; - -} // namespace domain - -/** - * @brief Push a named NVTX range. - * - * @tparam Domain optional struct that defines the NVTX domain message; - * You can create a new domain with a custom message as follows: - * \code{.cpp} - * struct custom_domain { static constexpr char const* name{"custom message"}; } - * \endcode - * NB: make sure to use the same domain for `push_range` and `pop_range`. - * @param format range name format (accepts printf-style arguments) - * @param args the arguments for the printf-style formatting - */ -template -inline void push_range(const char* format, Args... args) -{ - detail::push_range(format, args...); -} - -/** - * @brief Pop the latest range. - * - * @tparam Domain optional struct that defines the NVTX domain message; - * You can create a new domain with a custom message as follows: - * \code{.cpp} - * struct custom_domain { static constexpr char const* name{"custom message"}; } - * \endcode - * NB: make sure to use the same domain for `push_range` and `pop_range`. - */ -template -inline void pop_range() -{ - detail::pop_range(); -} - -/** - * @brief Push a named NVTX range that would be popped at the end of the object lifetime. - * - * Refer to \ref Usage for the usage examples. 
- * - * @tparam Domain optional struct that defines the NVTX domain message; - * You can create a new domain with a custom message as follows: - * \code{.cpp} - * struct custom_domain { static constexpr char const* name{"custom message"}; } - * \endcode - */ -template -class range { - public: - /** - * Push a named NVTX range. - * At the end of the object lifetime, pop the range back. - * - * @param format range name format (accepts printf-style arguments) - * @param args the arguments for the printf-style formatting - */ - template - explicit range(const char* format, Args... args) - { - push_range(format, args...); - } - - ~range() { pop_range(); } - - /* This object is not meant to be touched. */ - range(const range&) = delete; - range(range&&) = delete; - auto operator=(const range&) -> range& = delete; - auto operator=(range&&) -> range& = delete; - static auto operator new(std::size_t) -> void* = delete; - static auto operator new[](std::size_t) -> void* = delete; -}; - -} // namespace raft::common::nvtx +#include \ No newline at end of file diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index b30a4648a6..00627e6654 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -13,631 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/comms.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
*/ -#ifndef __RAFT_RT_COMMS_H -#define __RAFT_RT_COMMS_H - #pragma once -#include -#include -#include - -namespace raft { -namespace comms { - -typedef unsigned int request_t; -enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; -enum class op_t { SUM, PROD, MIN, MAX }; - -/** - * The resulting status of distributed stream synchronization - */ -enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted -}; - -template -constexpr datatype_t - -get_type(); - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::CHAR; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT8; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::INT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::INT64; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::UINT64; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::FLOAT32; -} - -template <> -constexpr datatype_t - -get_type() -{ - return datatype_t::FLOAT64; -} - -class comms_iface { - public: - virtual ~comms_iface() {} - - virtual int get_size() const = 0; - - virtual int get_rank() const = 0; - - virtual std::unique_ptr comm_split(int color, int key) const = 0; - - virtual void barrier() const = 0; - - virtual status_t sync_stream(cudaStream_t stream) const = 0; - - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - - virtual void waitall(int count, request_t array_of_requests[]) const = 0; - - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - 
op_t op, - cudaStream_t stream) const = 0; - - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const = 0; - - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - 
std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; -}; - -class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. - * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - 
* @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - 
cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array 
(of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, - get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, - value_t* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes 
numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; -}; - -} // namespace comms -} // namespace raft - -#endif +#include \ No newline at end of file diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp new file mode 100644 index 0000000000..b93924b3d2 --- /dev/null +++ b/cpp/include/raft/core/comms.hpp @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { + namespace comms { + + typedef unsigned int request_t; + enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; + enum class op_t { SUM, PROD, MIN, MAX }; + +/** + * The resulting status of distributed stream synchronization + */ + enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted + }; + + template + constexpr datatype_t + + get_type(); + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::CHAR; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::UINT8; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::INT32; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::UINT32; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::INT64; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::UINT64; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::FLOAT32; + } + + template <> + constexpr datatype_t + + get_type() + { + return datatype_t::FLOAT64; + } + + class comms_iface { + public: + virtual ~comms_iface() {} + + virtual int get_size() const = 0; + + virtual int get_rank() const = 0; + + virtual std::unique_ptr comm_split(int color, int key) const = 0; + + virtual void barrier() const = 0; + + virtual status_t 
sync_stream(cudaStream_t stream) const = 0; + + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + + virtual void waitall(int count, request_t array_of_requests[]) const = 0; + + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use 
device_sendrecv to avoid deadlock + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const = 0; + + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; + }; + + class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. 
+ * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
+ */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* 
sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place 
the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, 
+ get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + 
std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; + auto sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; + }; + + } // namespace comms +} // namespace raft diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp new file mode 100644 index 0000000000..5a96444e45 --- /dev/null +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cublas_macros.hpp instead. 
+ */ + +#ifndef __RAFT_RT_CUBLAS_MACROS_H +#define __RAFT_RT_CUBLAS_MACROS_H + +#pragma once + +#include +#include + +///@todo: enable this once we have logger enabled +//#include + +#include + +#define _CUBLAS_ERR_TO_STR(err) \ + case err: return #err + +namespace raft { + +/** + * @brief Exception thrown when a cuBLAS error is encountered. + */ + struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} + }; + + namespace linalg { + namespace detail { + + inline const char* cublas_error_to_string(cublasStatus_t err) + { + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; + } + + } // namespace detail + } // namespace linalg +} // namespace raft + +#undef _CUBLAS_ERR_TO_STR + +/** + * @brief Error checking macro for cuBLAS runtime API functions. 
+ * + * Invokes a cuBLAS runtime API function call, if the call does not return + * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred + */ +#define RAFT_CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUBLAS_TRY +#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUBLAS_TRY_NO_THROW(call) \ + do { \ + cublasStatus_t const status = call; \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + } \ + } while (0) + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK +#define CUBLAS_CHECK(call) CUBLAS_TRY(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK_NO_THROW +#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp new file mode 100644 index 0000000000..ecc840b77e --- /dev/null +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cudart_utils.hpp instead. + */ + +#ifndef __RAFT_RT_CUDART_UTILS_H +#define __RAFT_RT_CUDART_UTILS_H + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +///@todo: enable once logging has been enabled in raft +//#include "logger.hpp" + +namespace raft { + +/** + * @brief Exception thrown when a CUDA error is encountered. + */ + struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} + }; + +} // namespace raft + +/** + * @brief Error checking macro for CUDA runtime API functions. 
+ * + * Invokes a CUDA runtime API function call, if the call does not return + * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an + * exception detailing the CUDA error that occurred + * + */ +#define RAFT_CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUDA_TRY +#define CUDA_TRY(call) RAFT_CUDA_TRY(call) +#endif + +/** + * @brief Debug macro to check for CUDA errors + * + * In a non-release build, this macro will synchronize the specified stream + * before error checking. In both release and non-release builds, this macro + * checks for any pending CUDA errors from previous calls. If an error is + * reported, an exception is thrown detailing the CUDA error that occurred. + * + * The intent of this macro is to provide a mechanism for synchronous and + * deterministic execution for debugging asynchronous CUDA execution. It should + * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an + * asynchronous kernel launch. + */ +#ifndef NDEBUG +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#else +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); +#endif + +// FIXME: Remove after consumers rename +#ifndef CHECK_CUDA +#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUDA_CHECK +#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. 
+// */ +#define RAFT_CUDA_TRY_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ + } while (0) + +// FIXME: Remove after cuml rename +#ifndef CUDA_CHECK_NO_THROW +#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) +#endif + +/** + * Alias to raft scope for now. + * TODO: Rename original implementations in 22.04 to fix + * https://github.com/rapidsai/raft/issues/128 + */ + +namespace raft { + +/** Helper method to get to know warp size in device code */ + __host__ __device__ constexpr inline int warp_size() { return 32; } + + __host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to threads. + */ + class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, 
"overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } + }; + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to warps. + */ + class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; + +/** + * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping + * elements to blocks. 
+ */ + class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; + +/** + * @brief Generic copy method for all kinds of transfers + * @tparam Type data type + * @param dst destination pointer + * @param src source pointer + * @param len lenth of the src/dst buffers in terms of number of elements + * @param stream cuda stream + */ + template + void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) + { + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + } + +/** + * @defgroup Copy Copy methods + * These are here along with the generic 'copy' method in order to improve + * code readability using explicitly specified function names + * @{ + */ +/** performs a host to device copy */ + template + void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) + { + copy(d_ptr, h_ptr, len, stream); + } + +/** performs a device to host copy */ + template + void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) + { + copy(h_ptr, d_ptr, len, stream); + } + + template + void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, 
rmm::cuda_stream_view stream) + { + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + } +/** @} */ + +/** + * @defgroup Debug Utils for debugging host/device buffers + * @{ + */ + template + void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) + { + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; + } + + template + void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) + { + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; + } +/** @} */ + +/** helper method to get max usable shared mem per block parameter */ + inline int getSharedMemPerBlock() + { + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; + } + +/** helper method to get multi-processor count parameter */ + inline int getMultiProcessorCount() + { + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return mpCount; + } + +/** helper method to convert an array on device to a string on host */ + template + std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) + { + std::stringstream ss; + + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; + + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; + + 
free(arr_h); + + return ss.str(); + } + +/** this seems to be unused, but may be useful in the future */ + template + void ASSERT_DEVICE_MEM(T* ptr, std::string name) + { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; + } + + inline uint32_t curTimeMillis() + { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); + } + +/** Helper function to calculate need memory for allocate to store dense matrix. + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ + inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } + +/** Helper function to check alignment of pointer. + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ + template + bool is_aligned(Type* ptr, size_t alignment) + { + return reinterpret_cast(ptr) % alignment == 0; + } + +/** calculate greatest common divisor of two numbers + * @a integer + * @b integer + * @ return gcd of a and b + */ + template + IntType gcd(IntType a, IntType b) + { + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; + } + +} // namespace raft + +#endif diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp new file mode 100644 index 0000000000..87bca0e4e0 --- /dev/null +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cusolver_macros.hpp instead. + */ + +#ifndef __RAFT_RT_CUSOLVER_MACROS_H +#define __RAFT_RT_CUSOLVER_MACROS_H + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include +#include +#include + +#define _CUSOLVER_ERR_TO_STR(err) \ + case err: return #err; + +namespace raft { + +/** + * @brief Exception thrown when a cuSOLVER error is encountered. 
+ */ + struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} + }; + + namespace linalg { + + inline const char* cusolver_error_to_string(cusolverStatus_t err) + { + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; + } + + } // namespace linalg +} // namespace raft + +#undef _CUSOLVER_ERR_TO_STR + +/** + * @brief Error checking macro for cuSOLVER runtime API functions. + * + * Invokes a cuSOLVER runtime API function call, if the call does not return + * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred + */ +#define RAFT_CUSOLVER_TRY(call) \ + do { \ + cusolverStatus_t const status = (call); \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cusolver_error_to_string(status)); \ + throw raft::cusolver_error(msg); \ + } \ + } while (0) + +// FIXME: remove after consumer rename +#ifndef CUSOLVER_TRY +#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. 
+// */ +#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ + do { \ + cusolverStatus_t const status = call; \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::detail::cusolver_error_to_string(status)); \ + } \ + } while (0) + +// FIXME: remove after cuml rename +#ifndef CUSOLVER_CHECK +#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) +#endif + +#ifndef CUSOLVER_CHECK_NO_THROW +#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) +#endif + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp new file mode 100644 index 0000000000..1983dadec8 --- /dev/null +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include + +#define _CUSPARSE_ERR_TO_STR(err) \ + case err: return #err; + +// Notes: +//(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; +//(2.) to enforce a lower version, +// +//`#define CUDA_ENFORCE_LOWER +// #include ` +// +// (i.e., before including this header) +// +#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) + +namespace raft { + +/** + * @brief Exception thrown when a cuSparse error is encountered. 
+ */ + struct cusparse_error : public raft::exception { + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} + }; + + namespace sparse { + namespace detail { + + inline const char* cusparse_error_to_string(cusparseStatus_t err) + { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 + return cusparseGetErrorString(err); +#else // CUDART_VERSION + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: return "CUSPARSE_STATUS_UNKNOWN"; + }; +#endif // CUDART_VERSION + } + + } // namespace detail + } // namespace sparse +} // namespace raft + +#undef _CUSPARSE_ERR_TO_STR + +/** + * @brief Error checking macro for cuSparse runtime API functions. 
+ * + * Invokes a cuSparse runtime API function call, if the call does not return + * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + */ +#define RAFT_CUSPARSE_TRY(call) \ + do { \ + cusparseStatus_t const status = (call); \ + if (CUSPARSE_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::sparse::detail::cusparse_error_to_string(status)); \ + throw raft::cusparse_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_TRY +#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) +#endif + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK +#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) +#endif + +//@todo: use logger here once logging is enabled +/** check for cusparse runtime API errors but do not assert */ +#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ + } while (0) + +// FIXME: Remove after consumer rename +#ifndef CUSPARSE_CHECK_NO_THROW +#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) +#endif diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp new file mode 100644 index 0000000000..8b49715f79 --- /dev/null +++ b/cpp/include/raft/core/error.hpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/*
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @warning This file is deprecated and will be removed in release 22.06.
 * Please use the include/raft_runtime/error.hpp instead.
 */

#ifndef __RAFT_RT_ERROR
#define __RAFT_RT_ERROR

#pragma once

// NOTE(review): the original #include targets were lost in extraction; the set
// below is reconstructed from what the code uses -- confirm against the repo.
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <execinfo.h>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>

namespace raft {

/** base exception class for the whole of raft */
class exception : public std::exception {
 public:
  /** default ctor */
  explicit exception() noexcept : std::exception(), msg_() {}

  /** copy ctor: re-collects the stack so the trace reflects the copy site */
  exception(exception const& src) noexcept : std::exception(), msg_(src.what())
  {
    collect_call_stack();
  }

  /**
   * ctor from an input message.
   *
   * NOTE(review): parameter was `std::string const msg`; the top-level const
   * made `std::move(msg)` silently copy. Dropping the const (which does not
   * change the signature seen by callers) makes the move real.
   */
  explicit exception(std::string msg) noexcept : std::exception(), msg_(std::move(msg))
  {
    collect_call_stack();
  }

  /** get the message associated with this exception */
  char const* what() const noexcept override { return msg_.c_str(); }

 private:
  /** message associated with this exception */
  std::string msg_;

  /** append call stack info to this exception's message for ease of debug */
  // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
  void collect_call_stack() noexcept
  {
#ifdef __GNUC__
    constexpr int kMaxStackDepth = 64;
    void* stack[kMaxStackDepth];  // NOLINT
    auto depth = backtrace(stack, kMaxStackDepth);
    std::ostringstream oss;
    oss << std::endl << "Obtained " << depth << " stack frames" << std::endl;
    char** strings = backtrace_symbols(stack, depth);
    if (strings == nullptr) {
      oss << "But no stack trace could be found!" << std::endl;
      msg_ += oss.str();
      return;
    }
    ///@todo: support for demangling of C++ symbol names
    for (int i = 0; i < depth; ++i) {
      oss << "#" << i << " in " << strings[i] << std::endl;
    }
    free(strings);
    msg_ += oss.str();
#endif  // __GNUC__
  }
};

/**
 * @brief Exception thrown when logical precondition is violated.
 *
 * This exception should not be thrown directly and is instead thrown by the
 * RAFT_EXPECTS and RAFT_FAIL macros.
 *
 */
struct logic_error : public raft::exception {
  explicit logic_error(char const* const message) : raft::exception(message) {}
  explicit logic_error(std::string const& message) : raft::exception(message) {}
};

}  // namespace raft

// FIXME: Need to be replaced with RAFT_FAIL
/** macro to throw a runtime error */
// NOTE(review): fixed typo in the emitted message: "occured" -> "occurred".
#define THROW(fmt, ...)                                                                       \
  do {                                                                                        \
    int size1 =                                                                               \
      std::snprintf(nullptr, 0, "exception occurred! file=%s line=%d: ", __FILE__, __LINE__); \
    int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__);                                \
    if (size1 < 0 || size2 < 0)                                                               \
      throw raft::exception("Error in snprintf, cannot handle raft exception.");              \
    auto size = size1 + size2 + 1; /* +1 for final '\0' */                                    \
    auto buf  = std::make_unique<char[]>(size_t(size));                                       \
    std::snprintf(buf.get(),                                                                  \
                  size1 + 1 /* +1 for '\0' */,                                                \
                  "exception occurred! file=%s line=%d: ",                                    \
                  __FILE__,                                                                   \
                  __LINE__);                                                                  \
    std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__);        \
    std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */           \
    throw raft::exception(msg);                                                               \
  } while (0)

// FIXME: Need to be replaced with RAFT_EXPECTS
/** macro to check for a conditional and assert on failure */
#define ASSERT(check, fmt, ...)                  \
  do {                                           \
    if (!(check)) THROW(fmt, ##__VA_ARGS__);     \
  } while (0)

/**
 * Macro to append error message to first argument.
 * This should only be called in contexts where it is OK to throw exceptions!
 */
#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                                         \
  do {                                                                                        \
    int size1 = std::snprintf(nullptr, 0, "%s", location_prefix);                             \
    int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__);           \
    int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__);                                \
    if (size1 < 0 || size2 < 0 || size3 < 0)                                                  \
      throw raft::exception("Error in snprintf, cannot handle raft exception.");              \
    auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */                            \
    auto buf  = std::make_unique<char[]>(size_t(size));                                       \
    std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix);             \
    std::snprintf(                                                                            \
      buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__,          \
      __LINE__);                                                                              \
    std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt,                \
                  ##__VA_ARGS__);                                                             \
    msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */        \
  } while (0)

/**
 * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false
 *
 * @param[in] cond Expression that evaluates to true or false
 * @param[in] fmt String literal description of the reason that cond is expected to be true with
 * optional format tags
 * @throw raft::logic_error if the condition evaluates to false.
 */
#define RAFT_EXPECTS(cond, fmt, ...)                              \
  do {                                                            \
    if (!(cond)) {                                                \
      std::string msg{};                                          \
      SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
      throw raft::logic_error(msg);                               \
    }                                                             \
  } while (0)

/**
 * @brief Indicates that an erroneous code path has been taken.
 *
 * @param[in] fmt String literal description of the reason that this code path is erroneous with
 * optional format tags
 * @throw always throws raft::logic_error
 */
#define RAFT_FAIL(fmt, ...)                                     \
  do {                                                          \
    std::string msg{};                                          \
    SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \
    throw raft::logic_error(msg);                               \
  } while (0)

#endif

/*
 * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @warning This file is deprecated and will be removed in release 22.06.
 * Please use the include/raft_runtime/handle.hpp instead.
 */
+ */ + +#ifndef __RAFT_RT_HANDLE +#define __RAFT_RT_HANDLE + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +///@todo: enable once we have migrated cuml-comms layer too +//#include + +#include "cudart_utils.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Main handle object that stores all necessary context used for calling + * necessary cuda kernels and/or libraries + */ + class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) = delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { 
+ RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? 
stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = 
true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } + }; // class handle_t + +/** + * @brief RAII approach to synchronizing across all streams in the handle + */ + class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; + }; // class stream_syncer + +} // namespace raft + +#endif \ No 
newline at end of file diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp new file mode 100644 index 0000000000..43b64ce430 --- /dev/null +++ b/cpp/include/raft/core/interruptible.hpp @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the include/raft_runtime/interruptible.hpp instead. + */ + +#ifndef __RAFT_RT_INTERRUPTIBLE_H +#define __RAFT_RT_INTERRUPTIBLE_H + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Exception thrown during `interruptible::synchronize` call when it detects a request + * to cancel the work performed in this CPU thread. + */ + struct interrupted_exception : public raft::exception { + using raft::exception::exception; + }; + +/** + * @brief Cooperative-style interruptible execution. + * + * This class provides facilities for interrupting execution of a C++ thread at designated points + * in code from outside of the thread. In particular, it provides an interruptible version of the + * blocking CUDA synchronization function, that allows dropping a long-running GPU work. + * + * + * **Important:** Although CUDA synchronize calls serve as cancellation points, the interruptible + * machinery has nothing to do with CUDA streams or events. 
In other words, when you call `cancel`, + * it’s the CPU waiting function what is interrupted, not the GPU stream work. This means, when the + * `interrupted_exception` is raised, any unfinished GPU stream work continues to run. It’s the + * responsibility of the developer then to make sure the unfinished stream work does not affect the + * program in an undesirable way. + * + * + * What can happen to CUDA stream when the `synchronize` is cancelled? If you catch the + * `interrupted_exception` immediately, you can safely wait on the stream again. + * Otherwise, some of the allocated resources may be released before the active kernel finishes + * using them, which will result in writing into deallocated or reallocated memory and undefined + * behavior in general. A dead-locked kernel may never finish (or may crash if you’re lucky). In + * practice, the outcome is usually acceptable for the use case of emergency program interruption + * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and + * resuming of the GPU stream work. + */ + class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } + + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. 
+ * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } + + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } + + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } + + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. 
In that case, using its + * `cancel` method has no effect. + * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. + */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } + + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. + */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; + + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; + + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. 
When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. + auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } + } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; + } + + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; + + interruptible() noexcept { yield_no_throw_impl(); } + + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } + + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } + + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } + }; + +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp new file mode 100644 index 0000000000..a25f3fa4a6 --- /dev/null +++ b/cpp/include/raft/core/logger.hpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#define SPDLOG_HEADER_ONLY +#include +#include // NOLINT +#include // NOLINT + +/** + * @defgroup logging levels used in raft + * + * @note exactly match the corresponding ones (but reverse in terms of value) + * in spdlog for wrapping purposes + * + * @{ + */ +#define RAFT_LEVEL_TRACE 6 +#define RAFT_LEVEL_DEBUG 5 +#define RAFT_LEVEL_INFO 4 +#define RAFT_LEVEL_WARN 3 +#define RAFT_LEVEL_ERROR 2 +#define RAFT_LEVEL_CRITICAL 1 +#define RAFT_LEVEL_OFF 0 +/** @} */ + +#if !defined(RAFT_ACTIVE_LEVEL) +#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_DEBUG +#endif + +namespace raft { + + static const std::string RAFT_NAME = "raft"; + static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); + +/** + * @defgroup CStringFormat Expand a C-style format string + * + * @brief Expands C-style formatted string into std::string + * + * @param[in] fmt format string + * @param[in] vl respective values for each of format modifiers in the string + * + * @return the expanded `std::string` + * + * @{ + */ + std::string format(const char* fmt, va_list& vl) + { + char buf[4096]; + vsnprintf(buf, sizeof(buf), fmt, vl); + return std::string(buf); + } + + std::string format(const char* fmt, ...) + { + va_list vl; + va_start(vl, fmt); + std::string str = format(fmt, vl); + va_end(vl); + return str; + } +/** @} */ + + int convert_level_to_spdlog(int level) + { + level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); + return RAFT_LEVEL_TRACE - level; + } + +/** + * @brief The main Logging class for raft library. + * + * This class acts as a thin wrapper over the underlying `spdlog` interface. The + * design is done in this way in order to avoid us having to also ship `spdlog` + * header files in our installation. + * + * @todo This currently only supports logging to stdout. 
Need to add support in + * future to add custom loggers as well [Issue #2046] + */ + class logger { + public: + // @todo setting the logger once per process with + logger(std::string const& name_ = "") + : sink{std::make_shared()}, + spdlogger{std::make_shared(name_, sink)}, + cur_pattern() + { + set_pattern(default_log_pattern); + set_level(RAFT_LEVEL_INFO); + } + /** + * @brief Singleton method to get the underlying logger object + * + * @return the singleton logger object + */ + static logger& get(std::string const& name = "") + { + if (log_map.find(name) == log_map.end()) { + log_map[name] = std::make_shared(name); + } + return *log_map[name]; + } + + /** + * @brief Set the logging level. + * + * Only messages with level equal or above this will be printed + * + * @param[in] level logging level + * + * @note The log level will actually be set only if the input is within the + * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll + * be ignored. See documentation of decisiontree for how this gets used + */ + void set_level(int level) + { + level = convert_level_to_spdlog(level); + spdlogger->set_level(static_cast(level)); + } + + /** + * @brief Set the logging pattern + * + * @param[in] pattern the pattern to be set. 
Refer this link + * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting + * to know the right syntax of this pattern + */ + void set_pattern(const std::string& pattern) + { + cur_pattern = pattern; + spdlogger->set_pattern(pattern); + } + + /** + * @brief Register a callback function to be run in place of usual log call + * + * @param[in] callback the function to be run on all logged messages + */ + void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } + + /** + * @brief Register a flush function compatible with the registered callback + * + * @param[in] flush the function to use when flushing logs + */ + void set_flush(void (*flush)()) { sink->set_flush(flush); } + + /** + * @brief Tells whether messages will be logged for the given log level + * + * @param[in] level log level to be checked for + * @return true if messages will be logged for this level, else false + */ + bool should_log_for(int level) const + { + level = convert_level_to_spdlog(level); + auto level_e = static_cast(level); + return spdlogger->should_log(level_e); + } + + /** + * @brief Query for the current log level + * + * @return the current log level + */ + int get_level() const + { + auto level_e = spdlogger->level(); + return RAFT_LEVEL_TRACE - static_cast(level_e); + } + + /** + * @brief Get the current logging pattern + * @return the pattern + */ + std::string get_pattern() const { return cur_pattern; } + + /** + * @brief Main logging method + * + * @param[in] level logging level of this message + * @param[in] fmt C-like format string, followed by respective params + */ + void log(int level, const char* fmt, ...) 
+ { + level = convert_level_to_spdlog(level); + auto level_e = static_cast(level); + // explicit check to make sure that we only expand messages when required + if (spdlogger->should_log(level_e)) { + va_list vl; + va_start(vl, fmt); + auto msg = format(fmt, vl); + va_end(vl); + spdlogger->log(level_e, msg); + } + } + + /** + * @brief Flush logs by calling flush on underlying logger + */ + void flush() { spdlogger->flush(); } + + ~logger() {} + + private: + logger(); + + static inline std::unordered_map> log_map; + std::shared_ptr sink; + std::shared_ptr spdlogger; + std::string cur_pattern; + int cur_level; + }; // class logger + +}; // namespace raft + +/** + * @defgroup loggerMacros Helper macros for dealing with logging + * @{ + */ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) +#define RAFT_LOG_TRACE(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::detail::format(fmt, ##__VA_ARGS__); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_TRACE(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) +#define RAFT_LOG_DEBUG(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::format(fmt, ##__VA_ARGS__); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_DEBUG(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) +#define RAFT_LOG_INFO(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_INFO(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) +#define RAFT_LOG_WARN(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_WARN(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) +#define RAFT_LOG_ERROR(fmt, ...) 
\ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_ERROR(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) +#define RAFT_LOG_CRITICAL(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_CRITICAL(fmt, ...) void(0) +#endif +/** @} */ diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp new file mode 100644 index 0000000000..502686786e --- /dev/null +++ b/cpp/include/raft/core/mdarray.hpp @@ -0,0 +1,650 @@ +/* + * Copyright (2019) Sandia Corporation + * + * The source code is licensed under the 3-clause BSD license found in the LICENSE file + * thirdparty/LICENSES/mdarray.license + */ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +namespace raft { +/** + * @\brief C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. + */ + using layout_c_contiguous = detail::stdex::layout_right; + +/** + * @\brief F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. + */ + using layout_f_contiguous = detail::stdex::layout_left; + +/** + * @brief stdex::mdspan with device tag to avoid accessing incorrect memory location. 
+ */ + template > + using device_mdspan = detail::stdex:: + mdspan>; + +/** + * @brief stdex::mdspan with host tag to avoid accessing incorrect memory location. + */ + template > + using host_mdspan = + detail::stdex::mdspan>; + +/** + * @brief Modified from the c++ mdarray proposal + * + * https://isocpp.org/files/papers/D1684R0.html + * + * mdarray is a container type for mdspan with similar template arguments. However there + * are some inconsistencies in between them. We have made some modificiations to fit our + * needs, which are listed below. + * + * - Layout policy is different, the mdarray in raft uses `stdex::extent` directly just + * like `mdspan`, while the `mdarray` in the reference implementation uses varidic + * template. + * + * - Most of the constructors from the reference implementation is removed to make sure + * CUDA stream is honorred. + * + * - unique_size is not implemented, which is still working in progress in the proposal + * + * - For container policy, we adopt the alternative approach documented in the proposal + * [sec 2.4.3], which requires an additional make_accessor method for it to be used in + * mdspan. The container policy reference implementation has multiple `access` methods + * that accommodate needs for both mdarray and mdspan. This is more difficult for us + * since the policy might contain states that are unwanted inside a CUDA kernel. Also, + * on host we return a proxy to the actual value as `device_ref` so different access + * methods will have different return type, which is less desirable. + * + * - For the above reasons, copying from other mdarray with different policy type is also + * removed. 
+ */ + template + class mdarray { + static_assert(!std::is_const::value, + "Element type for container must not be const."); + + public: + using extents_type = Extents; + using layout_type = LayoutPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = ElementType; + + using value_type = std::remove_cv_t; + using index_type = std::size_t; + using difference_type = std::ptrdiff_t; + // Naming: ref impl: container_policy_type, proposal: container_policy + using container_policy_type = ContainerPolicy; + using container_type = typename container_policy_type::container_type; + + using pointer = typename container_policy_type::pointer; + using const_pointer = typename container_policy_type::const_pointer; + using reference = typename container_policy_type::reference; + using const_reference = typename container_policy_type::const_reference; + + private: + template , + typename container_policy_type::const_accessor_policy, + typename container_policy_type::accessor_policy>> + using view_type_impl = + std::conditional_t, + device_mdspan>; + + public: + /** + * \brief the mdspan type returned by view method. 
+ */ + using view_type = view_type_impl; + using const_view_type = view_type_impl; + + public: + constexpr mdarray() noexcept(std::is_nothrow_default_constructible_v) + : cp_{rmm::cuda_stream_default}, c_{cp_.create(0)} {}; + constexpr mdarray(mdarray const&) noexcept(std::is_nothrow_copy_constructible_v) = + default; + constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = + default; + + constexpr auto operator =(mdarray const&) noexcept( + std::is_nothrow_copy_assignable::value) -> mdarray& = default; + constexpr auto operator =(mdarray&&) noexcept( + std::is_nothrow_move_assignable::value) -> mdarray& = default; + + ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; + +#ifndef RAFT_MDARRAY_CTOR_CONSTEXPR +#if !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) +// 11.0: +// Error: Internal Compiler Error (codegen): "there was an error in verifying the lgenfe output!" +// +// 11.2: +// Call parameter type does not match function signature! +// i8** null +// i8* %call14 = call i32 null(void (i8*)* null, i8* null, i8** null), !dbg !1060 +// : parse Invalid record (Producer: 'LLVM7.0.1' Reader: 'LLVM 7.0.1') +#define RAFT_MDARRAY_CTOR_CONSTEXPR constexpr +#else +#define RAFT_MDARRAY_CTOR_CONSTEXPR +#endif // !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) +#endif // RAFT_MDARRAY_CTOR_CONSTEXPR + + /** + * @brief The only constructor that can create storage, this is to make sure CUDA stream is being + * used. + */ + RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type const& cp) + : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) + { + } + RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type& cp) + : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) + { + } + +#undef RAFT_MDARRAY_CTOR_CONSTEXPR + + /** + * @brief Get a mdspan that can be passed down to CUDA kernels. 
+ */ + auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } + /** + * @brief Get a mdspan that can be passed down to CUDA kernels. + */ + auto view() const noexcept + { + return const_view_type(c_.data(), map_, cp_.make_accessor_policy()); + } + + [[nodiscard]] constexpr auto size() const noexcept -> index_type { return this->view().size(); } + + [[nodiscard]] auto data() noexcept -> pointer { return c_.data(); } + [[nodiscard]] constexpr auto data() const noexcept -> const_pointer { return c_.data(); } + + /** + * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. + */ + template + auto operator()(IndexType&&... indices) + -> std::enable_if_t && ...) && + std::is_constructible_v && + std::is_constructible_v, + /* device policy is not default constructible due to requirement for CUDA + stream. */ + /* std::is_default_constructible_v */ + reference> + { + return cp_.access(c_, map_(std::forward(indices)...)); + } + + /** + * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. + */ + template + auto operator()(IndexType&&... indices) const + -> std::enable_if_t && ...) && + std::is_constructible_v && + std::is_constructible::value, + /* device policy is not default constructible due to requirement for CUDA + stream. 
*/ + /* std::is_default_constructible_v */ + const_reference> + { + return cp_.access(c_, map_(std::forward(indices)...)); + } + + // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> index_type + { + return extents_type::rank(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> index_type + { + return extents_type::rank_dynamic(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept + -> index_type + { + return extents_type::static_extent(r); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type + { + return map_.extents(); + } + /** + * @brief the extent of rank r + */ + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type + { + return map_.extents().extent(r); + } + // mapping + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type + { + return map_; + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool + { + return map_.is_unique(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_contiguous() const noexcept -> bool + { + return map_.is_contiguous(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool + { + return map_.is_strided(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type + { + return map_.stride(r); + } + + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool + { + return mapping_type::is_always_unique(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_contiguous() noexcept -> bool + { + return mapping_type::is_always_contiguous(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() 
noexcept -> bool + { + return mapping_type::is_always_strided(); + } + + private: + template + friend class mdarray; + + private: + container_policy_type cp_; + mapping_type map_; + container_type c_; + }; + +/** + * @brief mdarray with host container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ + template > + using host_mdarray = + mdarray>; + +/** + * @brief mdarray with device container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ + template > + using device_mdarray = + mdarray>; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + */ + template + using host_scalar = host_mdarray; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + */ + template + using device_scalar = device_mdarray; + +/** + * @brief Shorthand for 1-dim host mdarray. + * @tparam ElementType the data type of the vector elements + */ + template + using host_vector = host_mdarray; + +/** + * @brief Shorthand for 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + */ + template + using device_vector = device_mdarray; + +/** + * @brief Shorthand for c-contiguous host matrix. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + */ + template + using host_matrix = host_mdarray; + +/** + * @brief Shorthand for c-contiguous device matrix. 
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + */ + template + using device_matrix = device_mdarray; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + */ + template + using host_scalar_view = host_mdspan; + +/** + * @brief Shorthand for 0-dim host mdspan (scalar). + * @tparam ElementType the data type of the scalar element + */ + template + using device_scalar_view = device_mdspan; + +/** + * @brief Shorthand for 1-dim host mdspan. + * @tparam ElementType the data type of the vector elements + */ + template + using host_vector_view = host_mdspan; + +/** + * @brief Shorthand for 1-dim device mdspan. + * @tparam ElementType the data type of the vector elements + */ + template + using device_vector_view = device_mdspan; + +/** + * @brief Shorthand for c-contiguous host matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * + */ + template + using host_matrix_view = host_mdspan; + +/** + * @brief Shorthand for c-contiguous device matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * + */ + template + using device_matrix_view = device_mdspan; + +/** + * @brief Create a 0-dim (scalar) mdspan instance for host value. + * + * @tparam ElementType the data type of the matrix elements + * @param[in] ptr on device to wrap + */ + template + auto make_host_scalar_view(ElementType* ptr) + { + detail::scalar_extent extents; + return host_scalar_view{ptr, extents}; + } + +/** + * @brief Create a 0-dim (scalar) mdspan instance for device value. 
+ * + * @tparam ElementType the data type of the matrix elements + * @param[in] ptr on device to wrap + */ + template + auto make_device_scalar_view(ElementType* ptr) + { + detail::scalar_extent extents; + return device_scalar_view{ptr, extents}; + } + +/** + * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on host to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ + template + auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) + { + detail::matrix_extent extents{n_rows, n_cols}; + return host_matrix_view{ptr, extents}; + } +/** + * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ + template + auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) + { + detail::matrix_extent extents{n_rows, n_cols}; + return device_matrix_view{ptr, extents}; + } + +/** + * @brief Create a 1-dim mdspan instance for host pointer. + * @tparam ElementType the data type of the vector elements + * @param[in] ptr on host to wrap + * @param[in] n number of elements in pointer + * @return raft::host_vector_view + */ + template + auto make_host_vector_view(ElementType* ptr, size_t n) + { + detail::vector_extent extents{n}; + return host_matrix_view{ptr, extents}; + } + +/** + * @brief Create a 1-dim mdspan instance for device pointer. 
+ * @tparam ElementType the data type of the vector elements + * @param[in] ptr on device to wrap + * @param[in] n number of elements in pointer + * @return raft::device_vector_view + */ + template + auto make_device_vector_view(ElementType* ptr, size_t n) + { + detail::vector_extent extents{n}; + return device_matrix_view{ptr, extents}; + } + +/** + * @brief Create a 2-dim c-contiguous host mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] n_rows number or rows in matrix + * @param[in] n_cols number of columns in matrix + * @return raft::host_matrix + */ + template + auto make_host_matrix(size_t n_rows, size_t n_cols) + { + detail::matrix_extent extents{n_rows, n_cols}; + using policy_t = typename host_matrix::container_policy_type; + policy_t policy; + return host_matrix{extents, policy}; + } + +/** + * @brief Create a 2-dim c-contiguous device mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] n_rows number or rows in matrix + * @param[in] n_cols number of columns in matrix + * @param[in] stream cuda stream for ordering events + * @return raft::device_matrix + */ + template + auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream) + { + detail::matrix_extent extents{n_rows, n_cols}; + using policy_t = typename device_matrix::container_policy_type; + policy_t policy{stream}; + return device_matrix{extents, policy}; + } + +/** + * @brief Create a 2-dim c-contiguous device mdarray. 
+ * + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] handle raft handle for managing expensive resources + * @param[in] n_rows number or rows in matrix + * @param[in] n_cols number of columns in matrix + * @return raft::device_matrix + */ + template + auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols) + { + return make_device_matrix(n_rows, n_cols, handle.get_stream()); + } + +/** + * @brief Create a host scalar from v. + * + * @tparam ElementType the data type of the scalar element + * @param[in] v scalar type to wrap + * @return raft::host_scalar + */ + template + auto make_host_scalar(ElementType const& v) + { + // FIXME(jiamingy): We can optimize this by using std::array as container policy, which + // requires some more compile time dispatching. This is enabled in the ref impl but + // hasn't been ported here yet. + detail::scalar_extent extents; + using policy_t = typename host_scalar::container_policy_type; + policy_t policy; + auto scalar = host_scalar{extents, policy}; + scalar(0) = v; + return scalar; + } + +/** + * @brief Create a device scalar from v. + * + * @tparam ElementType the data type of the scalar element + * @param[in] v scalar type to wrap on device + * @param[in] stream the cuda stream for ordering events + * @return raft::device_scalar + */ + template + auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream) + { + detail::scalar_extent extents; + using policy_t = typename device_scalar::container_policy_type; + policy_t policy{stream}; + auto scalar = device_scalar{extents, policy}; + scalar(0) = v; + return scalar; + } + +/** + * @brief Create a device scalar from v. 
+ * + * @tparam ElementType the data type of the scalar element + * @param[in] handle raft handle for managing expensive cuda resources + * @param[in] v scalar to wrap on device + * @return raft::device_scalar + */ + template + auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) + { + return make_device_scalar(v, handle.get_stream()); + } + +/** + * @brief Create a 1-dim host mdarray. + * @tparam ElementType the data type of the vector elements + * @param[in] n number of elements in vector + * @return raft::host_vector + */ + template + auto make_host_vector(size_t n) + { + detail::vector_extent extents{n}; + using policy_t = typename host_vector::container_policy_type; + policy_t policy; + return host_vector{extents, policy}; + } + +/** + * @brief Create a 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + * @param[in] n number of elements in vector + * @param[in] stream the cuda stream for ordering events + * @return raft::device_vector + */ + template + auto make_device_vector(size_t n, rmm::cuda_stream_view stream) + { + detail::vector_extent extents{n}; + using policy_t = typename device_vector::container_policy_type; + policy_t policy{stream}; + return device_vector{extents, policy}; + } + +/** + * @brief Create a 1-dim device mdarray. + * @tparam ElementType the data type of the vector elements + * @param[in] handle raft handle for managing expensive cuda resources + * @param[in] n number of elements in vector + * @return raft::device_vector + */ + template + auto make_device_vector(raft::handle_t const& handle, size_t n) + { + return make_device_vector(n, handle.get_stream()); + } +} // namespace raft diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp new file mode 100644 index 0000000000..65f3204a06 --- /dev/null +++ b/cpp/include/raft/core/nvtx.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/nvtx.hpp" +#include + +/** + * \section Usage + * + * To add NVTX ranges to your code, use the `nvtx::range` RAII object. A + * range begins when the object is created, and ends when the object is + * destroyed. + * + * The example below creates nested NVTX ranges. The range `fun_scope` spans + * the whole function, while the range `epoch_scope` spans an iteration + * (and appears 5 times in the timeline). + * \code{.cpp} + * #include + * void some_function(int k){ + * // Begins a NVTX range with the messsage "some_function_{k}" + * // The range ends when some_function() returns + * common::nvtx::range fun_scope( r{"some_function_%d", k}; + * + * for(int i = 0; i < 5; i++){ + * common::nvtx::range epoch_scope{"epoch-%d", i}; + * // some logic inside the loop + * } + * } + * \endcode + * + * \section Domains + * + * All NVTX ranges are assigned to domains. A domain defines a named timeline in + * the Nsight Systems view. By default, we put all ranges into a domain `domain::app` + * named "application". This is controlled by the template parameter `Domain`. + * + * The example below defines a domain and uses it in a function. + * \code{.cpp} + * #include + * + * struct my_app_domain { + * static constexpr char const* name{"my application"}; + * } + * + * void some_function(int k){ + * // This NVTX range appears in the timeline named "my application" in Nsight Systems. 
+ * common::nvtx::range fun_scope( r{"some_function_%d", k}; + * // some logic inside the loop + * } + * \endcode + */ +namespace raft::common::nvtx { + + namespace domain { + +/** @brief The default NVTX domain. */ + struct app { + static constexpr char const* name{"application"}; + }; + +/** @brief This NVTX domain is supposed to be used within raft. */ + struct raft { + static constexpr char const* name{"raft"}; + }; + + } // namespace domain + +/** + * @brief Push a named NVTX range. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + * NB: make sure to use the same domain for `push_range` and `pop_range`. + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + inline void push_range(const char* format, Args... args) + { + detail::push_range(format, args...); + } + +/** + * @brief Pop the latest range. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + * NB: make sure to use the same domain for `push_range` and `pop_range`. + */ + template + inline void pop_range() + { + detail::pop_range(); + } + +/** + * @brief Push a named NVTX range that would be popped at the end of the object lifetime. + * + * Refer to \ref Usage for the usage examples. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + */ + template + class range { + public: + /** + * Push a named NVTX range. 
+ * At the end of the object lifetime, pop the range back. + * + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + explicit range(const char* format, Args... args) + { + push_range(format, args...); + } + + ~range() { pop_range(); } + + /* This object is not meant to be touched. */ + range(const range&) = delete; + range(range&&) = delete; + auto operator=(const range&) -> range& = delete; + auto operator=(range&&) -> range& = delete; + static auto operator new(std::size_t) -> void* = delete; + static auto operator new[](std::size_t) -> void* = delete; + }; + +} // namespace raft::common::nvtx diff --git a/cpp/include/raft/core/span.hpp b/cpp/include/raft/core/span.hpp new file mode 100644 index 0000000000..d9177b8a3d --- /dev/null +++ b/cpp/include/raft/core/span.hpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include // size_t +#include // std::byte +#include +#include +#include // __host__ __device__ +#include +#include + +namespace raft { +/** + * @brief The span class defined in ISO C++20. Iterator is defined as plain pointer and + * most of the methods have bound check on debug build. 
+ * + * @code + * rmm::device_uvector uvec(10, rmm::cuda_stream_default); + * auto view = device_span{uvec.data(), uvec.size()}; + * @endcode + */ + template + class span { + public: + using element_type = T; + using value_type = typename std::remove_cv::type; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = thrust::reverse_iterator; + using const_reverse_iterator = thrust::reverse_iterator; + + /** + * @brief Default constructor that constructs a span with size 0 and nullptr. + */ + constexpr span() noexcept = default; + + /** + * @brief Constructs a span that is a view over the range [first, first + count); + */ + constexpr span(pointer ptr, size_type count) noexcept : storage_{ptr, count} + { + assert(!(Extent != dynamic_extent && count != Extent)); + assert(ptr || count == 0); + } + /** + * @brief Constructs a span that is a view over the range [first, last) + */ + constexpr span(pointer first, pointer last) noexcept + : span{first, static_cast(thrust::distance(first, last))} + { + } + /** + * @brief Constructs a span that is a view over the array arr. + */ + template + constexpr span(element_type (&arr)[N]) noexcept : span{&arr[0], N} + { + } + + /** + * @brief Initialize a span class from another one who's underlying type is convertible + * to element_type. 
+ */ + template ::value && + detail::is_allowed_extent_conversion_t::value>> + constexpr span(const span& other) noexcept + : span{other.data(), other.size()} + { + } + + constexpr span(span const& other) noexcept = default; + constexpr span(span&& other) noexcept = default; + + constexpr auto operator=(span const& other) noexcept -> span& = default; + constexpr auto operator=(span&& other) noexcept -> span& = default; + + constexpr auto begin() const noexcept -> iterator { return data(); } + + constexpr auto end() const noexcept -> iterator { return data() + size(); } + + constexpr auto cbegin() const noexcept -> const_iterator { return data(); } + + constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } + + __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator + { + return reverse_iterator{end()}; + } + + __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator + { + return reverse_iterator{begin()}; + } + + __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator + { + return const_reverse_iterator{cend()}; + } + + __host__ __device__ constexpr auto crend() const noexcept -> const_reverse_iterator + { + return const_reverse_iterator{cbegin()}; + } + + // element access + constexpr auto front() const -> reference { return (*this)[0]; } + + constexpr auto back() const -> reference { return (*this)[size() - 1]; } + + template + constexpr auto operator[](Index _idx) const -> reference + { + assert(static_cast(_idx) < size()); + return data()[_idx]; + } + + constexpr auto data() const noexcept -> pointer { return storage_.data(); } + + // Observers + [[nodiscard]] constexpr auto size() const noexcept -> size_type { return storage_.size(); } + [[nodiscard]] constexpr auto size_bytes() const noexcept -> size_type + { + return size() * sizeof(T); + } + + constexpr auto empty() const noexcept { return size() == 0; } + + // Subviews + template + constexpr auto first() 
const -> span + { + assert(Count <= size()); + return {data(), Count}; + } + + constexpr auto first(std::size_t _count) const -> span + { + assert(_count <= size()); + return {data(), _count}; + } + + template + constexpr auto last() const -> span + { + assert(Count <= size()); + return {data() + size() - Count, Count}; + } + + constexpr auto last(std::size_t _count) const -> span + { + assert(_count <= size()); + return subspan(size() - _count, _count); + } + + /*! + * If Count is std::dynamic_extent, r.size() == this->size() - Offset; + * Otherwise r.size() == Count. + */ + template + constexpr auto subspan() const + -> span::value> + { + assert((Count == dynamic_extent) ? (Offset <= size()) : (Offset + Count <= size())); + return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count}; + } + + constexpr auto subspan(size_type _offset, size_type _count = dynamic_extent) const + -> span + { + assert((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); + return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count}; + } + + private: + detail::span_storage storage_; + }; + +/** + * @brief A span class for host pointer. + */ + template + using host_span = span; + +/** + * @brief A span class for device pointer. 
+ */ + template + using device_span = span; + + template + constexpr auto operator==(span l, span r) -> bool +{ + if (l.size() != r.size()) { return false; } +for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend(); ++l_beg, ++r_beg) { +if (*l_beg != *r_beg) { return false; } +} +return true; +} + +template +constexpr auto operator!=(span l, span r) +{ + return !(l == r); +} + +template +constexpr auto operator<(span l, span r) +{ + return detail::lexicographical_compare< + typename span::iterator, + typename span::iterator, + thrust::less::element_type>>( + l.begin(), l.end(), r.begin(), r.end()); +} + +template +constexpr auto operator<=(span l, span r) +{ + return !(l > r); +} + +template +constexpr auto operator>(span l, span r) +{ + return detail::lexicographical_compare< + typename span::iterator, + typename span::iterator, + thrust::greater::element_type>>( + l.begin(), l.end(), r.begin(), r.end()); +} + +template +constexpr auto operator>=(span l, span r) +{ + return !(l < r); +} + +/** + * @brief Converts a span into a view of its underlying bytes + */ +template +auto as_bytes(span s) noexcept +-> span::value> +{ +return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +/** + * @brief Converts a span into a mutable view of its underlying bytes + */ +template +auto as_writable_bytes(span s) noexcept +-> span::value> +{ +return {reinterpret_cast(s.data()), s.size_bytes()}; +} +} // namespace raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 3a18d7e420..de298d98fc 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -15,395 +15,9 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
*/ -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - #pragma once - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -///@todo: enable once logging has been enabled in raft -//#include "logger.hpp" - -namespace raft { - -/** - * @brief Exception thrown when a CUDA error is encountered. - */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -/** - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an - * exception detailing the CUDA error that occurred - * - */ -#define RAFT_CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "CUDA error encountered at: ", \ - "call='%s', Reason=%s:%s", \ - #call, \ - cudaGetErrorName(status), \ - cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif - -/** - * @brief Debug macro to check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream - * before error checking. In both release and non-release builds, this macro - * checks for any pending CUDA errors from previous calls. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. 
It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - */ -#ifndef NDEBUG -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); -#else -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); -#endif - -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. -// */ -#define RAFT_CUDA_TRY_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(status)); \ - } \ - } while (0) - -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. - * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - -namespace raft { - -/** Helper method to get to know warp size in device code */ -__host__ __device__ constexpr inline int warp_size() { return 32; } - -__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to threads. 
- */ -class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to warps. 
- */ -class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping - * elements to blocks. 
- */ -class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; - -/** - * @brief Generic copy method for all kinds of transfers - * @tparam Type data type - * @param dst destination pointer - * @param src source pointer - * @param len lenth of the src/dst buffers in terms of number of elements - * @param stream cuda stream - */ -template -void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) -{ - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); -} - -/** - * @defgroup Copy Copy methods - * These are here along with the generic 'copy' method in order to improve - * code readability using explicitly specified function names - * @{ - */ -/** performs a host to device copy */ -template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(d_ptr, h_ptr, len, stream); -} - -/** performs a device to host copy */ -template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) -{ - copy(h_ptr, d_ptr, len, stream); -} - -template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view 
stream) -{ - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); -} -/** @} */ - -/** - * @defgroup Debug Utils for debugging host/device buffers - * @{ - */ -template -void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) -{ - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; -} - -template -void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) -{ - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; -} -/** @} */ - -/** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; -} - -/** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; -} - -/** helper method to convert an array on device to a string on host */ -template -std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) -{ - std::stringstream ss; - - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; - - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; - - free(arr_h); - - return ss.str(); -} - -/** 
this seems to be unused, but may be useful in the future */ -template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) -{ - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; -} - -inline uint32_t curTimeMillis() -{ - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); -} - -/** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } - -/** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ -template -bool is_aligned(Type* ptr, size_t alignment) -{ - return reinterpret_cast(ptr) % alignment == 0; -} - -/** calculate greatest common divisor of two numbers - * @a integer - * @b integer - * @ return gcd of a and b - */ -template -IntType gcd(IntType a, IntType b) -{ - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; -} - -} // namespace raft - -#endif +#include \ No newline at end of file diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 5e1aa3af28..dde6b127c9 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -15,167 +15,9 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/error.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
*/ -#ifndef __RAFT_RT_ERROR -#define __RAFT_RT_ERROR - #pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** base exception class for the whole of raft */ -class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { -#ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; - void* stack[kMaxStackDepth]; // NOLINT - auto depth = backtrace(stack, kMaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" << std::endl; - msg_ += oss.str(); - return; - } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; - } - free(strings); - msg_ += oss.str(); -#endif // __GNUC__ - } -}; - -/** - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS and RAFT_FAIL macros. 
- * - */ -struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -// FIXME: Need to be replaced with RAFT_FAIL -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - int size1 = \ - std::snprintf(nullptr, 0, "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), \ - size1 + 1 /* +1 for '\0' */, \ - "exception occured! file=%s line=%d: ", \ - __FILE__, \ - __LINE__); \ - std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - throw raft::exception(msg); \ - } while (0) - -// FIXME: Need to be replaced with RAFT_EXPECTS -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - -/** - * Macro to append error message to first argument. - * This should only be called in contexts where it is OK to throw exceptions! - */ -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ - do { \ - int size1 = std::snprintf(nullptr, 0, "%s", location_prefix); \ - int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__); \ - int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0 || size3 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix); \ - std::snprintf( \ - buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__, __LINE__); \ - std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - } while (0) - -/** - * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false - * - * @param[in] cond Expression that evaluates to true or false - * @param[in] fmt String literal description of the reason that cond is expected to be true with - * optinal format tagas - * @throw raft::logic_error if the condition evaluates to false. - */ -#define RAFT_EXPECTS(cond, fmt, ...) \ - do { \ - if (!(cond)) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } \ - } while (0) - -/** - * @brief Indicates that an erroneous code path has been taken. - * - * @param[in] fmt String literal description of the reason that this code path is erroneous with - * optinal format tagas - * @throw always throws raft::logic_error - */ -#define RAFT_FAIL(fmt, ...) 
\ - do { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } while (0) - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 158816f762..da80958ab0 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -15,330 +15,9 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/handle.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -#ifndef __RAFT_RT_HANDLE -#define __RAFT_RT_HANDLE - #pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -///@todo: enable once we have migrated cuml-comms layer too -//#include - -#include "cudart_utils.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** - * @brief Main handle object that stores all necessary context used for calling - * necessary cuda kernels and/or libraries - */ -class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; 
- }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - - /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view 
get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = 
true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } -}; // class handle_t - -/** - * @brief RAII approach to synchronizing across all streams in the handle - */ -class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; -}; // class stream_syncer - -} // namespace raft - -#endif \ No 
newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/interruptible.hpp b/cpp/include/raft/interruptible.hpp index 6764065363..cf067112b5 100644 --- a/cpp/include/raft/interruptible.hpp +++ b/cpp/include/raft/interruptible.hpp @@ -15,262 +15,9 @@ */ /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/interruptible.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -#ifndef __RAFT_RT_INTERRUPTIBLE_H -#define __RAFT_RT_INTERRUPTIBLE_H - #pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { - -/** - * @brief Exception thrown during `interruptible::synchronize` call when it detects a request - * to cancel the work performed in this CPU thread. - */ -struct interrupted_exception : public raft::exception { - using raft::exception::exception; -}; - -/** - * @brief Cooperative-style interruptible execution. - * - * This class provides facilities for interrupting execution of a C++ thread at designated points - * in code from outside of the thread. In particular, it provides an interruptible version of the - * blocking CUDA synchronization function, that allows dropping a long-running GPU work. - * - * - * **Important:** Although CUDA synchronize calls serve as cancellation points, the interruptible - * machinery has nothing to do with CUDA streams or events. In other words, when you call `cancel`, - * it’s the CPU waiting function what is interrupted, not the GPU stream work. This means, when the - * `interrupted_exception` is raised, any unfinished GPU stream work continues to run. It’s the - * responsibility of the developer then to make sure the unfinished stream work does not affect the - * program in an undesirable way. - * - * - * What can happen to CUDA stream when the `synchronize` is cancelled? 
If you catch the - * `interrupted_exception` immediately, you can safely wait on the stream again. - * Otherwise, some of the allocated resources may be released before the active kernel finishes - * using them, which will result in writing into deallocated or reallocated memory and undefined - * behavior in general. A dead-locked kernel may never finish (or may crash if you’re lucky). In - * practice, the outcome is usually acceptable for the use case of emergency program interruption - * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and - * resuming of the GPU stream work. - */ -class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } - - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } - - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. It's called in the internals of - * `interruptible::synchronize` in a loop. 
If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } - - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } - - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. - * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. 
- */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } - - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; - - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; - - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. 
- * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. - auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } - } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; - } - - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ - bool claimed_ = false; - - interruptible() noexcept { yield_no_throw_impl(); } - - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } - - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } - - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } -}; - -} // namespace raft - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h index 0281c5c667..8250ad4217 100644 --- a/cpp/include/raft/linalg/cublas_macros.h +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -13,114 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cublas_macros.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -#ifndef __RAFT_RT_CUBLAS_MACROS_H -#define __RAFT_RT_CUBLAS_MACROS_H #pragma once -#include -#include - -///@todo: enable this once we have logger enabled -//#include - -#include - -#define _CUBLAS_ERR_TO_STR(err) \ - case err: return #err - -namespace raft { - -/** - * @brief Exception thrown when a cuBLAS error is encountered. 
- */ -struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cublas_error_to_string(cublasStatus_t err) -{ - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg -} // namespace raft - -#undef _CUBLAS_ERR_TO_STR - -/** - * @brief Error checking macro for cuBLAS runtime API functions. - * - * Invokes a cuBLAS runtime API function call, if the call does not return - * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred - */ -#define RAFT_CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUBLAS_TRY -#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUBLAS_TRY_NO_THROW(call) \ - do { \ - cublasStatus_t const status = call; \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - } \ - } while (0) - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK -#define CUBLAS_CHECK(call) CUBLAS_TRY(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK_NO_THROW -#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) -#endif - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h index df27f7ce26..bdf1238f65 100644 --- a/cpp/include/raft/linalg/cusolver_macros.h +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -13,110 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cusolver_macros.hpp instead. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -#ifndef __RAFT_RT_CUSOLVER_MACROS_H -#define __RAFT_RT_CUSOLVER_MACROS_H #pragma once -#include -#include -///@todo: enable this once logging is enabled -//#include -#include -#include - -#define _CUSOLVER_ERR_TO_STR(err) \ - case err: return #err; - -namespace raft { - -/** - * @brief Exception thrown when a cuSOLVER error is encountered. 
- */ -struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { - -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; -} - -} // namespace linalg -} // namespace raft - -#undef _CUSOLVER_ERR_TO_STR - -/** - * @brief Error checking macro for cuSOLVER runtime API functions. - * - * Invokes a cuSOLVER runtime API function call, if the call does not return - * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred - */ -#define RAFT_CUSOLVER_TRY(call) \ - do { \ - cusolverStatus_t const status = (call); \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - throw raft::cusolver_error(msg); \ - } \ - } while (0) - -// FIXME: remove after consumer rename -#ifndef CUSOLVER_TRY -#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ - do { \ - cusolverStatus_t const status = call; \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - } \ - } while (0) - -// FIXME: remove after cuml rename -#ifndef CUSOLVER_CHECK -#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) -#endif - -#ifndef CUSOLVER_CHECK_NO_THROW -#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) -#endif - -#endif \ No newline at end of file +#include \ No newline at end of file diff --git a/cpp/include/raft/mdarray.hpp b/cpp/include/raft/mdarray.hpp index f92a0e5e59..30aed9b4e7 100644 --- a/cpp/include/raft/mdarray.hpp +++ b/cpp/include/raft/mdarray.hpp @@ -1,10 +1,3 @@ -/* - * Copyright (2019) Sandia Corporation - * - * The source code is licensed under the 3-clause BSD license found in the LICENSE file - * thirdparty/LICENSES/mdarray.license - */ - /* * Copyright (c) 2022, NVIDIA CORPORATION. * @@ -20,631 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once -#include -#include -#include -#include - -namespace raft { -/** - * @\brief C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. - */ -using layout_c_contiguous = detail::stdex::layout_right; - -/** - * @\brief F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. - */ -using layout_f_contiguous = detail::stdex::layout_left; - -/** - * @brief stdex::mdspan with device tag to avoid accessing incorrect memory location. - */ -template > -using device_mdspan = detail::stdex:: - mdspan>; - -/** - * @brief stdex::mdspan with host tag to avoid accessing incorrect memory location. 
- */ -template > -using host_mdspan = - detail::stdex::mdspan>; - -/** - * @brief Modified from the c++ mdarray proposal - * - * https://isocpp.org/files/papers/D1684R0.html - * - * mdarray is a container type for mdspan with similar template arguments. However there - * are some inconsistencies in between them. We have made some modificiations to fit our - * needs, which are listed below. - * - * - Layout policy is different, the mdarray in raft uses `stdex::extent` directly just - * like `mdspan`, while the `mdarray` in the reference implementation uses varidic - * template. - * - * - Most of the constructors from the reference implementation is removed to make sure - * CUDA stream is honorred. - * - * - unique_size is not implemented, which is still working in progress in the proposal - * - * - For container policy, we adopt the alternative approach documented in the proposal - * [sec 2.4.3], which requires an additional make_accessor method for it to be used in - * mdspan. The container policy reference implementation has multiple `access` methods - * that accommodate needs for both mdarray and mdspan. This is more difficult for us - * since the policy might contain states that are unwanted inside a CUDA kernel. Also, - * on host we return a proxy to the actual value as `device_ref` so different access - * methods will have different return type, which is less desirable. - * - * - For the above reasons, copying from other mdarray with different policy type is also - * removed. 
- */ -template -class mdarray { - static_assert(!std::is_const::value, - "Element type for container must not be const."); - - public: - using extents_type = Extents; - using layout_type = LayoutPolicy; - using mapping_type = typename layout_type::template mapping; - using element_type = ElementType; - - using value_type = std::remove_cv_t; - using index_type = std::size_t; - using difference_type = std::ptrdiff_t; - // Naming: ref impl: container_policy_type, proposal: container_policy - using container_policy_type = ContainerPolicy; - using container_type = typename container_policy_type::container_type; - - using pointer = typename container_policy_type::pointer; - using const_pointer = typename container_policy_type::const_pointer; - using reference = typename container_policy_type::reference; - using const_reference = typename container_policy_type::const_reference; - - private: - template , - typename container_policy_type::const_accessor_policy, - typename container_policy_type::accessor_policy>> - using view_type_impl = - std::conditional_t, - device_mdspan>; - - public: - /** - * \brief the mdspan type returned by view method. 
- */ - using view_type = view_type_impl; - using const_view_type = view_type_impl; - - public: - constexpr mdarray() noexcept(std::is_nothrow_default_constructible_v) - : cp_{rmm::cuda_stream_default}, c_{cp_.create(0)} {}; - constexpr mdarray(mdarray const&) noexcept(std::is_nothrow_copy_constructible_v) = - default; - constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = - default; - - constexpr auto operator =(mdarray const&) noexcept( - std::is_nothrow_copy_assignable::value) -> mdarray& = default; - constexpr auto operator =(mdarray&&) noexcept( - std::is_nothrow_move_assignable::value) -> mdarray& = default; - - ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; - -#ifndef RAFT_MDARRAY_CTOR_CONSTEXPR -#if !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) -// 11.0: -// Error: Internal Compiler Error (codegen): "there was an error in verifying the lgenfe output!" -// -// 11.2: -// Call parameter type does not match function signature! -// i8** null -// i8* %call14 = call i32 null(void (i8*)* null, i8* null, i8** null), !dbg !1060 -// : parse Invalid record (Producer: 'LLVM7.0.1' Reader: 'LLVM 7.0.1') -#define RAFT_MDARRAY_CTOR_CONSTEXPR constexpr -#else -#define RAFT_MDARRAY_CTOR_CONSTEXPR -#endif // !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) -#endif // RAFT_MDARRAY_CTOR_CONSTEXPR - - /** - * @brief The only constructor that can create storage, this is to make sure CUDA stream is being - * used. - */ - RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type const& cp) - : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) - { - } - RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type& cp) - : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) - { - } - -#undef RAFT_MDARRAY_CTOR_CONSTEXPR - - /** - * @brief Get a mdspan that can be passed down to CUDA kernels. 
- */ - auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } - /** - * @brief Get a mdspan that can be passed down to CUDA kernels. - */ - auto view() const noexcept - { - return const_view_type(c_.data(), map_, cp_.make_accessor_policy()); - } - - [[nodiscard]] constexpr auto size() const noexcept -> index_type { return this->view().size(); } - - [[nodiscard]] auto data() noexcept -> pointer { return c_.data(); } - [[nodiscard]] constexpr auto data() const noexcept -> const_pointer { return c_.data(); } - - /** - * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. - */ - template - auto operator()(IndexType&&... indices) - -> std::enable_if_t && ...) && - std::is_constructible_v && - std::is_constructible_v, - /* device policy is not default constructible due to requirement for CUDA - stream. */ - /* std::is_default_constructible_v */ - reference> - { - return cp_.access(c_, map_(std::forward(indices)...)); - } - - /** - * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. - */ - template - auto operator()(IndexType&&... indices) const - -> std::enable_if_t && ...) && - std::is_constructible_v && - std::is_constructible::value, - /* device policy is not default constructible due to requirement for CUDA - stream. 
*/ - /* std::is_default_constructible_v */ - const_reference> - { - return cp_.access(c_, map_(std::forward(indices)...)); - } - - // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> index_type - { - return extents_type::rank(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> index_type - { - return extents_type::rank_dynamic(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept - -> index_type - { - return extents_type::static_extent(r); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type - { - return map_.extents(); - } - /** - * @brief the extent of rank r - */ - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type - { - return map_.extents().extent(r); - } - // mapping - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type - { - return map_; - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool - { - return map_.is_unique(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_contiguous() const noexcept -> bool - { - return map_.is_contiguous(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool - { - return map_.is_strided(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type - { - return map_.stride(r); - } - - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool - { - return mapping_type::is_always_unique(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_contiguous() noexcept -> bool - { - return mapping_type::is_always_contiguous(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() 
noexcept -> bool - { - return mapping_type::is_always_strided(); - } - - private: - template - friend class mdarray; - - private: - container_policy_type cp_; - mapping_type map_; - container_type c_; -}; - -/** - * @brief mdarray with host container policy - * @tparam ElementType the data type of the elements - * @tparam Extents defines the shape - * @tparam LayoutPolicy policy for indexing strides and layout ordering - * @tparam ContainerPolicy storage and accessor policy - */ -template > -using host_mdarray = - mdarray>; - -/** - * @brief mdarray with device container policy - * @tparam ElementType the data type of the elements - * @tparam Extents defines the shape - * @tparam LayoutPolicy policy for indexing strides and layout ordering - * @tparam ContainerPolicy storage and accessor policy - */ -template > -using device_mdarray = - mdarray>; - -/** - * @brief Shorthand for 0-dim host mdarray (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using host_scalar = host_mdarray; - -/** - * @brief Shorthand for 0-dim host mdarray (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using device_scalar = device_mdarray; - -/** - * @brief Shorthand for 1-dim host mdarray. - * @tparam ElementType the data type of the vector elements - */ -template -using host_vector = host_mdarray; - -/** - * @brief Shorthand for 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - */ -template -using device_vector = device_mdarray; - -/** - * @brief Shorthand for c-contiguous host matrix. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using host_matrix = host_mdarray; - -/** - * @brief Shorthand for c-contiguous device matrix. 
- * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - */ -template -using device_matrix = device_mdarray; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using host_scalar_view = host_mdspan; - -/** - * @brief Shorthand for 0-dim host mdspan (scalar). - * @tparam ElementType the data type of the scalar element - */ -template -using device_scalar_view = device_mdspan; - -/** - * @brief Shorthand for 1-dim host mdspan. - * @tparam ElementType the data type of the vector elements - */ -template -using host_vector_view = host_mdspan; - -/** - * @brief Shorthand for 1-dim device mdspan. - * @tparam ElementType the data type of the vector elements - */ -template -using device_vector_view = device_mdspan; - -/** - * @brief Shorthand for c-contiguous host matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * - */ -template -using host_matrix_view = host_mdspan; - -/** - * @brief Shorthand for c-contiguous device matrix view. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * - */ -template -using device_matrix_view = device_mdspan; - -/** - * @brief Create a 0-dim (scalar) mdspan instance for host value. - * - * @tparam ElementType the data type of the matrix elements - * @param[in] ptr on device to wrap - */ -template -auto make_host_scalar_view(ElementType* ptr) -{ - detail::scalar_extent extents; - return host_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 0-dim (scalar) mdspan instance for device value. 
- * - * @tparam ElementType the data type of the matrix elements - * @param[in] ptr on device to wrap - */ -template -auto make_device_scalar_view(ElementType* ptr) -{ - detail::scalar_extent extents; - return device_scalar_view{ptr, extents}; -} - -/** - * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on host to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) -{ - detail::matrix_extent extents{n_rows, n_cols}; - return host_matrix_view{ptr, extents}; -} -/** - * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. It's - * expected that the given layout policy match the layout of the underlying - * pointer. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] ptr on device to wrap - * @param[in] n_rows number of rows in pointer - * @param[in] n_cols number of columns in pointer - */ -template -auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) -{ - detail::matrix_extent extents{n_rows, n_cols}; - return device_matrix_view{ptr, extents}; -} /** - * @brief Create a 1-dim mdspan instance for host pointer. - * @tparam ElementType the data type of the vector elements - * @param[in] ptr on host to wrap - * @param[in] n number of elements in pointer - * @return raft::host_vector_view + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
*/ -template -auto make_host_vector_view(ElementType* ptr, size_t n) -{ - detail::vector_extent extents{n}; - return host_matrix_view{ptr, extents}; -} -/** - * @brief Create a 1-dim mdspan instance for device pointer. - * @tparam ElementType the data type of the vector elements - * @param[in] ptr on device to wrap - * @param[in] n number of elements in pointer - * @return raft::device_vector_view - */ -template -auto make_device_vector_view(ElementType* ptr, size_t n) -{ - detail::vector_extent extents{n}; - return device_matrix_view{ptr, extents}; -} - -/** - * @brief Create a 2-dim c-contiguous host mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @return raft::host_matrix - */ -template -auto make_host_matrix(size_t n_rows, size_t n_cols) -{ - detail::matrix_extent extents{n_rows, n_cols}; - using policy_t = typename host_matrix::container_policy_type; - policy_t policy; - return host_matrix{extents, policy}; -} - -/** - * @brief Create a 2-dim c-contiguous device mdarray. - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @param[in] stream cuda stream for ordering events - * @return raft::device_matrix - */ -template -auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream) -{ - detail::matrix_extent extents{n_rows, n_cols}; - using policy_t = typename device_matrix::container_policy_type; - policy_t policy{stream}; - return device_matrix{extents, policy}; -} - -/** - * @brief Create a 2-dim c-contiguous device mdarray. 
- * - * @tparam ElementType the data type of the matrix elements - * @tparam LayoutPolicy policy for strides and layout ordering - * @param[in] handle raft handle for managing expensive resources - * @param[in] n_rows number or rows in matrix - * @param[in] n_cols number of columns in matrix - * @return raft::device_matrix - */ -template -auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols) -{ - return make_device_matrix(n_rows, n_cols, handle.get_stream()); -} - -/** - * @brief Create a host scalar from v. - * - * @tparam ElementType the data type of the scalar element - * @param[in] v scalar type to wrap - * @return raft::host_scalar - */ -template -auto make_host_scalar(ElementType const& v) -{ - // FIXME(jiamingy): We can optimize this by using std::array as container policy, which - // requires some more compile time dispatching. This is enabled in the ref impl but - // hasn't been ported here yet. - detail::scalar_extent extents; - using policy_t = typename host_scalar::container_policy_type; - policy_t policy; - auto scalar = host_scalar{extents, policy}; - scalar(0) = v; - return scalar; -} - -/** - * @brief Create a device scalar from v. - * - * @tparam ElementType the data type of the scalar element - * @param[in] v scalar type to wrap on device - * @param[in] stream the cuda stream for ordering events - * @return raft::device_scalar - */ -template -auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream) -{ - detail::scalar_extent extents; - using policy_t = typename device_scalar::container_policy_type; - policy_t policy{stream}; - auto scalar = device_scalar{extents, policy}; - scalar(0) = v; - return scalar; -} - -/** - * @brief Create a device scalar from v. 
- * - * @tparam ElementType the data type of the scalar element - * @param[in] handle raft handle for managing expensive cuda resources - * @param[in] v scalar to wrap on device - * @return raft::device_scalar - */ -template -auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) -{ - return make_device_scalar(v, handle.get_stream()); -} - -/** - * @brief Create a 1-dim host mdarray. - * @tparam ElementType the data type of the vector elements - * @param[in] n number of elements in vector - * @return raft::host_vector - */ -template -auto make_host_vector(size_t n) -{ - detail::vector_extent extents{n}; - using policy_t = typename host_vector::container_policy_type; - policy_t policy; - return host_vector{extents, policy}; -} - -/** - * @brief Create a 1-dim device mdarray. - * @tparam ElementType the data type of the vector elements - * @param[in] n number of elements in vector - * @param[in] stream the cuda stream for ordering events - * @return raft::device_vector - */ -template -auto make_device_vector(size_t n, rmm::cuda_stream_view stream) -{ - detail::vector_extent extents{n}; - using policy_t = typename device_vector::container_policy_type; - policy_t policy{stream}; - return device_vector{extents, policy}; -} - -/** - * @brief Create a 1-dim device mdarray. 
- * @tparam ElementType the data type of the vector elements - * @param[in] handle raft handle for managing expensive cuda resources - * @param[in] n number of elements in vector - * @return raft::device_vector - */ -template -auto make_device_vector(raft::handle_t const& handle, size_t n) -{ - return make_device_vector(n, handle.get_stream()); -} -} // namespace raft +#pragma once +#include \ No newline at end of file diff --git a/cpp/include/raft/span.hpp b/cpp/include/raft/span.hpp index b4fbf5b63a..ba7b393ca7 100644 --- a/cpp/include/raft/span.hpp +++ b/cpp/include/raft/span.hpp @@ -13,270 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once - -#include -#include // size_t -#include // std::byte -#include -#include -#include // __host__ __device__ -#include -#include - -namespace raft { -/** - * @brief The span class defined in ISO C++20. Iterator is defined as plain pointer and - * most of the methods have bound check on debug build. - * - * @code - * rmm::device_uvector uvec(10, rmm::cuda_stream_default); - * auto view = device_span{uvec.data(), uvec.size()}; - * @endcode - */ -template -class span { - public: - using element_type = T; - using value_type = typename std::remove_cv::type; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using pointer = T*; - using const_pointer = T const*; - using reference = T&; - using const_reference = T const&; - - using iterator = pointer; - using const_iterator = const_pointer; - using reverse_iterator = thrust::reverse_iterator; - using const_reverse_iterator = thrust::reverse_iterator; - - /** - * @brief Default constructor that constructs a span with size 0 and nullptr. 
- */ - constexpr span() noexcept = default; - - /** - * @brief Constructs a span that is a view over the range [first, first + count); - */ - constexpr span(pointer ptr, size_type count) noexcept : storage_{ptr, count} - { - assert(!(Extent != dynamic_extent && count != Extent)); - assert(ptr || count == 0); - } - /** - * @brief Constructs a span that is a view over the range [first, last) - */ - constexpr span(pointer first, pointer last) noexcept - : span{first, static_cast(thrust::distance(first, last))} - { - } - /** - * @brief Constructs a span that is a view over the array arr. - */ - template - constexpr span(element_type (&arr)[N]) noexcept : span{&arr[0], N} - { - } - - /** - * @brief Initialize a span class from another one who's underlying type is convertible - * to element_type. - */ - template ::value && - detail::is_allowed_extent_conversion_t::value>> - constexpr span(const span& other) noexcept - : span{other.data(), other.size()} - { - } - - constexpr span(span const& other) noexcept = default; - constexpr span(span&& other) noexcept = default; - - constexpr auto operator=(span const& other) noexcept -> span& = default; - constexpr auto operator=(span&& other) noexcept -> span& = default; - - constexpr auto begin() const noexcept -> iterator { return data(); } - - constexpr auto end() const noexcept -> iterator { return data() + size(); } - - constexpr auto cbegin() const noexcept -> const_iterator { return data(); } - - constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } - - __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator - { - return reverse_iterator{end()}; - } - - __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator - { - return reverse_iterator{begin()}; - } - - __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator - { - return const_reverse_iterator{cend()}; - } - - __host__ __device__ constexpr auto crend() const noexcept -> 
const_reverse_iterator - { - return const_reverse_iterator{cbegin()}; - } - - // element access - constexpr auto front() const -> reference { return (*this)[0]; } - - constexpr auto back() const -> reference { return (*this)[size() - 1]; } - - template - constexpr auto operator[](Index _idx) const -> reference - { - assert(static_cast(_idx) < size()); - return data()[_idx]; - } - - constexpr auto data() const noexcept -> pointer { return storage_.data(); } - - // Observers - [[nodiscard]] constexpr auto size() const noexcept -> size_type { return storage_.size(); } - [[nodiscard]] constexpr auto size_bytes() const noexcept -> size_type - { - return size() * sizeof(T); - } - - constexpr auto empty() const noexcept { return size() == 0; } - - // Subviews - template - constexpr auto first() const -> span - { - assert(Count <= size()); - return {data(), Count}; - } - - constexpr auto first(std::size_t _count) const -> span - { - assert(_count <= size()); - return {data(), _count}; - } - - template - constexpr auto last() const -> span - { - assert(Count <= size()); - return {data() + size() - Count, Count}; - } - - constexpr auto last(std::size_t _count) const -> span - { - assert(_count <= size()); - return subspan(size() - _count, _count); - } - - /*! - * If Count is std::dynamic_extent, r.size() == this->size() - Offset; - * Otherwise r.size() == Count. - */ - template - constexpr auto subspan() const - -> span::value> - { - assert((Count == dynamic_extent) ? (Offset <= size()) : (Offset + Count <= size())); - return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count}; - } - - constexpr auto subspan(size_type _offset, size_type _count = dynamic_extent) const - -> span - { - assert((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); - return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count}; - } - - private: - detail::span_storage storage_; -}; - -/** - * @brief A span class for host pointer. 
- */ -template -using host_span = span; /** - * @brief A span class for device pointer. + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -template -using device_span = span; -template -constexpr auto operator==(span l, span r) -> bool -{ - if (l.size() != r.size()) { return false; } - for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend(); ++l_beg, ++r_beg) { - if (*l_beg != *r_beg) { return false; } - } - return true; -} - -template -constexpr auto operator!=(span l, span r) -{ - return !(l == r); -} - -template -constexpr auto operator<(span l, span r) -{ - return detail::lexicographical_compare< - typename span::iterator, - typename span::iterator, - thrust::less::element_type>>( - l.begin(), l.end(), r.begin(), r.end()); -} - -template -constexpr auto operator<=(span l, span r) -{ - return !(l > r); -} - -template -constexpr auto operator>(span l, span r) -{ - return detail::lexicographical_compare< - typename span::iterator, - typename span::iterator, - thrust::greater::element_type>>( - l.begin(), l.end(), r.begin(), r.end()); -} - -template -constexpr auto operator>=(span l, span r) -{ - return !(l < r); -} - -/** - * @brief Converts a span into a view of its underlying bytes - */ -template -auto as_bytes(span s) noexcept - -> span::value> -{ - return {reinterpret_cast(s.data()), s.size_bytes()}; -} - -/** - * @brief Converts a span into a mutable view of its underlying bytes - */ -template -auto as_writable_bytes(span s) noexcept - -> span::value> -{ - return {reinterpret_cast(s.data()), s.size_bytes()}; -} -} // namespace raft +#pragma once +#include \ No newline at end of file diff --git a/cpp/include/raft/sparse/detail/cusparse_macros.h b/cpp/include/raft/sparse/detail/cusparse_macros.h index 10c7e8836c..e7d81f51aa 100644 --- a/cpp/include/raft/sparse/detail/cusparse_macros.h +++ b/cpp/include/raft/sparse/detail/cusparse_macros.h @@ -13,111 +13,11 @@ * See the License for the 
specific language governing permissions and * limitations under the License. */ - -#pragma once - -#include -#include -///@todo: enable this once logging is enabled -//#include - -#define _CUSPARSE_ERR_TO_STR(err) \ - case err: return #err; - -// Notes: -//(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; -//(2.) to enforce a lower version, -// -//`#define CUDA_ENFORCE_LOWER -// #include ` -// -// (i.e., before including this header) -// -#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) - -namespace raft { - -/** - * @brief Exception thrown when a cuSparse error is encountered. - */ -struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) : raft::exception(message) {} - explicit cusparse_error(std::string const& message) : raft::exception(message) {} -}; - -namespace sparse { -namespace detail { - -inline const char* cusparse_error_to_string(cusparseStatus_t err) -{ -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 - return cusparseGetErrorString(err); -#else // CUDART_VERSION - switch (err) { - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: return "CUSPARSE_STATUS_UNKNOWN"; - }; -#endif // CUDART_VERSION -} - -} // namespace detail -} // namespace sparse -} // namespace raft - -#undef _CUSPARSE_ERR_TO_STR - /** - * @brief Error checking macro for cuSparse runtime API functions. 
- * - * Invokes a cuSparse runtime API function call, if the call does not return - * CUSPARSE_STATUS_SUCCESS, throws an exception detailing the cuSparse error that occurred + * This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. */ -#define RAFT_CUSPARSE_TRY(call) \ - do { \ - cusparseStatus_t const status = (call); \ - if (CUSPARSE_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::sparse::detail::cusparse_error_to_string(status)); \ - throw raft::cusparse_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_TRY -#define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call) -#endif -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_CHECK -#define CUSPARSE_CHECK(call) CUSPARSE_TRY(call) -#endif - -//@todo: use logger here once logging is enabled -/** check for cusparse runtime API errors but do not assert */ -#define RAFT_CUSPARSE_TRY_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ - #call, \ - err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ - } while (0) +#pragma once -// FIXME: Remove after consumer rename -#ifndef CUSPARSE_CHECK_NO_THROW -#define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call) -#endif +#include \ No newline at end of file From 6eddecdfea122140c36ed77789467531795bcaff Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 24 Mar 2022 16:09:11 -0400 Subject: [PATCH 149/167] Updating style --- cpp/include/raft/common/nvtx.hpp | 2 +- cpp/include/raft/core/comms.hpp | 1136 ++++++++++----------- cpp/include/raft/core/cublas_macros.hpp | 54 +- cpp/include/raft/core/cudart_utils.hpp | 384 +++---- cpp/include/raft/core/cusolver_macros.hpp | 50 +- cpp/include/raft/core/cusparse_macros.hpp | 46 +- cpp/include/raft/core/error.hpp | 70 +- cpp/include/raft/core/handle.hpp | 556 +++++----- cpp/include/raft/core/interruptible.hpp | 378 +++---- cpp/include/raft/core/logger.hpp | 312 +++--- cpp/include/raft/core/mdarray.hpp | 686 ++++++------- cpp/include/raft/core/nvtx.hpp | 86 +- cpp/include/raft/core/span.hpp | 386 +++---- cpp/include/raft/linalg/cublas_macros.h | 1 - cpp/include/raft/linalg/cusolver_macros.h | 1 - 15 files changed, 2073 insertions(+), 2075 deletions(-) diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp index 6125c937ea..f5c7527580 100644 --- a/cpp/include/raft/common/nvtx.hpp +++ b/cpp/include/raft/common/nvtx.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp index b93924b3d2..bf2f7af777 100644 --- a/cpp/include/raft/core/comms.hpp +++ b/cpp/include/raft/core/comms.hpp @@ -21,613 +21,613 @@ #include namespace raft { - namespace comms { +namespace comms { - typedef unsigned int request_t; - enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; - enum class op_t { SUM, PROD, MIN, MAX }; +typedef unsigned int request_t; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class op_t { SUM, PROD, MIN, MAX }; /** * The resulting status of distributed stream synchronization */ - enum class status_t { - SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status - ABORT // A failure occurred in sync, queued operations aborted - }; +enum class status_t { + SUCCESS, // Synchronization successful + ERROR, // An error occured querying sync status + ABORT // A failure occurred in sync, queued operations aborted +}; - template - constexpr datatype_t +template +constexpr datatype_t - get_type(); +get_type(); - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::CHAR; - } +get_type() +{ + return datatype_t::CHAR; +} - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::UINT8; - } +get_type() +{ + return datatype_t::UINT8; +} - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::INT32; - } +get_type() +{ + return datatype_t::INT32; +} - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::UINT32; - } +get_type() +{ + return datatype_t::UINT32; +} - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::INT64; - } +get_type() +{ + return datatype_t::INT64; +} - template <> - constexpr 
datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::UINT64; - } +get_type() +{ + return datatype_t::UINT64; +} - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::FLOAT32; - } +get_type() +{ + return datatype_t::FLOAT32; +} - template <> - constexpr datatype_t +template <> +constexpr datatype_t - get_type() - { - return datatype_t::FLOAT64; - } +get_type() +{ + return datatype_t::FLOAT64; +} - class comms_iface { - public: - virtual ~comms_iface() {} +class comms_iface { + public: + virtual ~comms_iface() {} - virtual int get_size() const = 0; + virtual int get_size() const = 0; - virtual int get_rank() const = 0; + virtual int get_rank() const = 0; - virtual std::unique_ptr comm_split(int color, int key) const = 0; + virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual void barrier() const = 0; + virtual void barrier() const = 0; - virtual status_t sync_stream(cudaStream_t stream) const = 0; + virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; - virtual void waitall(int count, request_t array_of_requests[]) const = 0; + virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; - virtual void bcast( - void* buff, size_t count, datatype_t datatype, 
int root, cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void bcast(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - int root, + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const = 0; + + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const = 0; + + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void reduce(const void* 
sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const = 0; - - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const = 0; - - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; - - // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, void* recvbuf, - size_t recvsize, - int source, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, cudaStream_t stream) const = 0; - - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& 
recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; - }; - - class comms_t { - public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { - ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); - } - - /** - * Virtual Destructor to enable polymorphism - */ - virtual ~comms_t() {} - - /** - * Returns the size of the communicator clique - */ - - int get_size() const { return impl_->get_size(); } - - /** - * Returns the local rank - */ - int get_rank() const { return impl_->get_rank(); } - - /** - * Splits the current communicator clique into sub-cliques matching - * the given color and key - * - * @param color ranks w/ the same color are placed in the same communicator - * @param key controls rank assignment - */ - std::unique_ptr comm_split(int color, int key) const - { - return impl_->comm_split(color, key); - } - - /** - * Performs a collective barrier synchronization - */ - void barrier() const { impl_->barrier(); } - - /** - * Some collective communications implementations (eg. NCCL) might use asynchronous - * collectives that are explicitly synchronized. It's important to always synchronize - * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, - * to prevent the potential for deadlocks. - * - * @param stream the cuda stream to sync collective operations on - */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } - - /** - * Performs an asynchronous point-to-point send - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param tag a tag to use for the receiver to filter - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). 
- */ - template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); - } - - /** - * Performs an asynchronous point-to-point receive - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param tag a tag to use for message filtering - * @param request pointer to hold returned request_t object. - * This will be used in `waitall()` to synchronize until the message is delivered (or fails). - */ - template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); - } - - /** - * Synchronize on an array of request_t objects returned from isend/irecv - * @param count number of requests to synchronize on - * @param array_of_requests an array of request_t objects returned from isend/irecv - */ - void waitall(int count, request_t array_of_requests[]) const - { - impl_->waitall(count, array_of_requests); - } - - /** - * Perform an allreduce collective - * @tparam value_t datatype of underlying buffers - * @param sendbuff data to reduce - * @param recvbuff buffer to hold the reduced result - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { - impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param buff buffer to send - * @param count number of elements if buff - * @param root the rank initiating the broadcast - 
* @param stream CUDA stream to synchronize operation - */ - template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); - } - - /** - * Broadcast data from one rank to the rest - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to broadcast (only used in root) - * @param recvbuff buffer to receive broadcasted data - * @param count number of elements if buff - * @param root the rank initiating the broadcast - * @param stream CUDA stream to synchronize operation - */ - template - void bcast( - const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - root, - stream); - } - - /** - * Reduce data from many ranks down to a single rank - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to reduce - * @param recvbuff buffer containing reduced data (only needs to be initialized on root) - * @param count number of elements in sendbuff - * @param op reduction operation to perform - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { - impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param stream CUDA stream to synchronize operation - */ - template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - 
cudaStream_t stream) const - { - impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param recvcounts pointer to an array (of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param stream CUDA stream to synchronize operation - */ - template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { - impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); - } - - /** - * Gathers data from each rank onto all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to gather - * @param recvbuff buffer containing gathered data from all ranks - * @param sendcount number of elements in send buffer - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { - impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); - } - - /** - * Gathers data from all ranks and delivers to combined data to all ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuf buffer containing data to send - * @param recvbuf buffer containing data to receive - * @param sendcount number of elements in send buffer - * @param recvcounts pointer to an array 
(of length num_ranks size) containing the number of - * elements that are to be received from each rank - * @param displs pointer to an array (of length num_ranks size) to specify the displacement - * (relative to recvbuf) at which to place the incoming data from each rank - * @param root rank to store the results - * @param stream CUDA stream to synchronize operation - */ - template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { - impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, - get_type(), - root, - stream); - } - - /** - * Reduces data from all ranks then scatters the result across ranks - * @tparam value_t datatype of underlying buffers - * @param sendbuff buffer containing data to send (size recvcount * num_ranks) - * @param recvbuff buffer containing received data - * @param recvcount number of items to receive - * @param op reduction operation to perform - * @param stream CUDA stream to synchronize operation - */ - template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { - impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); - } - - /** - * Performs a point-to-point send - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
- * - * @tparam value_t the type of data to send - * @param buf pointer to array of data to send - * @param size number of elements in buf - * @param dest destination rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); - } - - /** - * Performs a point-to-point receive - * - * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. - * - * @tparam value_t the type of data to be received - * @param buf pointer to (initialized) array that will hold received data - * @param size number of elements in buf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); - } - - /** - * Performs a point-to-point send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsize number of elements in sendbuf - * @param dest destination rank - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsize number of elements in recvbuf - * @param source source rank - * @param stream CUDA stream to synchronize operation - */ - template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, +}; + +class comms_t { + public: + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { + ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); + } + + /** + * Virtual Destructor to enable polymorphism + */ + virtual ~comms_t() {} + + /** + * Returns the size of the communicator clique + */ + + int get_size() const { return impl_->get_size(); } + + /** + * Returns the local rank + */ + int get_rank() const { 
return impl_->get_rank(); } + + /** + * Splits the current communicator clique into sub-cliques matching + * the given color and key + * + * @param color ranks w/ the same color are placed in the same communicator + * @param key controls rank assignment + */ + std::unique_ptr comm_split(int color, int key) const + { + return impl_->comm_split(color, key); + } + + /** + * Performs a collective barrier synchronization + */ + void barrier() const { impl_->barrier(); } + + /** + * Some collective communications implementations (eg. NCCL) might use asynchronous + * collectives that are explicitly synchronized. It's important to always synchronize + * using this method to allow failures to propagate, rather than `cudaStreamSynchronize()`, + * to prevent the potential for deadlocks. + * + * @param stream the cuda stream to sync collective operations on + */ + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + + /** + * Performs an asynchronous point-to-point send + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param tag a tag to use for the receiver to filter + * @param request pointer to hold returned request_t object. + * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + } + + /** + * Performs an asynchronous point-to-point receive + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param tag a tag to use for message filtering + * @param request pointer to hold returned request_t object. 
+ * This will be used in `waitall()` to synchronize until the message is delivered (or fails). + */ + template + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + } + + /** + * Synchronize on an array of request_t objects returned from isend/irecv + * @param count number of requests to synchronize on + * @param array_of_requests an array of request_t objects returned from isend/irecv + */ + void waitall(int count, request_t array_of_requests[]) const + { + impl_->waitall(count, array_of_requests); + } + + /** + * Perform an allreduce collective + * @tparam value_t datatype of underlying buffers + * @param sendbuff data to reduce + * @param recvbuff buffer to hold the reduced result + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { + impl_->allreduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param buff buffer to send + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); + } + + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param 
stream CUDA stream to synchronize operation + */ + template + void bcast( + const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + root, + stream); + } + + /** + * Reduce data from many ranks down to a single rank + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to reduce + * @param recvbuff buffer containing reduced data (only needs to be initialized on root) + * @param count number of elements in sendbuff + * @param op reduction operation to perform + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { + impl_->reduce(static_cast(sendbuff), + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param stream CUDA stream to synchronize operation + */ + template + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { + impl_->allgather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of 
length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param stream CUDA stream to synchronize operation + */ + template + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { + impl_->allgatherv(static_cast(sendbuf), + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); + } + + /** + * Gathers data from each rank onto all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to gather + * @param recvbuff buffer containing gathered data from all ranks + * @param sendcount number of elements in send buffer + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { + impl_->gather(static_cast(sendbuff), + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); + } + + /** + * Gathers data from all ranks and delivers to combined data to all ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuf buffer containing data to send + * @param recvbuf buffer containing data to receive + * @param sendcount number of elements in send buffer + * @param recvcounts pointer to an array (of length num_ranks size) containing the number of + * elements that are to be received from each rank + * @param displs pointer to an array (of length num_ranks size) to specify the displacement + * (relative to recvbuf) at which to place the incoming data from each rank + * @param root rank to store the results + * @param stream CUDA stream to synchronize operation + */ + template + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { + 
impl_->gatherv(static_cast(sendbuf), + static_cast(recvbuf), + sendcount, + recvcounts, + displs, + get_type(), + root, + stream); + } + + /** + * Reduces data from all ranks then scatters the result across ranks + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to send (size recvcount * num_ranks) + * @param recvbuff buffer containing received data + * @param recvcount number of items to receive + * @param op reduction operation to perform + * @param stream CUDA stream to synchronize operation + */ + template + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { + impl_->reducescatter(static_cast(sendbuff), + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); + } + + /** + * Performs a point-to-point send + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. + * + * @tparam value_t the type of data to send + * @param buf pointer to array of data to send + * @param size number of elements in buf + * @param dest destination rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + } + + /** + * Performs a point-to-point receive + * + * if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock. 
+ * + * @tparam value_t the type of data to be received + * @param buf pointer to (initialized) array that will hold received data + * @param size number of elements in buf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + } + + /** + * Performs a point-to-point send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsize number of elements in sendbuf + * @param dest destination rank + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsize number of elements in recvbuf + * @param source source rank + * @param stream CUDA stream to synchronize operation + */ + template + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); + } + + /** + * Performs a multicast send/receive + * + * @tparam value_t the type of data to be sent & received + * @param sendbuf pointer to array of data to send + * @param sendsizes numbers of elements to send + * @param sendoffsets offsets in a number of elements from sendbuf + * @param dests destination ranks + * @param recvbuf pointer to (initialized) array that will hold received data + * @param recvsizes numbers of elements to recv + * @param recvoffsets offsets in a number of elements from recvbuf + * @param sources source ranks + * @param stream CUDA stream to synchronize operation + */ + template + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + 
std::vector const& dests, value_t* recvbuf, - size_t recvsize, - int source, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); - } - - /** - * Performs a multicast send/receive - * - * @tparam value_t the type of data to be sent & received - * @param sendbuf pointer to array of data to send - * @param sendsizes numbers of elements to send - * @param sendoffsets offsets in a number of elements from sendbuf - * @param dests destination ranks - * @param recvbuf pointer to (initialized) array that will hold received data - * @param recvsizes numbers of elements to recv - * @param recvoffsets offsets in a number of elements from recvbuf - * @param sources source ranks - * @param stream CUDA stream to synchronize operation - */ - template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; - auto sendbyteoffsets = sendoffsets; - for (size_t i = 0; i < sendsizes.size(); ++i) { - sendbytesizes[i] *= sizeof(value_t); - sendbyteoffsets[i] *= sizeof(value_t); - } - auto recvbytesizes = recvsizes; - auto recvbyteoffsets = recvoffsets; - for (size_t i = 0; i < recvsizes.size(); ++i) { - recvbytesizes[i] *= sizeof(value_t); - recvbyteoffsets[i] *= sizeof(value_t); - } - impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); - } - - private: - std::unique_ptr impl_; - }; - - } // namespace comms + { + auto sendbytesizes = sendsizes; + auto 
sendbyteoffsets = sendoffsets; + for (size_t i = 0; i < sendsizes.size(); ++i) { + sendbytesizes[i] *= sizeof(value_t); + sendbyteoffsets[i] *= sizeof(value_t); + } + auto recvbytesizes = recvsizes; + auto recvbyteoffsets = recvoffsets; + for (size_t i = 0; i < recvsizes.size(); ++i) { + recvbytesizes[i] *= sizeof(value_t); + recvbyteoffsets[i] *= sizeof(value_t); + } + impl_->device_multicast_sendrecv(static_cast(sendbuf), + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); + } + + private: + std::unique_ptr impl_; +}; + +} // namespace comms } // namespace raft diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp index 5a96444e45..0281c5c667 100644 --- a/cpp/include/raft/core/cublas_macros.hpp +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -40,33 +40,33 @@ namespace raft { /** * @brief Exception thrown when a cuBLAS error is encountered. */ - struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} - }; - - namespace linalg { - namespace detail { - - inline const char* cublas_error_to_string(cublasStatus_t err) - { - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; - } - - } // namespace detail - } // namespace linalg +struct cublas_error : public raft::exception { + explicit 
cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { +namespace detail { + +inline const char* cublas_error_to_string(cublasStatus_t err) +{ + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; +} + +} // namespace detail +} // namespace linalg } // namespace raft #undef _CUBLAS_ERR_TO_STR diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index ecc840b77e..3a18d7e420 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -46,10 +46,10 @@ namespace raft { /** * @brief Exception thrown when a CUDA error is encountered. 
*/ - struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} - }; +struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} +}; } // namespace raft @@ -141,99 +141,99 @@ namespace raft { namespace raft { /** Helper method to get to know warp size in device code */ - __host__ __device__ constexpr inline int warp_size() { return 32; } +__host__ __device__ constexpr inline int warp_size() { return 32; } - __host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to threads. */ - class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { 
- RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } - }; +class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } +}; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to warps. 
*/ - class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } - }; +class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; /** * @brief A kernel grid configuration construction 
gadget for simple one-dimensional mapping * elements to blocks. */ - class grid_1d_block_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } - }; +class grid_1d_block_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } +}; /** * @brief Generic copy method for all kinds of transfers @@ -243,11 +243,11 @@ namespace raft { * @param len lenth of the src/dst buffers in terms of number of 
elements * @param stream cuda stream */ - template - void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) - { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); - } +template +void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +} /** * @defgroup Copy Copy methods @@ -256,118 +256,118 @@ namespace raft { * @{ */ /** performs a host to device copy */ - template - void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) - { - copy(d_ptr, h_ptr, len, stream); - } +template +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(d_ptr, h_ptr, len, stream); +} /** performs a device to host copy */ - template - void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) - { - copy(h_ptr, d_ptr, len, stream); - } - - template - void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) - { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); - } +template +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) +{ + copy(h_ptr, d_ptr, len, stream); +} + +template +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); +} /** @} */ /** * @defgroup Debug Utils for debugging host/device buffers * @{ */ - template - void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) - { - out << variable_name << "=["; - for (size_t i = 0; i < componentsCount; ++i) { - if (i != 0) out << ","; - out << host_mem[i]; - } - out << "];\n"; - } - - template - void print_device_vector(const 
char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) - { - T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); - print_host_vector(variable_name, host_mem, componentsCount, out); - delete[] host_mem; - } +template +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; +} + +template +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; +} /** @} */ /** helper method to get max usable shared mem per block parameter */ - inline int getSharedMemPerBlock() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int smemPerBlk; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); - return smemPerBlk; - } +inline int getSharedMemPerBlock() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int smemPerBlk; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + return smemPerBlk; +} /** helper method to get multi-processor count parameter */ - inline int getMultiProcessorCount() - { - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int mpCount; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); - return mpCount; - } +inline int getMultiProcessorCount() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int mpCount; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + return 
mpCount; +} /** helper method to convert an array on device to a string on host */ - template - std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) - { - std::stringstream ss; +template +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ + std::stringstream ss; - T* arr_h = (T*)malloc(size * sizeof(T)); - update_host(arr_h, arr, size, stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + T* arr_h = (T*)malloc(size * sizeof(T)); + update_host(arr_h, arr, size, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - ss << name << " = [ "; - for (int i = 0; i < size; i++) { - ss << std::setw(width) << arr_h[i]; + ss << name << " = [ "; + for (int i = 0; i < size; i++) { + ss << std::setw(width) << arr_h[i]; - if (i < size - 1) ss << ", "; - } - ss << " ]" << std::endl; + if (i < size - 1) ss << ", "; + } + ss << " ]" << std::endl; - free(arr_h); + free(arr_h); - return ss.str(); - } + return ss.str(); +} /** this seems to be unused, but may be useful in the future */ - template - void ASSERT_DEVICE_MEM(T* ptr, std::string name) - { - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); - - if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; - } - - inline uint32_t curTimeMillis() - { - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); - } +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << ". 
device=" << s_att.device + << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration).count(); +} /** Helper function to calculate need memory for allocate to store dense matrix. * @param rows number of rows in matrix @@ -375,34 +375,34 @@ namespace raft { * @return need number of items to allocate via allocate() * @sa allocate() */ - inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. * @param ptr the pointer to check * @param alignment to be checked for * @return true if address in bytes is a multiple of alignment */ - template - bool is_aligned(Type* ptr, size_t alignment) - { - return reinterpret_cast(ptr) % alignment == 0; - } +template +bool is_aligned(Type* ptr, size_t alignment) +{ + return reinterpret_cast(ptr) % alignment == 0; +} /** calculate greatest common divisor of two numbers * @a integer * @b integer * @ return gcd of a and b */ - template - IntType gcd(IntType a, IntType b) - { - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; - } +template +IntType gcd(IntType a, IntType b) +{ + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} } // namespace raft diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp index 87bca0e4e0..df27f7ce26 100644 --- a/cpp/include/raft/core/cusolver_macros.hpp +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -39,31 +39,31 @@ namespace raft { /** * @brief Exception thrown when a cuSOLVER error is encountered. 
*/ - struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} - }; - - namespace linalg { - - inline const char* cusolver_error_to_string(cusolverStatus_t err) - { - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; - } - - } // namespace linalg +struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { + +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; +} + +} // namespace linalg } // namespace raft #undef _CUSOLVER_ERR_TO_STR diff 
--git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp index 1983dadec8..10c7e8836c 100644 --- a/cpp/include/raft/core/cusparse_macros.hpp +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -40,35 +40,35 @@ namespace raft { /** * @brief Exception thrown when a cuSparse error is encountered. */ - struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) : raft::exception(message) {} - explicit cusparse_error(std::string const& message) : raft::exception(message) {} - }; +struct cusparse_error : public raft::exception { + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} +}; - namespace sparse { - namespace detail { +namespace sparse { +namespace detail { - inline const char* cusparse_error_to_string(cusparseStatus_t err) - { +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 - return cusparseGetErrorString(err); + return cusparseGetErrorString(err); #else // CUDART_VERSION - switch (err) { - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); - _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: return "CUSPARSE_STATUS_UNKNOWN"; - }; + switch (err) { + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH); + 
_CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); + _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + default: return "CUSPARSE_STATUS_UNKNOWN"; + }; #endif // CUDART_VERSION - } +} - } // namespace detail - } // namespace sparse +} // namespace detail +} // namespace sparse } // namespace raft #undef _CUSPARSE_ERR_TO_STR diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 8b49715f79..5e1aa3af28 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -35,36 +35,36 @@ namespace raft { /** base exception class for the whole of raft */ - class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { +class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { + collect_call_stack(); + } + + /** get the message associated 
with this exception */ + char const* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept + { #ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; + constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT auto depth = backtrace(stack, kMaxStackDepth); std::ostringstream oss; @@ -82,8 +82,8 @@ namespace raft { free(strings); msg_ += oss.str(); #endif // __GNUC__ - } - }; + } +}; /** * @brief Exception thrown when logical precondition is violated. @@ -92,10 +92,10 @@ namespace raft { * RAFT_EXPECTS and RAFT_FAIL macros. * */ - struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} - }; +struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} +}; } // namespace raft diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 13a3fc26d9..158816f762 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -56,288 +56,288 @@ namespace raft { * @brief Main handle object that stores all necessary context used for calling * necessary cuda kernels and/or libraries */ - class handle_t { - public: - // delete copy/move constructors and assignment operators as - // copying and moving underlying resources is unsafe - handle_t(const handle_t&) = delete; - handle_t& operator=(const handle_t&) = delete; - handle_t(handle_t&&) = delete; - handle_t& operator=(handle_t&&) = delete; - - /** - * @brief Construct a handle with a stream 
view and stream pool - * - * @param[in] stream_view the default stream (which has the default per-thread stream if - * unspecified) - * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) - */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} - { - create_resources(); - } - - /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return 
cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? 
get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: 
Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } - }; // class handle_t +class handle_t { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + handle_t(const handle_t&) 
= delete; + handle_t& operator=(const handle_t&) = delete; + handle_t(handle_t&&) = delete; + handle_t& operator=(handle_t&&) = delete; + + /** + * @brief Construct a handle with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + */ + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + stream_view_{stream_view}, + stream_pool_{stream_pool} + { + create_resources(); + } + + /** Destroys all held-up resources */ + virtual ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + cublasHandle_t get_cublas_handle() const + { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); + cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const + { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + 
RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + + /** + * @brief synchronize a stream on the handle + */ + void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } + + /** + * @brief synchronize main stream on the handle + */ + void sync_stream() const { sync_stream(stream_view_); } + + /** + * @brief returns main stream on the handle + */ + rmm::cuda_stream_view get_stream() const { return stream_view_; } + + /** + * @brief returns whether stream pool was initialized on the handle + */ + + bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } + + /** + * @brief returns stream pool on the handle + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return *stream_pool_; + } + + std::size_t get_stream_pool_size() const + { + return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; + } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + return stream_pool_->get_stream(stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on handle + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return is_stream_pool_initialized() ? 
get_stream_from_stream_pool() : stream_view_; + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on handle + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_; + } + + /** + * @brief synchronize the stream pool on the handle + */ + void sync_stream_pool() const + { + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + sync_stream(stream_pool_->get_stream(i)); + } + } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(stream_pool_->get_stream(stream_index)); + } + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const + { + RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); + for (std::size_t i = 0; i < get_stream_pool_size(); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); + } + } + + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + return *communicator_; + } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + subcomms_[key] = subcomm; + } + + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + + auto subcomm = subcomms_.at(key); + + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: 
Subcommunicator was not initialized"); + + return *subcomm; + } + + bool comms_initialized() const { return (nullptr != communicator_.get()); } + + const cudaDeviceProp& get_device_properties() const + { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = true; + } + return prop_; + } + + private: + std::shared_ptr communicator_; + std::unordered_map> subcomms_; + + const int dev_id_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; + rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; + std::shared_ptr stream_pool_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + void create_resources() + { + thrust_policy_ = std::make_unique(stream_view_); + + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() + { + if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } + if (cusolver_dn_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); + } +}; // class handle_t /** * @brief RAII approach to synchronizing across all streams in the handle */ - class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : 
handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; - }; // class stream_syncer +class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; +}; // class stream_syncer } // namespace raft diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp index 43b64ce430..6764065363 100644 --- a/cpp/include/raft/core/interruptible.hpp +++ b/cpp/include/raft/core/interruptible.hpp @@ -39,9 +39,9 @@ namespace raft { * @brief Exception thrown during `interruptible::synchronize` call when it detects a request * to cancel the work performed in this CPU thread. */ - struct interrupted_exception : public raft::exception { - using raft::exception::exception; - }; +struct interrupted_exception : public raft::exception { + using raft::exception::exception; +}; /** * @brief Cooperative-style interruptible execution. @@ -68,208 +68,208 @@ namespace raft { * (e.g., CTRL+C), but extra effort on the use side is required to allow safe interrupting and * resuming of the GPU stream work. */ - class interruptible { - public: - /** - * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] stream a CUDA stream. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. 
- * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(rmm::cuda_stream_view stream) - { - get_token()->synchronize_impl(cudaStreamQuery, stream); - } +class interruptible { + public: + /** + * @brief Synchronize the CUDA stream, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] stream a CUDA stream. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(rmm::cuda_stream_view stream) + { + get_token()->synchronize_impl(cudaStreamQuery, stream); + } - /** - * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` - * called on this CPU thread. - * - * @param [in] event a CUDA event. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread before the currently captured work has been finished. - * @throw raft::cuda_error if another CUDA error happens. - */ - static inline void synchronize(cudaEvent_t event) - { - get_token()->synchronize_impl(cudaEventQuery, event); - } + /** + * @brief Synchronize the CUDA event, subject to being interrupted by `interruptible::cancel` + * called on this CPU thread. + * + * @param [in] event a CUDA event. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread before the currently captured work has been finished. + * @throw raft::cuda_error if another CUDA error happens. + */ + static inline void synchronize(cudaEvent_t event) + { + get_token()->synchronize_impl(cudaEventQuery, event); + } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * This is a cancellation point for an interruptible thread. 
It's called in the internals of - * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's - * recommended to call `interruptible::yield()` in between to make sure the thread does not become - * unresponsive for too long. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU - * thread. - */ - static inline void yield() { get_token()->yield_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * This is a cancellation point for an interruptible thread. It's called in the internals of + * `interruptible::synchronize` in a loop. If two synchronize calls are far apart, it's + * recommended to call `interruptible::yield()` in between to make sure the thread does not become + * unresponsive for too long. + * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @throw raft::interrupted_exception if interruptible::cancel() was called on the current CPU + * thread. + */ + static inline void yield() { get_token()->yield_impl(); } - /** - * @brief Check the thread state, whether the thread can continue execution or is interrupted by - * `interruptible::cancel`. - * - * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. - * - * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. - * - * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. - */ - static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } + /** + * @brief Check the thread state, whether the thread can continue execution or is interrupted by + * `interruptible::cancel`. + * + * Same as `interruptible::yield`, but does not throw an exception if the thread is cancelled. 
+ * + * Both `yield` and `yield_no_throw` reset the state to non-cancelled after execution. + * + * @return whether the thread can continue, i.e. `true` means continue, `false` means cancelled. + */ + static inline auto yield_no_throw() -> bool { return get_token()->yield_no_throw_impl(); } - /** - * @brief Get a cancellation token for this CPU thread. - * - * @return an object that can be used to cancel the GPU work waited on this CPU thread. - */ - static inline auto get_token() -> std::shared_ptr - { - // NB: using static thread-local storage to keep the token alive once it is initialized - static thread_local std::shared_ptr s( - get_token_impl(std::this_thread::get_id())); - return s; - } + /** + * @brief Get a cancellation token for this CPU thread. + * + * @return an object that can be used to cancel the GPU work waited on this CPU thread. + */ + static inline auto get_token() -> std::shared_ptr + { + // NB: using static thread-local storage to keep the token alive once it is initialized + static thread_local std::shared_ptr s( + get_token_impl(std::this_thread::get_id())); + return s; + } - /** - * @brief Get a cancellation token for a CPU thread given by its id. - * - * The returned token may live longer than the associated thread. In that case, using its - * `cancel` method has no effect. - * - * @param [in] thread_id an id of a C++ CPU thread. - * @return an object that can be used to cancel the GPU work waited on the given CPU thread. - */ - static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr - { - return get_token_impl(thread_id); - } + /** + * @brief Get a cancellation token for a CPU thread given by its id. + * + * The returned token may live longer than the associated thread. In that case, using its + * `cancel` method has no effect. + * + * @param [in] thread_id an id of a C++ CPU thread. + * @return an object that can be used to cancel the GPU work waited on the given CPU thread. 
+ */ + static inline auto get_token(std::thread::id thread_id) -> std::shared_ptr + { + return get_token_impl(thread_id); + } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by the `thread_id` - * - * Note, this function uses a mutex to safely get a cancellation token that may be shared - * among multiple threads. If you plan to use it from a signal handler, consider the non-static - * `cancel()` instead. - * - * @param [in] thread_id a CPU thread, in which the work should be interrupted. - */ - static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by the `thread_id` + * + * Note, this function uses a mutex to safely get a cancellation token that may be shared + * among multiple threads. If you plan to use it from a signal handler, consider the non-static + * `cancel()` instead. + * + * @param [in] thread_id a CPU thread, in which the work should be interrupted. + */ + static inline void cancel(std::thread::id thread_id) { get_token(thread_id)->cancel(); } - /** - * @brief Cancel any current or next call to `interruptible::synchronize` performed on the - * CPU thread given by this `interruptible` token. - * - * Note, this function does not involve thread synchronization/locks and does not throw any - * exceptions, so it's safe to call from a signal handler. - */ - inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } + /** + * @brief Cancel any current or next call to `interruptible::synchronize` performed on the + * CPU thread given by this `interruptible` token. + * + * Note, this function does not involve thread synchronization/locks and does not throw any + * exceptions, so it's safe to call from a signal handler. 
+ */ + inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } - // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; - auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; + // don't allow the token to leave the shared_ptr + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; + auto operator=(interruptible const&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; - private: - /** Global registry of thread-local cancellation stores. */ - static inline std::unordered_map> registry_; - /** Protect the access to the registry. */ - static inline std::mutex mutex_; + private: + /** Global registry of thread-local cancellation stores. */ + static inline std::unordered_map> registry_; + /** Protect the access to the registry. */ + static inline std::mutex mutex_; - /** - * Create a new interruptible token or get an existing from the global registry_. - * - * Presumptions: - * - * 1. get_token_impl must be called at most once per thread. - * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). - * 3. get_token_impl can be called as many times as needed, producing a valid - * token for any input thread_id, independent of whether a C++ thread with this - * id exists or not. - * - * @tparam Claim whether to bind the token to the given thread. - * @param [in] thread_id the id of the associated C++ thread. - * @return new or existing interruptible token. - */ - template - static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr - { - std::lock_guard guard_get(mutex_); - // the following constructs an empty shared_ptr if the key does not exist. 
- auto& weak_store = registry_[thread_id]; - auto thread_store = weak_store.lock(); - if (!thread_store || (Claim && thread_store->claimed_)) { - // Create a new thread_store in two cases: - // 1. It does not exist in the map yet - // 2. The previous store in the map has not yet been deleted - thread_store.reset(new interruptible(), [thread_id](auto ts) { - std::lock_guard guard_erase(mutex_); - auto found = registry_.find(thread_id); - if (found != registry_.end()) { - auto stored = found->second.lock(); - // thread_store is not moveable, thus retains its original location. - // Not equal pointers below imply the new store has been already placed - // in the registry_ by the same std::thread::id - if (!stored || stored.get() == ts) { registry_.erase(found); } - } - delete ts; - }); - std::weak_ptr(thread_store).swap(weak_store); - } - // The thread_store is "claimed" by the thread - if constexpr (Claim) { thread_store->claimed_ = true; } - return thread_store; + /** + * Create a new interruptible token or get an existing from the global registry_. + * + * Presumptions: + * + * 1. get_token_impl must be called at most once per thread. + * 2. When `Claim == true`, thread_id must be equal to std::this_thread::get_id(). + * 3. get_token_impl can be called as many times as needed, producing a valid + * token for any input thread_id, independent of whether a C++ thread with this + * id exists or not. + * + * @tparam Claim whether to bind the token to the given thread. + * @param [in] thread_id the id of the associated C++ thread. + * @return new or existing interruptible token. + */ + template + static auto get_token_impl(std::thread::id thread_id) -> std::shared_ptr + { + std::lock_guard guard_get(mutex_); + // the following constructs an empty shared_ptr if the key does not exist. 
+ auto& weak_store = registry_[thread_id]; + auto thread_store = weak_store.lock(); + if (!thread_store || (Claim && thread_store->claimed_)) { + // Create a new thread_store in two cases: + // 1. It does not exist in the map yet + // 2. The previous store in the map has not yet been deleted + thread_store.reset(new interruptible(), [thread_id](auto ts) { + std::lock_guard guard_erase(mutex_); + auto found = registry_.find(thread_id); + if (found != registry_.end()) { + auto stored = found->second.lock(); + // thread_store is not moveable, thus retains its original location. + // Not equal pointers below imply the new store has been already placed + // in the registry_ by the same std::thread::id + if (!stored || stored.get() == ts) { registry_.erase(found); } } + delete ts; + }); + std::weak_ptr(thread_store).swap(weak_store); + } + // The thread_store is "claimed" by the thread + if constexpr (Claim) { thread_store->claimed_ = true; } + return thread_store; + } - /** - * Communicate whether the thread is in a cancelled state or can continue execution. - * - * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. - * These are the only two places where it's used. - */ - std::atomic_flag continue_; - /** This flag is set to true when the created token is placed into a thread-local storage. */ - bool claimed_ = false; + /** + * Communicate whether the thread is in a cancelled state or can continue execution. + * + * `yield` checks this flag and always resets it to the signalled state; `cancel` clears it. + * These are the only two places where it's used. + */ + std::atomic_flag continue_; + /** This flag is set to true when the created token is placed into a thread-local storage. 
*/ + bool claimed_ = false; - interruptible() noexcept { yield_no_throw_impl(); } + interruptible() noexcept { yield_no_throw_impl(); } - void yield_impl() - { - if (!yield_no_throw_impl()) { - throw interrupted_exception("The work in this thread was cancelled."); - } - } + void yield_impl() + { + if (!yield_no_throw_impl()) { + throw interrupted_exception("The work in this thread was cancelled."); + } + } - auto yield_no_throw_impl() noexcept -> bool - { - return continue_.test_and_set(std::memory_order_relaxed); - } + auto yield_no_throw_impl() noexcept -> bool + { + return continue_.test_and_set(std::memory_order_relaxed); + } - template - inline void synchronize_impl(Query query, Object object) - { - cudaError_t query_result; - while (true) { - yield_impl(); - query_result = query(object); - if (query_result != cudaErrorNotReady) { break; } - std::this_thread::yield(); - } - RAFT_CUDA_TRY(query_result); - } - }; + template + inline void synchronize_impl(Query query, Object object) + { + cudaError_t query_result; + while (true) { + yield_impl(); + query_result = query(object); + if (query_result != cudaErrorNotReady) { break; } + std::this_thread::yield(); + } + RAFT_CUDA_TRY(query_result); + } +}; } // namespace raft diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp index a25f3fa4a6..9066e103d0 100644 --- a/cpp/include/raft/core/logger.hpp +++ b/cpp/include/raft/core/logger.hpp @@ -55,8 +55,8 @@ namespace raft { - static const std::string RAFT_NAME = "raft"; - static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); +static const std::string RAFT_NAME = "raft"; +static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); /** * @defgroup CStringFormat Expand a C-style format string @@ -70,28 +70,28 @@ namespace raft { * * @{ */ - std::string format(const char* fmt, va_list& vl) - { - char buf[4096]; - vsnprintf(buf, sizeof(buf), fmt, vl); - return std::string(buf); - } - - std::string format(const char* 
fmt, ...) - { - va_list vl; - va_start(vl, fmt); - std::string str = format(fmt, vl); - va_end(vl); - return str; - } +std::string format(const char* fmt, va_list& vl) +{ + char buf[4096]; + vsnprintf(buf, sizeof(buf), fmt, vl); + return std::string(buf); +} + +std::string format(const char* fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + std::string str = format(fmt, vl); + va_end(vl); + return str; +} /** @} */ - int convert_level_to_spdlog(int level) - { - level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); - return RAFT_LEVEL_TRACE - level; - } +int convert_level_to_spdlog(int level) +{ + level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); + return RAFT_LEVEL_TRACE - level; +} /** * @brief The main Logging class for raft library. @@ -103,140 +103,140 @@ namespace raft { * @todo This currently only supports logging to stdout. Need to add support in * future to add custom loggers as well [Issue #2046] */ - class logger { - public: - // @todo setting the logger once per process with - logger(std::string const& name_ = "") - : sink{std::make_shared()}, - spdlogger{std::make_shared(name_, sink)}, - cur_pattern() - { - set_pattern(default_log_pattern); - set_level(RAFT_LEVEL_INFO); - } - /** - * @brief Singleton method to get the underlying logger object - * - * @return the singleton logger object - */ - static logger& get(std::string const& name = "") - { - if (log_map.find(name) == log_map.end()) { - log_map[name] = std::make_shared(name); - } - return *log_map[name]; - } - - /** - * @brief Set the logging level. - * - * Only messages with level equal or above this will be printed - * - * @param[in] level logging level - * - * @note The log level will actually be set only if the input is within the - * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll - * be ignored. 
See documentation of decisiontree for how this gets used - */ - void set_level(int level) - { - level = convert_level_to_spdlog(level); - spdlogger->set_level(static_cast(level)); - } - - /** - * @brief Set the logging pattern - * - * @param[in] pattern the pattern to be set. Refer this link - * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting - * to know the right syntax of this pattern - */ - void set_pattern(const std::string& pattern) - { - cur_pattern = pattern; - spdlogger->set_pattern(pattern); - } - - /** - * @brief Register a callback function to be run in place of usual log call - * - * @param[in] callback the function to be run on all logged messages - */ - void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } - - /** - * @brief Register a flush function compatible with the registered callback - * - * @param[in] flush the function to use when flushing logs - */ - void set_flush(void (*flush)()) { sink->set_flush(flush); } - - /** - * @brief Tells whether messages will be logged for the given log level - * - * @param[in] level log level to be checked for - * @return true if messages will be logged for this level, else false - */ - bool should_log_for(int level) const - { - level = convert_level_to_spdlog(level); - auto level_e = static_cast(level); - return spdlogger->should_log(level_e); - } - - /** - * @brief Query for the current log level - * - * @return the current log level - */ - int get_level() const - { - auto level_e = spdlogger->level(); - return RAFT_LEVEL_TRACE - static_cast(level_e); - } - - /** - * @brief Get the current logging pattern - * @return the pattern - */ - std::string get_pattern() const { return cur_pattern; } - - /** - * @brief Main logging method - * - * @param[in] level logging level of this message - * @param[in] fmt C-like format string, followed by respective params - */ - void log(int level, const char* fmt, ...) 
- { - level = convert_level_to_spdlog(level); - auto level_e = static_cast(level); - // explicit check to make sure that we only expand messages when required - if (spdlogger->should_log(level_e)) { - va_list vl; - va_start(vl, fmt); - auto msg = format(fmt, vl); - va_end(vl); - spdlogger->log(level_e, msg); - } - } - - /** - * @brief Flush logs by calling flush on underlying logger - */ - void flush() { spdlogger->flush(); } - - ~logger() {} - - private: - logger(); - - static inline std::unordered_map> log_map; - std::shared_ptr sink; - std::shared_ptr spdlogger; - std::string cur_pattern; - int cur_level; - }; // class logger +class logger { + public: + // @todo setting the logger once per process with + logger(std::string const& name_ = "") + : sink{std::make_shared()}, + spdlogger{std::make_shared(name_, sink)}, + cur_pattern() + { + set_pattern(default_log_pattern); + set_level(RAFT_LEVEL_INFO); + } + /** + * @brief Singleton method to get the underlying logger object + * + * @return the singleton logger object + */ + static logger& get(std::string const& name = "") + { + if (log_map.find(name) == log_map.end()) { + log_map[name] = std::make_shared(name); + } + return *log_map[name]; + } + + /** + * @brief Set the logging level. + * + * Only messages with level equal or above this will be printed + * + * @param[in] level logging level + * + * @note The log level will actually be set only if the input is within the + * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll + * be ignored. See documentation of decisiontree for how this gets used + */ + void set_level(int level) + { + level = convert_level_to_spdlog(level); + spdlogger->set_level(static_cast(level)); + } + + /** + * @brief Set the logging pattern + * + * @param[in] pattern the pattern to be set. 
Refer this link + * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting + * to know the right syntax of this pattern + */ + void set_pattern(const std::string& pattern) + { + cur_pattern = pattern; + spdlogger->set_pattern(pattern); + } + + /** + * @brief Register a callback function to be run in place of usual log call + * + * @param[in] callback the function to be run on all logged messages + */ + void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } + + /** + * @brief Register a flush function compatible with the registered callback + * + * @param[in] flush the function to use when flushing logs + */ + void set_flush(void (*flush)()) { sink->set_flush(flush); } + + /** + * @brief Tells whether messages will be logged for the given log level + * + * @param[in] level log level to be checked for + * @return true if messages will be logged for this level, else false + */ + bool should_log_for(int level) const + { + level = convert_level_to_spdlog(level); + auto level_e = static_cast(level); + return spdlogger->should_log(level_e); + } + + /** + * @brief Query for the current log level + * + * @return the current log level + */ + int get_level() const + { + auto level_e = spdlogger->level(); + return RAFT_LEVEL_TRACE - static_cast(level_e); + } + + /** + * @brief Get the current logging pattern + * @return the pattern + */ + std::string get_pattern() const { return cur_pattern; } + + /** + * @brief Main logging method + * + * @param[in] level logging level of this message + * @param[in] fmt C-like format string, followed by respective params + */ + void log(int level, const char* fmt, ...) 
+ { + level = convert_level_to_spdlog(level); + auto level_e = static_cast(level); + // explicit check to make sure that we only expand messages when required + if (spdlogger->should_log(level_e)) { + va_list vl; + va_start(vl, fmt); + auto msg = format(fmt, vl); + va_end(vl); + spdlogger->log(level_e, msg); + } + } + + /** + * @brief Flush logs by calling flush on underlying logger + */ + void flush() { spdlogger->flush(); } + + ~logger() {} + + private: + logger(); + + static inline std::unordered_map> log_map; + std::shared_ptr sink; + std::shared_ptr spdlogger; + std::string cur_pattern; + int cur_level; +}; // class logger }; // namespace raft diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index 502686786e..f92a0e5e59 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -30,32 +30,32 @@ namespace raft { /** * @\brief C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. */ - using layout_c_contiguous = detail::stdex::layout_right; +using layout_c_contiguous = detail::stdex::layout_right; /** * @\brief F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. */ - using layout_f_contiguous = detail::stdex::layout_left; +using layout_f_contiguous = detail::stdex::layout_left; /** * @brief stdex::mdspan with device tag to avoid accessing incorrect memory location. */ - template > - using device_mdspan = detail::stdex:: - mdspan>; +template > +using device_mdspan = detail::stdex:: + mdspan>; /** * @brief stdex::mdspan with host tag to avoid accessing incorrect memory location. */ - template > - using host_mdspan = - detail::stdex::mdspan>; +template > +using host_mdspan = + detail::stdex::mdspan>; /** * @brief Modified from the c++ mdarray proposal @@ -86,61 +86,61 @@ namespace raft { * - For the above reasons, copying from other mdarray with different policy type is also * removed. 
*/ - template - class mdarray { - static_assert(!std::is_const::value, - "Element type for container must not be const."); - - public: - using extents_type = Extents; - using layout_type = LayoutPolicy; - using mapping_type = typename layout_type::template mapping; - using element_type = ElementType; - - using value_type = std::remove_cv_t; - using index_type = std::size_t; - using difference_type = std::ptrdiff_t; - // Naming: ref impl: container_policy_type, proposal: container_policy - using container_policy_type = ContainerPolicy; - using container_type = typename container_policy_type::container_type; - - using pointer = typename container_policy_type::pointer; - using const_pointer = typename container_policy_type::const_pointer; - using reference = typename container_policy_type::reference; - using const_reference = typename container_policy_type::const_reference; - - private: - template , - typename container_policy_type::const_accessor_policy, - typename container_policy_type::accessor_policy>> - using view_type_impl = - std::conditional_t, - device_mdspan>; - - public: - /** - * \brief the mdspan type returned by view method. 
- */ - using view_type = view_type_impl; - using const_view_type = view_type_impl; - - public: - constexpr mdarray() noexcept(std::is_nothrow_default_constructible_v) - : cp_{rmm::cuda_stream_default}, c_{cp_.create(0)} {}; - constexpr mdarray(mdarray const&) noexcept(std::is_nothrow_copy_constructible_v) = - default; - constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = - default; - - constexpr auto operator =(mdarray const&) noexcept( - std::is_nothrow_copy_assignable::value) -> mdarray& = default; - constexpr auto operator =(mdarray&&) noexcept( - std::is_nothrow_move_assignable::value) -> mdarray& = default; - - ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; +template +class mdarray { + static_assert(!std::is_const::value, + "Element type for container must not be const."); + + public: + using extents_type = Extents; + using layout_type = LayoutPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = ElementType; + + using value_type = std::remove_cv_t; + using index_type = std::size_t; + using difference_type = std::ptrdiff_t; + // Naming: ref impl: container_policy_type, proposal: container_policy + using container_policy_type = ContainerPolicy; + using container_type = typename container_policy_type::container_type; + + using pointer = typename container_policy_type::pointer; + using const_pointer = typename container_policy_type::const_pointer; + using reference = typename container_policy_type::reference; + using const_reference = typename container_policy_type::const_reference; + + private: + template , + typename container_policy_type::const_accessor_policy, + typename container_policy_type::accessor_policy>> + using view_type_impl = + std::conditional_t, + device_mdspan>; + + public: + /** + * \brief the mdspan type returned by view method. 
+ */ + using view_type = view_type_impl; + using const_view_type = view_type_impl; + + public: + constexpr mdarray() noexcept(std::is_nothrow_default_constructible_v) + : cp_{rmm::cuda_stream_default}, c_{cp_.create(0)} {}; + constexpr mdarray(mdarray const&) noexcept(std::is_nothrow_copy_constructible_v) = + default; + constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = + default; + + constexpr auto operator =(mdarray const&) noexcept( + std::is_nothrow_copy_assignable::value) -> mdarray& = default; + constexpr auto operator =(mdarray&&) noexcept( + std::is_nothrow_move_assignable::value) -> mdarray& = default; + + ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; #ifndef RAFT_MDARRAY_CTOR_CONSTEXPR #if !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) @@ -158,141 +158,141 @@ namespace raft { #endif // !(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 2) #endif // RAFT_MDARRAY_CTOR_CONSTEXPR - /** - * @brief The only constructor that can create storage, this is to make sure CUDA stream is being - * used. - */ - RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type const& cp) - : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) - { - } - RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type& cp) - : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) - { - } + /** + * @brief The only constructor that can create storage, this is to make sure CUDA stream is being + * used. + */ + RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type const& cp) + : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) + { + } + RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(mapping_type const& m, container_policy_type& cp) + : cp_(cp), map_(m), c_(cp_.create(map_.required_span_size())) + { + } #undef RAFT_MDARRAY_CTOR_CONSTEXPR - /** - * @brief Get a mdspan that can be passed down to CUDA kernels. 
- */ - auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } - /** - * @brief Get a mdspan that can be passed down to CUDA kernels. - */ - auto view() const noexcept - { - return const_view_type(c_.data(), map_, cp_.make_accessor_policy()); - } - - [[nodiscard]] constexpr auto size() const noexcept -> index_type { return this->view().size(); } - - [[nodiscard]] auto data() noexcept -> pointer { return c_.data(); } - [[nodiscard]] constexpr auto data() const noexcept -> const_pointer { return c_.data(); } - - /** - * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. - */ - template - auto operator()(IndexType&&... indices) - -> std::enable_if_t && ...) && - std::is_constructible_v && - std::is_constructible_v, - /* device policy is not default constructible due to requirement for CUDA - stream. */ - /* std::is_default_constructible_v */ - reference> - { - return cp_.access(c_, map_(std::forward(indices)...)); - } - - /** - * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. - */ - template - auto operator()(IndexType&&... indices) const - -> std::enable_if_t && ...) && - std::is_constructible_v && - std::is_constructible::value, - /* device policy is not default constructible due to requirement for CUDA - stream. 
*/ - /* std::is_default_constructible_v */ - const_reference> - { - return cp_.access(c_, map_(std::forward(indices)...)); - } - - // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> index_type - { - return extents_type::rank(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> index_type - { - return extents_type::rank_dynamic(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept - -> index_type - { - return extents_type::static_extent(r); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type - { - return map_.extents(); - } - /** - * @brief the extent of rank r - */ - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type - { - return map_.extents().extent(r); - } - // mapping - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type - { - return map_; - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool - { - return map_.is_unique(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_contiguous() const noexcept -> bool - { - return map_.is_contiguous(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool - { - return map_.is_strided(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type - { - return map_.stride(r); - } - - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool - { - return mapping_type::is_always_unique(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_contiguous() noexcept -> bool - { - return mapping_type::is_always_contiguous(); - } - [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() 
noexcept -> bool - { - return mapping_type::is_always_strided(); - } - - private: - template - friend class mdarray; - - private: - container_policy_type cp_; - mapping_type map_; - container_type c_; - }; + /** + * @brief Get a mdspan that can be passed down to CUDA kernels. + */ + auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } + /** + * @brief Get a mdspan that can be passed down to CUDA kernels. + */ + auto view() const noexcept + { + return const_view_type(c_.data(), map_, cp_.make_accessor_policy()); + } + + [[nodiscard]] constexpr auto size() const noexcept -> index_type { return this->view().size(); } + + [[nodiscard]] auto data() noexcept -> pointer { return c_.data(); } + [[nodiscard]] constexpr auto data() const noexcept -> const_pointer { return c_.data(); } + + /** + * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. + */ + template + auto operator()(IndexType&&... indices) + -> std::enable_if_t && ...) && + std::is_constructible_v && + std::is_constructible_v, + /* device policy is not default constructible due to requirement for CUDA + stream. */ + /* std::is_default_constructible_v */ + reference> + { + return cp_.access(c_, map_(std::forward(indices)...)); + } + + /** + * @brief Indexing operator, use it sparingly since it triggers a device<->host copy. + */ + template + auto operator()(IndexType&&... indices) const + -> std::enable_if_t && ...) && + std::is_constructible_v && + std::is_constructible::value, + /* device policy is not default constructible due to requirement for CUDA + stream. 
*/ + /* std::is_default_constructible_v */ + const_reference> + { + return cp_.access(c_, map_(std::forward(indices)...)); + } + + // basic_mdarray observers of the domain multidimensional index space (also in basic_mdspan) + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank() noexcept -> index_type + { + return extents_type::rank(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto rank_dynamic() noexcept -> index_type + { + return extents_type::rank_dynamic(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto static_extent(size_t r) noexcept + -> index_type + { + return extents_type::static_extent(r); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extents() const noexcept -> extents_type + { + return map_.extents(); + } + /** + * @brief the extent of rank r + */ + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto extent(size_t r) const noexcept -> index_type + { + return map_.extents().extent(r); + } + // mapping + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto mapping() const noexcept -> mapping_type + { + return map_; + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_unique() const noexcept -> bool + { + return map_.is_unique(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_contiguous() const noexcept -> bool + { + return map_.is_contiguous(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto is_strided() const noexcept -> bool + { + return map_.is_strided(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION constexpr auto stride(size_t r) const -> index_type + { + return map_.stride(r); + } + + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_unique() noexcept -> bool + { + return mapping_type::is_always_unique(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_contiguous() noexcept -> bool + { + return mapping_type::is_always_contiguous(); + } + [[nodiscard]] MDSPAN_INLINE_FUNCTION static constexpr auto is_always_strided() 
noexcept -> bool + { + return mapping_type::is_always_strided(); + } + + private: + template + friend class mdarray; + + private: + container_policy_type cp_; + mapping_type map_; + container_type c_; +}; /** * @brief mdarray with host container policy @@ -301,12 +301,12 @@ namespace raft { * @tparam LayoutPolicy policy for indexing strides and layout ordering * @tparam ContainerPolicy storage and accessor policy */ - template > - using host_mdarray = - mdarray>; +template > +using host_mdarray = + mdarray>; /** * @brief mdarray with device container policy @@ -315,84 +315,84 @@ namespace raft { * @tparam LayoutPolicy policy for indexing strides and layout ordering * @tparam ContainerPolicy storage and accessor policy */ - template > - using device_mdarray = - mdarray>; +template > +using device_mdarray = + mdarray>; /** * @brief Shorthand for 0-dim host mdarray (scalar). * @tparam ElementType the data type of the scalar element */ - template - using host_scalar = host_mdarray; +template +using host_scalar = host_mdarray; /** * @brief Shorthand for 0-dim host mdarray (scalar). * @tparam ElementType the data type of the scalar element */ - template - using device_scalar = device_mdarray; +template +using device_scalar = device_mdarray; /** * @brief Shorthand for 1-dim host mdarray. * @tparam ElementType the data type of the vector elements */ - template - using host_vector = host_mdarray; +template +using host_vector = host_mdarray; /** * @brief Shorthand for 1-dim device mdarray. * @tparam ElementType the data type of the vector elements */ - template - using device_vector = device_mdarray; +template +using device_vector = device_mdarray; /** * @brief Shorthand for c-contiguous host matrix. * @tparam ElementType the data type of the matrix elements * @tparam LayoutPolicy policy for strides and layout ordering */ - template - using host_matrix = host_mdarray; +template +using host_matrix = host_mdarray; /** * @brief Shorthand for c-contiguous device matrix. 
* @tparam ElementType the data type of the matrix elements * @tparam LayoutPolicy policy for strides and layout ordering */ - template - using device_matrix = device_mdarray; +template +using device_matrix = device_mdarray; /** * @brief Shorthand for 0-dim host mdspan (scalar). * @tparam ElementType the data type of the scalar element */ - template - using host_scalar_view = host_mdspan; +template +using host_scalar_view = host_mdspan; /** * @brief Shorthand for 0-dim host mdspan (scalar). * @tparam ElementType the data type of the scalar element */ - template - using device_scalar_view = device_mdspan; +template +using device_scalar_view = device_mdspan; /** * @brief Shorthand for 1-dim host mdspan. * @tparam ElementType the data type of the vector elements */ - template - using host_vector_view = host_mdspan; +template +using host_vector_view = host_mdspan; /** * @brief Shorthand for 1-dim device mdspan. * @tparam ElementType the data type of the vector elements */ - template - using device_vector_view = device_mdspan; +template +using device_vector_view = device_mdspan; /** * @brief Shorthand for c-contiguous host matrix view. @@ -400,8 +400,8 @@ namespace raft { * @tparam LayoutPolicy policy for strides and layout ordering * */ - template - using host_matrix_view = host_mdspan; +template +using host_matrix_view = host_mdspan; /** * @brief Shorthand for c-contiguous device matrix view. @@ -409,8 +409,8 @@ namespace raft { * @tparam LayoutPolicy policy for strides and layout ordering * */ - template - using device_matrix_view = device_mdspan; +template +using device_matrix_view = device_mdspan; /** * @brief Create a 0-dim (scalar) mdspan instance for host value. 
@@ -418,12 +418,12 @@ namespace raft { * @tparam ElementType the data type of the matrix elements * @param[in] ptr on device to wrap */ - template - auto make_host_scalar_view(ElementType* ptr) - { - detail::scalar_extent extents; - return host_scalar_view{ptr, extents}; - } +template +auto make_host_scalar_view(ElementType* ptr) +{ + detail::scalar_extent extents; + return host_scalar_view{ptr, extents}; +} /** * @brief Create a 0-dim (scalar) mdspan instance for device value. @@ -431,12 +431,12 @@ namespace raft { * @tparam ElementType the data type of the matrix elements * @param[in] ptr on device to wrap */ - template - auto make_device_scalar_view(ElementType* ptr) - { - detail::scalar_extent extents; - return device_scalar_view{ptr, extents}; - } +template +auto make_device_scalar_view(ElementType* ptr) +{ + detail::scalar_extent extents; + return device_scalar_view{ptr, extents}; +} /** * @brief Create a 2-dim c-contiguous mdspan instance for host pointer. It's @@ -448,12 +448,12 @@ namespace raft { * @param[in] n_rows number of rows in pointer * @param[in] n_cols number of columns in pointer */ - template - auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) - { - detail::matrix_extent extents{n_rows, n_cols}; - return host_matrix_view{ptr, extents}; - } +template +auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) +{ + detail::matrix_extent extents{n_rows, n_cols}; + return host_matrix_view{ptr, extents}; +} /** * @brief Create a 2-dim c-contiguous mdspan instance for device pointer. 
It's * expected that the given layout policy match the layout of the underlying @@ -464,12 +464,12 @@ namespace raft { * @param[in] n_rows number of rows in pointer * @param[in] n_cols number of columns in pointer */ - template - auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) - { - detail::matrix_extent extents{n_rows, n_cols}; - return device_matrix_view{ptr, extents}; - } +template +auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols) +{ + detail::matrix_extent extents{n_rows, n_cols}; + return device_matrix_view{ptr, extents}; +} /** * @brief Create a 1-dim mdspan instance for host pointer. @@ -478,12 +478,12 @@ namespace raft { * @param[in] n number of elements in pointer * @return raft::host_vector_view */ - template - auto make_host_vector_view(ElementType* ptr, size_t n) - { - detail::vector_extent extents{n}; - return host_matrix_view{ptr, extents}; - } +template +auto make_host_vector_view(ElementType* ptr, size_t n) +{ + detail::vector_extent extents{n}; + return host_matrix_view{ptr, extents}; +} /** * @brief Create a 1-dim mdspan instance for device pointer. @@ -492,12 +492,12 @@ namespace raft { * @param[in] n number of elements in pointer * @return raft::device_vector_view */ - template - auto make_device_vector_view(ElementType* ptr, size_t n) - { - detail::vector_extent extents{n}; - return device_matrix_view{ptr, extents}; - } +template +auto make_device_vector_view(ElementType* ptr, size_t n) +{ + detail::vector_extent extents{n}; + return device_matrix_view{ptr, extents}; +} /** * @brief Create a 2-dim c-contiguous host mdarray. 
@@ -507,14 +507,14 @@ namespace raft { * @param[in] n_cols number of columns in matrix * @return raft::host_matrix */ - template - auto make_host_matrix(size_t n_rows, size_t n_cols) - { - detail::matrix_extent extents{n_rows, n_cols}; - using policy_t = typename host_matrix::container_policy_type; - policy_t policy; - return host_matrix{extents, policy}; - } +template +auto make_host_matrix(size_t n_rows, size_t n_cols) +{ + detail::matrix_extent extents{n_rows, n_cols}; + using policy_t = typename host_matrix::container_policy_type; + policy_t policy; + return host_matrix{extents, policy}; +} /** * @brief Create a 2-dim c-contiguous device mdarray. @@ -525,14 +525,14 @@ namespace raft { * @param[in] stream cuda stream for ordering events * @return raft::device_matrix */ - template - auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream) - { - detail::matrix_extent extents{n_rows, n_cols}; - using policy_t = typename device_matrix::container_policy_type; - policy_t policy{stream}; - return device_matrix{extents, policy}; - } +template +auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream) +{ + detail::matrix_extent extents{n_rows, n_cols}; + using policy_t = typename device_matrix::container_policy_type; + policy_t policy{stream}; + return device_matrix{extents, policy}; +} /** * @brief Create a 2-dim c-contiguous device mdarray. @@ -544,11 +544,11 @@ namespace raft { * @param[in] n_cols number of columns in matrix * @return raft::device_matrix */ - template - auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols) - { - return make_device_matrix(n_rows, n_cols, handle.get_stream()); - } +template +auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols) +{ + return make_device_matrix(n_rows, n_cols, handle.get_stream()); +} /** * @brief Create a host scalar from v. 
@@ -557,19 +557,19 @@ namespace raft { * @param[in] v scalar type to wrap * @return raft::host_scalar */ - template - auto make_host_scalar(ElementType const& v) - { - // FIXME(jiamingy): We can optimize this by using std::array as container policy, which - // requires some more compile time dispatching. This is enabled in the ref impl but - // hasn't been ported here yet. - detail::scalar_extent extents; - using policy_t = typename host_scalar::container_policy_type; - policy_t policy; - auto scalar = host_scalar{extents, policy}; - scalar(0) = v; - return scalar; - } +template +auto make_host_scalar(ElementType const& v) +{ + // FIXME(jiamingy): We can optimize this by using std::array as container policy, which + // requires some more compile time dispatching. This is enabled in the ref impl but + // hasn't been ported here yet. + detail::scalar_extent extents; + using policy_t = typename host_scalar::container_policy_type; + policy_t policy; + auto scalar = host_scalar{extents, policy}; + scalar(0) = v; + return scalar; +} /** * @brief Create a device scalar from v. @@ -579,16 +579,16 @@ namespace raft { * @param[in] stream the cuda stream for ordering events * @return raft::device_scalar */ - template - auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream) - { - detail::scalar_extent extents; - using policy_t = typename device_scalar::container_policy_type; - policy_t policy{stream}; - auto scalar = device_scalar{extents, policy}; - scalar(0) = v; - return scalar; - } +template +auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream) +{ + detail::scalar_extent extents; + using policy_t = typename device_scalar::container_policy_type; + policy_t policy{stream}; + auto scalar = device_scalar{extents, policy}; + scalar(0) = v; + return scalar; +} /** * @brief Create a device scalar from v. 
@@ -598,11 +598,11 @@ namespace raft { * @param[in] v scalar to wrap on device * @return raft::device_scalar */ - template - auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) - { - return make_device_scalar(v, handle.get_stream()); - } +template +auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) +{ + return make_device_scalar(v, handle.get_stream()); +} /** * @brief Create a 1-dim host mdarray. @@ -610,14 +610,14 @@ namespace raft { * @param[in] n number of elements in vector * @return raft::host_vector */ - template - auto make_host_vector(size_t n) - { - detail::vector_extent extents{n}; - using policy_t = typename host_vector::container_policy_type; - policy_t policy; - return host_vector{extents, policy}; - } +template +auto make_host_vector(size_t n) +{ + detail::vector_extent extents{n}; + using policy_t = typename host_vector::container_policy_type; + policy_t policy; + return host_vector{extents, policy}; +} /** * @brief Create a 1-dim device mdarray. @@ -626,14 +626,14 @@ namespace raft { * @param[in] stream the cuda stream for ordering events * @return raft::device_vector */ - template - auto make_device_vector(size_t n, rmm::cuda_stream_view stream) - { - detail::vector_extent extents{n}; - using policy_t = typename device_vector::container_policy_type; - policy_t policy{stream}; - return device_vector{extents, policy}; - } +template +auto make_device_vector(size_t n, rmm::cuda_stream_view stream) +{ + detail::vector_extent extents{n}; + using policy_t = typename device_vector::container_policy_type; + policy_t policy{stream}; + return device_vector{extents, policy}; +} /** * @brief Create a 1-dim device mdarray. 
@@ -642,9 +642,9 @@ namespace raft { * @param[in] n number of elements in vector * @return raft::device_vector */ - template - auto make_device_vector(raft::handle_t const& handle, size_t n) - { - return make_device_vector(n, handle.get_stream()); - } +template +auto make_device_vector(raft::handle_t const& handle, size_t n) +{ + return make_device_vector(n, handle.get_stream()); +} } // namespace raft diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp index 65f3204a06..d5c8ce16b5 100644 --- a/cpp/include/raft/core/nvtx.hpp +++ b/cpp/include/raft/core/nvtx.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,19 +66,19 @@ */ namespace raft::common::nvtx { - namespace domain { +namespace domain { /** @brief The default NVTX domain. */ - struct app { - static constexpr char const* name{"application"}; - }; +struct app { + static constexpr char const* name{"application"}; +}; /** @brief This NVTX domain is supposed to be used within raft. */ - struct raft { - static constexpr char const* name{"raft"}; - }; +struct raft { + static constexpr char const* name{"raft"}; +}; - } // namespace domain +} // namespace domain /** * @brief Push a named NVTX range. @@ -92,11 +92,11 @@ namespace raft::common::nvtx { * @param format range name format (accepts printf-style arguments) * @param args the arguments for the printf-style formatting */ - template - inline void push_range(const char* format, Args... args) - { - detail::push_range(format, args...); - } +template +inline void push_range(const char* format, Args... args) +{ + detail::push_range(format, args...); +} /** * @brief Pop the latest range. @@ -108,11 +108,11 @@ namespace raft::common::nvtx { * \endcode * NB: make sure to use the same domain for `push_range` and `pop_range`. 
*/ - template - inline void pop_range() - { - detail::pop_range(); - } +template +inline void pop_range() +{ + detail::pop_range(); +} /** * @brief Push a named NVTX range that would be popped at the end of the object lifetime. @@ -125,31 +125,31 @@ namespace raft::common::nvtx { * struct custom_domain { static constexpr char const* name{"custom message"}; } * \endcode */ - template - class range { - public: - /** - * Push a named NVTX range. - * At the end of the object lifetime, pop the range back. - * - * @param format range name format (accepts printf-style arguments) - * @param args the arguments for the printf-style formatting - */ - template - explicit range(const char* format, Args... args) - { - push_range(format, args...); - } +template +class range { + public: + /** + * Push a named NVTX range. + * At the end of the object lifetime, pop the range back. + * + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + explicit range(const char* format, Args... args) + { + push_range(format, args...); + } - ~range() { pop_range(); } + ~range() { pop_range(); } - /* This object is not meant to be touched. */ - range(const range&) = delete; - range(range&&) = delete; - auto operator=(const range&) -> range& = delete; - auto operator=(range&&) -> range& = delete; - static auto operator new(std::size_t) -> void* = delete; - static auto operator new[](std::size_t) -> void* = delete; - }; + /* This object is not meant to be touched. 
*/ + range(const range&) = delete; + range(range&&) = delete; + auto operator=(const range&) -> range& = delete; + auto operator=(range&&) -> range& = delete; + static auto operator new(std::size_t) -> void* = delete; + static auto operator new[](std::size_t) -> void* = delete; +}; } // namespace raft::common::nvtx diff --git a/cpp/include/raft/core/span.hpp b/cpp/include/raft/core/span.hpp index d9177b8a3d..b4fbf5b63a 100644 --- a/cpp/include/raft/core/span.hpp +++ b/cpp/include/raft/core/span.hpp @@ -34,230 +34,230 @@ namespace raft { * auto view = device_span{uvec.data(), uvec.size()}; * @endcode */ - template - class span { - public: - using element_type = T; - using value_type = typename std::remove_cv::type; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using pointer = T*; - using const_pointer = T const*; - using reference = T&; - using const_reference = T const&; - - using iterator = pointer; - using const_iterator = const_pointer; - using reverse_iterator = thrust::reverse_iterator; - using const_reverse_iterator = thrust::reverse_iterator; - - /** - * @brief Default constructor that constructs a span with size 0 and nullptr. - */ - constexpr span() noexcept = default; - - /** - * @brief Constructs a span that is a view over the range [first, first + count); - */ - constexpr span(pointer ptr, size_type count) noexcept : storage_{ptr, count} - { - assert(!(Extent != dynamic_extent && count != Extent)); - assert(ptr || count == 0); - } - /** - * @brief Constructs a span that is a view over the range [first, last) - */ - constexpr span(pointer first, pointer last) noexcept - : span{first, static_cast(thrust::distance(first, last))} - { - } - /** - * @brief Constructs a span that is a view over the array arr. - */ - template - constexpr span(element_type (&arr)[N]) noexcept : span{&arr[0], N} - { - } - - /** - * @brief Initialize a span class from another one who's underlying type is convertible - * to element_type. 
- */ - template ::value && - detail::is_allowed_extent_conversion_t::value>> - constexpr span(const span& other) noexcept - : span{other.data(), other.size()} - { - } - - constexpr span(span const& other) noexcept = default; - constexpr span(span&& other) noexcept = default; - - constexpr auto operator=(span const& other) noexcept -> span& = default; - constexpr auto operator=(span&& other) noexcept -> span& = default; - - constexpr auto begin() const noexcept -> iterator { return data(); } - - constexpr auto end() const noexcept -> iterator { return data() + size(); } - - constexpr auto cbegin() const noexcept -> const_iterator { return data(); } - - constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } - - __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator - { - return reverse_iterator{end()}; - } - - __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator - { - return reverse_iterator{begin()}; - } - - __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator - { - return const_reverse_iterator{cend()}; - } - - __host__ __device__ constexpr auto crend() const noexcept -> const_reverse_iterator - { - return const_reverse_iterator{cbegin()}; - } - - // element access - constexpr auto front() const -> reference { return (*this)[0]; } - - constexpr auto back() const -> reference { return (*this)[size() - 1]; } - - template - constexpr auto operator[](Index _idx) const -> reference - { - assert(static_cast(_idx) < size()); - return data()[_idx]; - } - - constexpr auto data() const noexcept -> pointer { return storage_.data(); } - - // Observers - [[nodiscard]] constexpr auto size() const noexcept -> size_type { return storage_.size(); } - [[nodiscard]] constexpr auto size_bytes() const noexcept -> size_type - { - return size() * sizeof(T); - } - - constexpr auto empty() const noexcept { return size() == 0; } - - // Subviews - template - constexpr auto first() 
const -> span - { - assert(Count <= size()); - return {data(), Count}; - } - - constexpr auto first(std::size_t _count) const -> span - { - assert(_count <= size()); - return {data(), _count}; - } - - template - constexpr auto last() const -> span - { - assert(Count <= size()); - return {data() + size() - Count, Count}; - } - - constexpr auto last(std::size_t _count) const -> span - { - assert(_count <= size()); - return subspan(size() - _count, _count); - } - - /*! - * If Count is std::dynamic_extent, r.size() == this->size() - Offset; - * Otherwise r.size() == Count. - */ - template - constexpr auto subspan() const - -> span::value> - { - assert((Count == dynamic_extent) ? (Offset <= size()) : (Offset + Count <= size())); - return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count}; - } - - constexpr auto subspan(size_type _offset, size_type _count = dynamic_extent) const - -> span - { - assert((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); - return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count}; - } - - private: - detail::span_storage storage_; - }; +template +class span { + public: + using element_type = T; + using value_type = typename std::remove_cv::type; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = thrust::reverse_iterator; + using const_reverse_iterator = thrust::reverse_iterator; + + /** + * @brief Default constructor that constructs a span with size 0 and nullptr. 
+ */ + constexpr span() noexcept = default; + + /** + * @brief Constructs a span that is a view over the range [first, first + count); + */ + constexpr span(pointer ptr, size_type count) noexcept : storage_{ptr, count} + { + assert(!(Extent != dynamic_extent && count != Extent)); + assert(ptr || count == 0); + } + /** + * @brief Constructs a span that is a view over the range [first, last) + */ + constexpr span(pointer first, pointer last) noexcept + : span{first, static_cast(thrust::distance(first, last))} + { + } + /** + * @brief Constructs a span that is a view over the array arr. + */ + template + constexpr span(element_type (&arr)[N]) noexcept : span{&arr[0], N} + { + } + + /** + * @brief Initialize a span class from another one who's underlying type is convertible + * to element_type. + */ + template ::value && + detail::is_allowed_extent_conversion_t::value>> + constexpr span(const span& other) noexcept + : span{other.data(), other.size()} + { + } + + constexpr span(span const& other) noexcept = default; + constexpr span(span&& other) noexcept = default; + + constexpr auto operator=(span const& other) noexcept -> span& = default; + constexpr auto operator=(span&& other) noexcept -> span& = default; + + constexpr auto begin() const noexcept -> iterator { return data(); } + + constexpr auto end() const noexcept -> iterator { return data() + size(); } + + constexpr auto cbegin() const noexcept -> const_iterator { return data(); } + + constexpr auto cend() const noexcept -> const_iterator { return data() + size(); } + + __host__ __device__ constexpr auto rbegin() const noexcept -> reverse_iterator + { + return reverse_iterator{end()}; + } + + __host__ __device__ constexpr auto rend() const noexcept -> reverse_iterator + { + return reverse_iterator{begin()}; + } + + __host__ __device__ constexpr auto crbegin() const noexcept -> const_reverse_iterator + { + return const_reverse_iterator{cend()}; + } + + __host__ __device__ constexpr auto crend() const noexcept -> 
const_reverse_iterator + { + return const_reverse_iterator{cbegin()}; + } + + // element access + constexpr auto front() const -> reference { return (*this)[0]; } + + constexpr auto back() const -> reference { return (*this)[size() - 1]; } + + template + constexpr auto operator[](Index _idx) const -> reference + { + assert(static_cast(_idx) < size()); + return data()[_idx]; + } + + constexpr auto data() const noexcept -> pointer { return storage_.data(); } + + // Observers + [[nodiscard]] constexpr auto size() const noexcept -> size_type { return storage_.size(); } + [[nodiscard]] constexpr auto size_bytes() const noexcept -> size_type + { + return size() * sizeof(T); + } + + constexpr auto empty() const noexcept { return size() == 0; } + + // Subviews + template + constexpr auto first() const -> span + { + assert(Count <= size()); + return {data(), Count}; + } + + constexpr auto first(std::size_t _count) const -> span + { + assert(_count <= size()); + return {data(), _count}; + } + + template + constexpr auto last() const -> span + { + assert(Count <= size()); + return {data() + size() - Count, Count}; + } + + constexpr auto last(std::size_t _count) const -> span + { + assert(_count <= size()); + return subspan(size() - _count, _count); + } + + /*! + * If Count is std::dynamic_extent, r.size() == this->size() - Offset; + * Otherwise r.size() == Count. + */ + template + constexpr auto subspan() const + -> span::value> + { + assert((Count == dynamic_extent) ? (Offset <= size()) : (Offset + Count <= size())); + return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count}; + } + + constexpr auto subspan(size_type _offset, size_type _count = dynamic_extent) const + -> span + { + assert((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); + return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count}; + } + + private: + detail::span_storage storage_; +}; /** * @brief A span class for host pointer. 
*/ - template - using host_span = span; +template +using host_span = span; /** * @brief A span class for device pointer. */ - template - using device_span = span; +template +using device_span = span; - template - constexpr auto operator==(span l, span r) -> bool +template +constexpr auto operator==(span l, span r) -> bool { - if (l.size() != r.size()) { return false; } -for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend(); ++l_beg, ++r_beg) { -if (*l_beg != *r_beg) { return false; } -} -return true; + if (l.size() != r.size()) { return false; } + for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend(); ++l_beg, ++r_beg) { + if (*l_beg != *r_beg) { return false; } + } + return true; } template constexpr auto operator!=(span l, span r) { - return !(l == r); + return !(l == r); } template constexpr auto operator<(span l, span r) { - return detail::lexicographical_compare< - typename span::iterator, - typename span::iterator, - thrust::less::element_type>>( - l.begin(), l.end(), r.begin(), r.end()); + return detail::lexicographical_compare< + typename span::iterator, + typename span::iterator, + thrust::less::element_type>>( + l.begin(), l.end(), r.begin(), r.end()); } template constexpr auto operator<=(span l, span r) { - return !(l > r); + return !(l > r); } template constexpr auto operator>(span l, span r) { - return detail::lexicographical_compare< - typename span::iterator, - typename span::iterator, - thrust::greater::element_type>>( - l.begin(), l.end(), r.begin(), r.end()); + return detail::lexicographical_compare< + typename span::iterator, + typename span::iterator, + thrust::greater::element_type>>( + l.begin(), l.end(), r.begin(), r.end()); } template constexpr auto operator>=(span l, span r) { - return !(l < r); + return !(l < r); } /** @@ -265,9 +265,9 @@ constexpr auto operator>=(span l, span r) */ template auto as_bytes(span s) noexcept --> span::value> + -> span::value> { -return {reinterpret_cast(s.data()), s.size_bytes()}; 
+ return {reinterpret_cast(s.data()), s.size_bytes()}; } /** @@ -275,8 +275,8 @@ return {reinterpret_cast(s.data()), s.size_bytes()}; */ template auto as_writable_bytes(span s) noexcept --> span::value> + -> span::value> { -return {reinterpret_cast(s.data()), s.size_bytes()}; + return {reinterpret_cast(s.data()), s.size_bytes()}; } } // namespace raft diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h index 8250ad4217..f50a59cc3b 100644 --- a/cpp/include/raft/linalg/cublas_macros.h +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -18,7 +18,6 @@ * Please use the cuh version instead. */ - #pragma once #include \ No newline at end of file diff --git a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h index bdf1238f65..d5ff923e1c 100644 --- a/cpp/include/raft/linalg/cusolver_macros.h +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -18,7 +18,6 @@ * Please use the cuh version instead. */ - #pragma once #include \ No newline at end of file From 0c3250d6ecdf53e02aeea59fe358d4cb2e796801 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 24 Mar 2022 16:10:38 -0400 Subject: [PATCH 150/167] Updating style --- cpp/include/raft/core/handle.hpp | 2 +- cpp/include/raft/core/nvtx.hpp | 2 +- cpp/include/raft/handle.hpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 158816f762..ac4bb489c7 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -40,7 +40,7 @@ ///@todo: enable once we have migrated cuml-comms layer too //#include -#include "cudart_utils.h" +#include #include #include diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp index d5c8ce16b5..304432017b 100644 --- a/cpp/include/raft/core/nvtx.hpp +++ b/cpp/include/raft/core/nvtx.hpp @@ -16,7 +16,7 @@ #pragma once -#include "detail/nvtx.hpp" +#include #include /** diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index da80958ab0..6bd3b02f1a 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -20,4 +20,5 @@ */ #pragma once + #include \ No newline at end of file From 0610ea1e50ae5916da72b05abed1f02179b98c7d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 16:26:34 -0400 Subject: [PATCH 151/167] more style updates --- BUILD.md | 9 ++++++++- cpp/include/raft/core/nvtx.hpp | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/BUILD.md b/BUILD.md index 8e5e617131..51e1adcda2 100644 --- a/BUILD.md +++ b/BUILD.md @@ -74,7 +74,14 @@ Add the `--install` flag to the above example to also install the shared librari ### Tests -Compile the tests using the `tests` target in `build.sh`. By default, the shared libraries are assumed to be already built and on the library path. Add `--compile-libs` to also compile them. +Compile the tests using the `tests` target in `build.sh`. + +```bash +./build.sh libraft tests +``` + +Test compile times can be improved significantly by using the optional shared libraries. 
If installed, they will be used automatically when building the tests but `--compile-libs` can be used to add additional compilation units and compile them with the tests. + ```bash ./build.sh libraft tests --compile-libs ``` diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp index 304432017b..eb536b0e01 100644 --- a/cpp/include/raft/core/nvtx.hpp +++ b/cpp/include/raft/core/nvtx.hpp @@ -16,8 +16,8 @@ #pragma once -#include #include +#include /** * \section Usage From 44387ecc279da1c354b40fb620fd099124267e68 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 19:08:24 -0400 Subject: [PATCH 152/167] Fixing deprecation warnings --- cpp/include/raft/core/cublas_macros.hpp | 5 ----- cpp/include/raft/core/cudart_utils.hpp | 5 ----- cpp/include/raft/core/cusolver_macros.hpp | 5 ----- cpp/include/raft/core/error.hpp | 5 ----- cpp/include/raft/core/handle.hpp | 5 ----- cpp/include/raft/core/interruptible.hpp | 5 ----- cpp/include/raft/mdarray.hpp | 2 +- cpp/include/raft/span.hpp | 2 +- 8 files changed, 2 insertions(+), 32 deletions(-) diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp index 0281c5c667..f5de57677d 100644 --- a/cpp/include/raft/core/cublas_macros.hpp +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cublas_macros.hpp instead. - */ - #ifndef __RAFT_RT_CUBLAS_MACROS_H #define __RAFT_RT_CUBLAS_MACROS_H diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index 8e2eeff1ed..95630cd314 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. 
- */ - #ifndef __RAFT_RT_CUDART_UTILS_H #define __RAFT_RT_CUDART_UTILS_H diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp index df27f7ce26..c4081db9de 100644 --- a/cpp/include/raft/core/cusolver_macros.hpp +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cusolver_macros.hpp instead. - */ - #ifndef __RAFT_RT_CUSOLVER_MACROS_H #define __RAFT_RT_CUSOLVER_MACROS_H diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 5e1aa3af28..a65b9a8469 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/error.hpp instead. - */ - #ifndef __RAFT_RT_ERROR #define __RAFT_RT_ERROR diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index ac4bb489c7..1863a4f037 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/handle.hpp instead. - */ - #ifndef __RAFT_RT_HANDLE #define __RAFT_RT_HANDLE diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp index 6764065363..55d272739f 100644 --- a/cpp/include/raft/core/interruptible.hpp +++ b/cpp/include/raft/core/interruptible.hpp @@ -14,11 +14,6 @@ * limitations under the License. */ -/** - * @warning This file is deprecated and will be removed in release 22.06. - * Please use the include/raft_runtime/interruptible.hpp instead. 
- */ - #ifndef __RAFT_RT_INTERRUPTIBLE_H #define __RAFT_RT_INTERRUPTIBLE_H diff --git a/cpp/include/raft/mdarray.hpp b/cpp/include/raft/mdarray.hpp index 30aed9b4e7..7da8b0644f 100644 --- a/cpp/include/raft/mdarray.hpp +++ b/cpp/include/raft/mdarray.hpp @@ -16,7 +16,7 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. + * Please use include/core/mdarray.hpp instead. */ #pragma once diff --git a/cpp/include/raft/span.hpp b/cpp/include/raft/span.hpp index ba7b393ca7..5462f45618 100644 --- a/cpp/include/raft/span.hpp +++ b/cpp/include/raft/span.hpp @@ -16,7 +16,7 @@ /** * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. + * Please use include/core/span.hpp instead. */ #pragma once From 449ee379140d400c09a45639f7978a7972078650 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 22:58:02 -0400 Subject: [PATCH 153/167] Fixing style --- cpp/include/raft/error.hpp | 122 ++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index af41005cb6..30ab862d41 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -36,70 +36,70 @@ #include #include -namespace raft { - -/** base exception class for the whole of raft */ -class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - 
std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { -#ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; - void* stack[kMaxStackDepth]; // NOLINT - auto depth = backtrace(stack, kMaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" << std::endl; - msg_ += oss.str(); - return; +namespace raft +{ + /** base exception class for the whole of raft */ + class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { + collect_call_stack(); } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; + + /** ctor from an input message */ + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { + collect_call_stack(); } - free(strings); - msg_ += oss.str(); -#endif // __GNUC__ - } -}; -/** - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS and RAFT_FAIL macros. 
- * - */ -struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} -}; + /** get the message associated with this exception */ + char const* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept + { +#ifdef __GNUC__ + constexpr int kMaxStackDepth = 64; + void* stack[kMaxStackDepth]; // NOLINT + auto depth = backtrace(stack, kMaxStackDepth); + std::ostringstream oss; + oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; + char** strings = backtrace_symbols(stack, depth); + if (strings == nullptr) { + oss << "But no stack trace could be found!" << std::endl; + msg_ += oss.str(); + return; + } + ///@todo: support for demangling of C++ symbol names + for (int i = 0; i < depth; ++i) { + oss << "#" << i << " in " << strings[i] << std::endl; + } + free(strings); + msg_ += oss.str(); +#endif // __GNUC__ + } + }; + + /** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * RAFT_EXPECTS and RAFT_FAIL macros. + * + */ + struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} + }; } // namespace raft From 9183adf290b6e93df4fe044db31e5698c1ce595c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 24 Mar 2022 23:01:20 -0400 Subject: [PATCH 154/167] Including only core elements --- cpp/include/raft/core/handle.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 1863a4f037..08cb812bb7 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -37,11 +37,11 @@ #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include From dc9b9ddda1cf459a12d803cf8ddf8c5a365faeb5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 24 Mar 2022 23:03:07 -0400 Subject: [PATCH 155/167] Fixing bad merge --- cpp/include/raft/error.hpp | 165 +------------------------------------ 1 file changed, 1 insertion(+), 164 deletions(-) diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 30ab862d41..e109739781 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -16,171 +16,8 @@ /** * This file is deprecated and will be removed in release 22.06. -<<<<<<< HEAD * Please use the include/core/error.hpp instead. -======= - * Please use the include/raft_runtime/error.hpp instead. 
->>>>>>> rapidsai/branch-22.04 */ #pragma once -<<<<<<< HEAD -#include -======= - -#include -#include -#include -#include -#include -#include -#include - -namespace raft -{ - /** base exception class for the whole of raft */ - class exception : public std::exception { - public: - /** default ctor */ - explicit exception() noexcept : std::exception(), msg_() {} - - /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { - collect_call_stack(); - } - - /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { - collect_call_stack(); - } - - /** get the message associated with this exception */ - char const* what() const noexcept override { return msg_.c_str(); } - - private: - /** message associated with this exception */ - std::string msg_; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { -#ifdef __GNUC__ - constexpr int kMaxStackDepth = 64; - void* stack[kMaxStackDepth]; // NOLINT - auto depth = backtrace(stack, kMaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" << std::endl; - msg_ += oss.str(); - return; - } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; - } - free(strings); - msg_ += oss.str(); -#endif // __GNUC__ - } - }; - - /** - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * RAFT_EXPECTS and RAFT_FAIL macros. 
- * - */ - struct logic_error : public raft::exception { - explicit logic_error(char const* const message) : raft::exception(message) {} - explicit logic_error(std::string const& message) : raft::exception(message) {} - }; - -} // namespace raft - -// FIXME: Need to be replaced with RAFT_FAIL -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - int size1 = \ - std::snprintf(nullptr, 0, "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), \ - size1 + 1 /* +1 for '\0' */, \ - "exception occured! file=%s line=%d: ", \ - __FILE__, \ - __LINE__); \ - std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - throw raft::exception(msg); \ - } while (0) - -// FIXME: Need to be replaced with RAFT_EXPECTS -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - -/** - * Macro to append error message to first argument. - * This should only be called in contexts where it is OK to throw exceptions! - */ -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) 
\ - do { \ - int size1 = std::snprintf(nullptr, 0, "%s", location_prefix); \ - int size2 = std::snprintf(nullptr, 0, "file=%s line=%d: ", __FILE__, __LINE__); \ - int size3 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0 || size3 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + size3 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), size1 + 1 /* +1 for '\0' */, "%s", location_prefix); \ - std::snprintf( \ - buf.get() + size1, size2 + 1 /* +1 for '\0' */, "file=%s line=%d: ", __FILE__, __LINE__); \ - std::snprintf(buf.get() + size1 + size2, size3 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - msg += std::string(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - } while (0) - -/** - * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false - * - * @param[in] cond Expression that evaluates to true or false - * @param[in] fmt String literal description of the reason that cond is expected to be true with - * optinal format tagas - * @throw raft::logic_error if the condition evaluates to false. - */ -#define RAFT_EXPECTS(cond, fmt, ...) \ - do { \ - if (!(cond)) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } \ - } while (0) - -/** - * @brief Indicates that an erroneous code path has been taken. - * - * @param[in] fmt String literal description of the reason that this code path is erroneous with - * optinal format tagas - * @throw always throws raft::logic_error - */ -#define RAFT_FAIL(fmt, ...) 
\ - do { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, "RAFT failure at ", fmt, ##__VA_ARGS__); \ - throw raft::logic_error(msg); \ - } while (0) - -#endif ->>>>>>> rapidsai/branch-22.04 +#include \ No newline at end of file From f5d5c53dba3a3bb859ada03df17abcf894cf0c0a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 25 Mar 2022 10:04:18 -0400 Subject: [PATCH 156/167] Fixing macro and wrappers --- cpp/include/raft/core/cusolver_macros.hpp | 2 + .../raft/linalg/detail/cublas_wrappers.hpp | 97 +------------------ .../raft/linalg/detail/cusolver_wrappers.hpp | 93 +----------------- 3 files changed, 5 insertions(+), 187 deletions(-) diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp index c4081db9de..b41927f5fb 100644 --- a/cpp/include/raft/core/cusolver_macros.hpp +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -40,6 +40,7 @@ struct cusolver_error : public raft::exception { }; namespace linalg { +namespace detail { inline const char* cusolver_error_to_string(cusolverStatus_t err) { @@ -58,6 +59,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) }; } +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 7f9abc324e..a55e1d6d7c 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -17,104 +17,11 @@ #pragma once #include +#include #include -#include -///@todo: enable this once we have logger enabled -//#include - #include - -#define _CUBLAS_ERR_TO_STR(err) \ - case err: return #err - -namespace raft { - -/** - * @brief Exception thrown when a cuBLAS error is encountered. 
- */ -struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cublas_error_to_string(cublasStatus_t err) -{ - switch (err) { - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg -} // namespace raft - -#undef _CUBLAS_ERR_TO_STR - -/** - * @brief Error checking macro for cuBLAS runtime API functions. - * - * Invokes a cuBLAS runtime API function call, if the call does not return - * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred - */ -#define RAFT_CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUBLAS_TRY -#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUBLAS_TRY_NO_THROW(call) \ - do { \ - cublasStatus_t const status = call; \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - } \ - } while (0) - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK -#define CUBLAS_CHECK(call) CUBLAS_TRY(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUBLAS_CHECK_NO_THROW -#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) -#endif +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 34ec6cb673..e7da615748 100644 --- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -18,101 +18,10 @@ #include #include -///@todo: enable this once logging is enabled -//#include +#include #include #include -#define _CUSOLVER_ERR_TO_STR(err) \ - case err: return #err; - -namespace raft { - -/** - * @brief Exception thrown when a cuSOLVER error is encountered. 
- */ -struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} -}; - -namespace linalg { -namespace detail { - -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ - switch (err) { - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); - _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; - }; -} - -} // namespace detail -} // namespace linalg -} // namespace raft - -#undef _CUSOLVER_ERR_TO_STR - -/** - * @brief Error checking macro for cuSOLVER runtime API functions. - * - * Invokes a cuSOLVER runtime API function call, if the call does not return - * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred - */ -#define RAFT_CUSOLVER_TRY(call) \ - do { \ - cusolverStatus_t const status = (call); \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - throw raft::cusolver_error(msg); \ - } \ - } while (0) - -// FIXME: remove after consumer rename -#ifndef CUSOLVER_TRY -#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. 
-// */ -#define RAFT_CUSOLVER_TRY_NO_THROW(call) \ - do { \ - cusolverStatus_t const status = call; \ - if (CUSOLVER_STATUS_SUCCESS != status) { \ - printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - raft::linalg::detail::cusolver_error_to_string(status)); \ - } \ - } while (0) - -// FIXME: remove after cuml rename -#ifndef CUSOLVER_CHECK -#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call) -#endif - -#ifndef CUSOLVER_CHECK_NO_THROW -#define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call) -#endif - namespace raft { namespace linalg { namespace detail { From 934f85e1a214b44cf7364db4442c2fb90bbb84eb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 31 Mar 2022 14:03:10 -0400 Subject: [PATCH 157/167] Updates based on review feedback --- BUILD.md | 1 - build.sh | 5 ++-- cpp/CMakeLists.txt | 8 +++--- cpp/cmake/thirdparty/get_cuco.cmake | 30 +++++++++++------------ cpp/cmake/thirdparty/get_libcudacxx.cmake | 13 +++++----- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/BUILD.md b/BUILD.md index 51e1adcda2..5a28fe617a 100644 --- a/BUILD.md +++ b/BUILD.md @@ -127,7 +127,6 @@ RAFT's cmake has the following configurable flags available:. | RAFT_COMPILE_NN_LIBRARY | ON, OFF | OFF | Compiles the `libraft-nn` shared library | | RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. Needed for `raft::spatial::knn` | -| RAFT_ENABLE_cuco_DEPENDENCY | ON, OFF | OFF | Enables the cuCollections dependency used by `raft::sparse::distance`. This is turned on automatically when enabling the distance component | | RAFT_ENABLE_thrust_DEPENDENCY | ON, OFF | ON | Enables the Thrust dependency. This can be disabled when using many simple utilities or to override with a different Thrust version. 
| | RAFT_ENABLE_mdspan_DEPENDENCY | ON, OFF | ON | Enables the std::mdspan dependency. This can be disabled when using many simple utilities. | | RAFT_ENABLE_nccl_DEPENDENCY | ON, OFF | OFF | Enables NCCL dependency used by `raft::comms` and needed to build `pyraft` | diff --git a/build.sh b/build.sh index 14f1e42a88..688a58d276 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss --no-thrust" +VALIDARGS="clean libraft pyraft pylibraft docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss --minimal-deps" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -36,7 +36,8 @@ HELP="$0 [ ...] [ ...] --compile-libs - compile shared libraries for all components --compile-nn - compile shared library for nn component --compile-dist - compile shared library for distance component - --no-thrust - disable thrust dependency. can be useful for header-only install + --minimal-deps - disables dependencies like thrust so they can be overridden. 
+ can be useful for a pure header-only install --allgpuarch - build for all supported GPU architectures --buildfaiss - build faiss statically into raft --install - install cmake targets diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 23862aaac1..f3ba4baaa6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -66,12 +66,8 @@ option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared li option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) -option(RAFT_ENABLE_cuco_DEPENDENCY "Enable cuCollections dependency" OFF) option(RAFT_ENABLE_mdspan_DEPENDENCY "Enable mdspan dependency" ON) option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) -if(distance IN_LIST raft_FIND_COMPONENTS OR RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) - set(RAFT_ENABLE_cuco_DEPENDENCY ON) -endif() if(BUILD_TESTS AND NOT RAFT_ENABLE_thrust_DEPENDENCY) message(VERBOSE "RAFT: BUILD_TESTS is enabled, overriding RAFT_ENABLE_thrust_DEPENDENCY") @@ -136,6 +132,10 @@ include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## # - Requirements ------------------------------------------------------------- +if(distance IN_LIST raft_FIND_COMPONENTS OR RAFT_COMPILE_LIBRARIES OR RAFT_COMPILE_DIST_LIBRARY) + set(RAFT_ENABLE_cuco_DEPENDENCY ON) +endif() + # add third party dependencies using CPM rapids_cpm_init() diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 81172a2d8e..7b4ac98ea4 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,21 +16,21 @@ function(find_and_configure_cuco VERSION) - if(RAFT_ENABLE_cuco_DEPENDENCY) - rapids_cpm_find(cuco ${VERSION} - GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-distance-exports - INSTALL_EXPORT_SET 
raft-distance-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - ) - endif() + rapids_cpm_find(cuco ${VERSION} + GLOBAL_TARGETS cuco::cuco + BUILD_EXPORT_SET raft-distance-exports + INSTALL_EXPORT_SET raft-distance-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git + GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + ) endfunction() -# cuCollections doesn't have a version yet -find_and_configure_cuco(0.0) +if(RAFT_ENABLE_cuco_DEPENDENCY) + # cuCollections doesn't have a version yet + find_and_configure_cuco(0.0) +endif() \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index f2c4d0e700..92d8e57de9 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -15,13 +15,12 @@ # This function finds libcudacxx and sets any additional necessary environment variables. function(find_and_configure_libcudacxx) - if(RAFT_ENABLE_cuco_DEPENDENCY) - include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - - rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) - endif() + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports) endfunction() -find_and_configure_libcudacxx() +if(RAFT_ENABLE_cuco_DEPENDENCY) + find_and_configure_libcudacxx() +endif() \ No newline at end of file From 1a7295fd2665a9bf6146ab3fa43f174fe19a32f3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 31 Mar 2022 14:06:31 -0400 Subject: [PATCH 158/167] fixing bad merge --- conda/recipes/libraft_headers/build.sh | 1 + cpp/cmake/thirdparty/get_cuco.cmake | 21 ++------------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/conda/recipes/libraft_headers/build.sh b/conda/recipes/libraft_headers/build.sh index f239e545ef..a5b44eaba5 100644 --- a/conda/recipes/libraft_headers/build.sh +++ b/conda/recipes/libraft_headers/build.sh @@ -2,3 +2,4 @@ # Copyright (c) 2022, NVIDIA CORPORATION. ./build.sh libraft --install -v --allgpuarch +`` \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index adc21d7080..09b08d984d 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,37 +16,20 @@ function(find_and_configure_cuco VERSION) -<<<<<<< HEAD rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-distance-exports INSTALL_EXPORT_SET raft-distance-exports CPM_ARGS GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) -======= - if(RAFT_ENABLE_cuco_DEPENDENCY) - rapids_cpm_find(cuco ${VERSION} - GLOBAL_TARGETS cuco::cuco - BUILD_EXPORT_SET raft-distance-exports - INSTALL_EXPORT_SET raft-distance-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git - GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - ) - endif() ->>>>>>> f13aceb1ef5779863f6607d73eed58a5fb6a07ec - endfunction() if(RAFT_ENABLE_cuco_DEPENDENCY) # cuCollections doesn't have a version yet - find_and_configure_cuco(0.0) + find_and_configure_cuco(0.0.1) endif() \ No newline at end of file From 84d6701b158513c65a4d9922adc6d8531cd42baa Mon Sep 17 00:00:00 2001 
From: "Corey J. Nolet" Date: Thu, 31 Mar 2022 14:30:46 -0400 Subject: [PATCH 159/167] Enabling NN dependencies when building tests --- cpp/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 63772fe170..2c336dd96f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -65,6 +65,9 @@ option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiation option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" OFF) option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) +if(BUILD_TESTS) + set(RAFT_ENABLE_NN_DEPENDENCIES ON) +endif() option(RAFT_ENABLE_mdspan_DEPENDENCY "Enable mdspan dependency" ON) option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) From 98d28c1ff7926327e1f0acf5d87a93e47437f4be Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 31 Mar 2022 14:35:44 -0400 Subject: [PATCH 160/167] making compile_libraries default to `BUILD_TESTS` --- cpp/CMakeLists.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2c336dd96f..8b06c81099 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -57,17 +57,14 @@ option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF) option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) -option(DISABLE_DEPRECATION_WARNINGS "Disable depreaction warnings " ON) +option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(NVTX "Enable nvtx markers" OFF) -option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" OFF) +option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ${BUILD_TESTS}) option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" OFF) option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" OFF) option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) -if(BUILD_TESTS) - set(RAFT_ENABLE_NN_DEPENDENCIES ON) -endif() option(RAFT_ENABLE_mdspan_DEPENDENCY "Enable mdspan dependency" ON) option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) From 24f35899e185b8df99398d406692a79b20804e84 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 31 Mar 2022 14:36:42 -0400 Subject: [PATCH 161/167] updating build.md --- BUILD.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BUILD.md b/BUILD.md index 5a28fe617a..c4d8b1b356 100644 --- a/BUILD.md +++ b/BUILD.md @@ -53,9 +53,9 @@ The following example will download the needed dependencies and install the RAFT ./build.sh libraft --install ``` -The `--no-thrust` flag can be used to install the headers with minimal dependencies: +The `--minimal-deps` flag can be used to install the headers with minimal dependencies: ```bash -./build.sh libraft --install --no-thrust +./build.sh libraft --install --minimal-deps ``` ### C++ Shared Libraries (optional) From 387d3747a5577ee98d2927bf5f36f7cde2303765 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 31 Mar 2022 16:02:50 -0400 Subject: [PATCH 162/167] Updating kselection. Apparently --- cpp/include/raft/spatial/knn/detail/selection_faiss.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 2d2fabd9d6..d157a57f52 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -172,4 +172,4 @@ inline void select_k(key_t* inK, }; // namespace detail }; // namespace knn }; // namespace spatial -}; // namespace raft +}; // namespace raft \ No newline at end of file From b44c88ed0e724cef85be9cb7cb5dae071e14eaea Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 31 Mar 2022 16:21:29 -0400 Subject: [PATCH 163/167] Updates --- README.md | 2 +- cpp/CMakeLists.txt | 2 +- cpp/include/raft/core/cudart_utils.hpp | 26 ++++++++++++++++++++++---- docs/source/cuda_cpp.rst | 2 +- docs/source/index.rst | 2 +- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f73d474efc..c359a79e39 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #
 RAFT: Reusable Accelerated Functions and Tools
-RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. +RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8b06c81099..12167d6771 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,7 @@ set(doc_string Provide targets for the RAFT: Reusable Accelerated Functions and Tools RAFT contains fundamental widely-used algorithms and primitives -for data science, graph, and ml. +for data science and machine learning. Optional Components: - nn diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index 95630cd314..ff8305d1f2 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -14,6 +14,11 @@ * limitations under the License. */ +/** + * This file is deprecated and will be removed in release 22.06. + * Please use raft_runtime/cudart_utils.hpp instead. 
+ */ + #ifndef __RAFT_RT_CUDART_UTILS_H #define __RAFT_RT_CUDART_UTILS_H @@ -154,7 +159,6 @@ class grid_1d_thread_t { * @param num_threads_per_block The grid block size, determined according to the kernel's * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param max_num_blocks_1d maximum number of 1d blocks * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain */ @@ -187,7 +191,6 @@ class grid_1d_warp_t { /** * @param overall_num_elements The number of elements the kernel needs to handle/process * @param num_threads_per_block The grid block size, determined according to the kernel's - * @param max_num_blocks_1d maximum number of 1d blocks * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ @@ -217,7 +220,6 @@ class grid_1d_block_t { /** * @param overall_num_elements The number of elements the kernel needs to handle/process * @param num_threads_per_block The grid block size, determined according to the kernel's - * @param max_num_blocks_1d maximum number of 1d blocks * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ @@ -402,6 +404,22 @@ IntType gcd(IntType a, IntType b) return a; } +template +constexpr T lower_bound() +{ + if constexpr (std::numeric_limits::has_infinity && std::numeric_limits::is_signed) { + return -std::numeric_limits::infinity(); + } + return std::numeric_limits::lowest(); +} + +template +constexpr T upper_bound() +{ + if constexpr (std::numeric_limits::has_infinity) { return std::numeric_limits::infinity(); } + return 
std::numeric_limits::max(); +} + } // namespace raft -#endif +#endif \ No newline at end of file diff --git a/docs/source/cuda_cpp.rst b/docs/source/cuda_cpp.rst index 30e8903f29..27ec7f5545 100644 --- a/docs/source/cuda_cpp.rst +++ b/docs/source/cuda_cpp.rst @@ -1,7 +1,7 @@ CUDA/C++ API ============ -RAFT is header-only but provides optional shared libraries to speed up compile times for larger projects. +RAFT is a header-only C++ with optional shared libraries that can speed up compile times for larger projects. .. _api: diff --git a/docs/source/index.rst b/docs/source/index.rst index d047543c13..97c616dd8e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,7 +1,7 @@ Welcome to RAFT's documentation! ================================= -RAFT contains fundamental widely-used algorithms and primitives for data science, graph and machine learning. +RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. .. toctree:: :maxdepth: 2 From 11d4362d762550076a6450387be18b0a033f33f5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 1 Apr 2022 18:46:47 -0400 Subject: [PATCH 164/167] Updates based on review feedback --- conda/recipes/libraft_distance/meta.yaml | 1 - conda/recipes/libraft_headers/meta.yaml | 1 - conda/recipes/libraft_nn/meta.yaml | 1 - cpp/CMakeLists.txt | 18 +++++++++++------- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/conda/recipes/libraft_distance/meta.yaml b/conda/recipes/libraft_distance/meta.yaml index a2eaab8854..a0891ffb6c 100644 --- a/conda/recipes/libraft_distance/meta.yaml +++ b/conda/recipes/libraft_distance/meta.yaml @@ -44,7 +44,6 @@ requirements: - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu - gtest=1.10.0 - - gmock - librmm {{ minor_version }} run: - libraft-headers {{ version }} diff --git a/conda/recipes/libraft_headers/meta.yaml b/conda/recipes/libraft_headers/meta.yaml index aaabfce18c..a053e8eacd 100644 --- a/conda/recipes/libraft_headers/meta.yaml +++ b/conda/recipes/libraft_headers/meta.yaml @@ -43,7 +43,6 @@ requirements: - ucx-py {{ ucx_py_version }} - ucx-proc=*=gpu - gtest=1.10.0 - - gmock - librmm {{ minor_version}} - libcusolver>=11.2.1 run: diff --git a/conda/recipes/libraft_nn/meta.yaml b/conda/recipes/libraft_nn/meta.yaml index 0c31a67d66..c3b3cb3695 100644 --- a/conda/recipes/libraft_nn/meta.yaml +++ b/conda/recipes/libraft_nn/meta.yaml @@ -44,7 +44,6 @@ requirements: - faiss-proc=*=cuda - libfaiss 1.7.0 *_cuda - gtest=1.10.0 - - gmock - librmm {{ minor_version }} run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12167d6771..ab52b766e2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -382,13 +382,18 @@ Imported Targets: ]=]) -set(code_string -[=[ - -if(RAFT_ENABLE_thrust_DEPENDENCY AND NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) +set(code_string ) +if(RAFT_ENABLE_thrust_DEPENDENCY) + string(APPEND code_string + [=[ + if(NOT TARGET raft::Thrust) + thrust_create_target(raft::Thrust 
FROM_OPTIONS) + endif() + ]=]) endif() +string(APPEND code_string +[=[ if(distance IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -402,8 +407,7 @@ if(nn IN_LIST raft_FIND_COMPONENTS) add_library(faiss ALIAS faiss::faiss) endif() endif() -]=] - ) +]=]) # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) From 4da9084c13c3e9509eb104342672ccbf83e3d4e6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 1 Apr 2022 21:29:33 -0400 Subject: [PATCH 165/167] Fixing doxygen errors --- cpp/include/raft/core/cudart_utils.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index ff8305d1f2..5adc0227a8 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -159,6 +159,7 @@ class grid_1d_thread_t { * @param num_threads_per_block The grid block size, determined according to the kernel's * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of blocks in 1d grid * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain */ @@ -193,6 +194,7 @@ class grid_1d_warp_t { * @param num_threads_per_block The grid block size, determined according to the kernel's * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of blocks in 1d grid */ grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, @@ -222,6 +224,7 @@ class grid_1d_block_t { * @param num_threads_per_block The grid block size, determined according to the kernel's * 
specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param max_num_blocks_1d maximum number of blocks in 1d grid */ grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, From d00b35082dbf03ac85b2d1c974f6a1d7418f5dab Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 7 Apr 2022 21:39:27 -0400 Subject: [PATCH 166/167] Review feedback --- build.sh | 2 +- conda/recipes/libraft_headers/build.sh | 3 +-- cpp/cmake/thirdparty/get_mdspan.cmake | 6 +++--- cpp/cmake/thirdparty/get_thrust.cmake | 6 +++--- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/build.sh b/build.sh index 688a58d276..568de2956d 100755 --- a/build.sh +++ b/build.sh @@ -111,7 +111,7 @@ if hasArg --install; then INSTALL_TARGET="install" fi -if hasArg --no-thrust; then +if hasArg --minimal-deps; then ENABLE_thrust_DEPENDENCY=OFF fi diff --git a/conda/recipes/libraft_headers/build.sh b/conda/recipes/libraft_headers/build.sh index a5b44eaba5..d351b27577 100644 --- a/conda/recipes/libraft_headers/build.sh +++ b/conda/recipes/libraft_headers/build.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. 
-./build.sh libraft --install -v --allgpuarch -`` \ No newline at end of file +./build.sh libraft --install -v --allgpuarch \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_mdspan.cmake b/cpp/cmake/thirdparty/get_mdspan.cmake index cc99de9f2c..5af3c4f31e 100644 --- a/cpp/cmake/thirdparty/get_mdspan.cmake +++ b/cpp/cmake/thirdparty/get_mdspan.cmake @@ -13,7 +13,6 @@ # ============================================================================= function(find_and_configure_mdspan VERSION) - if(RAFT_ENABLE_mdspan_DEPENDENCY) rapids_cpm_find( mdspan ${VERSION} GLOBAL_TARGETS std::mdspan @@ -25,7 +24,8 @@ function(find_and_configure_mdspan VERSION) OPTIONS "MDSPAN_ENABLE_CUDA ON" "MDSPAN_CXX_STANDARD ON" ) - endif() endfunction() -find_and_configure_mdspan(0.2.0) +if(RAFT_ENABLE_mdspan_DEPENDENCY) + find_and_configure_mdspan(0.2.0) +endif() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index c533b04256..12360b9482 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -14,13 +14,13 @@ # Use CPM to find or clone thrust function(find_and_configure_thrust) - if(RAFT_ENABLE_thrust_DEPENDENCY) include(${rapids-cmake-dir}/cpm/thrust.cmake) rapids_cpm_thrust( NAMESPACE raft BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) - endif() endfunction() -find_and_configure_thrust() +if(RAFT_ENABLE_thrust_DEPENDENCY) + find_and_configure_thrust() +endif() From 184a2202533445a53203876ccf58f6dd49e9c4fa Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 19 Apr 2022 12:57:54 -0400 Subject: [PATCH 167/167] Review feedback --- cpp/include/raft/common/logger.hpp | 6 ++++++ cpp/include/raft/common/nvtx.hpp | 5 +++++ docs/source/cuda_cpp.rst | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/common/logger.hpp b/cpp/include/raft/common/logger.hpp index 0a4c7044bc..77483e577d 100644 --- a/cpp/include/raft/common/logger.hpp +++ b/cpp/include/raft/common/logger.hpp @@ -13,6 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * This file is deprecated and will be removed in release 22.08. + * Please use raft/core/logger.hpp instead. + */ + #pragma once #include \ No newline at end of file diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp index f5c7527580..385bc544b0 100644 --- a/cpp/include/raft/common/nvtx.hpp +++ b/cpp/include/raft/common/nvtx.hpp @@ -14,6 +14,11 @@ * limitations under the License. */ +/** + * This file is deprecated and will be removed in release 22.08. + * Please use raft/core/nvtx.hpp instead. + */ + #pragma once #include \ No newline at end of file diff --git a/docs/source/cuda_cpp.rst b/docs/source/cuda_cpp.rst index 27ec7f5545..7a7cdae086 100644 --- a/docs/source/cuda_cpp.rst +++ b/docs/source/cuda_cpp.rst @@ -1,7 +1,7 @@ CUDA/C++ API ============ -RAFT is a header-only C++ with optional shared libraries that can speed up compile times for larger projects. +RAFT is a header-only C++ library with optional pre-compiled shared libraries that can speed up compile times for larger projects. .. _api: