diff --git a/README.md b/README.md
index ddaf8b3f8d..04dd2ff16d 100755
--- a/README.md
+++ b/README.md
@@ -77,11 +77,73 @@ auto metric = raft::distance::DistanceType::L2SqrtExpanded;
 raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric);
 ```
 
+It's also possible to create `raft::device_mdspan` views to invoke the same API with raw pointers and shape information:
+
+```c++
+#include <raft/core/handle.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/random/make_blobs.cuh>
+#include <raft/distance/distance.cuh>
+
+raft::handle_t handle;
+
+int n_samples = 5000;
+int n_features = 50;
+
+float *input;
+int *labels;
+float *output;
+
+...
+// Allocate input, labels, and output pointers
+...
+
+auto input_view = raft::make_device_matrix_view(input, n_samples, n_features);
+auto labels_view = raft::make_device_vector_view(labels, n_samples);
+auto output_view = raft::make_device_matrix_view(output, n_samples, n_samples);
+
+raft::random::make_blobs(handle, input_view, labels_view);
+
+auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+raft::distance::pairwise_distance(handle, input_view, input_view, output_view, metric);
+```
+
+
 ### Python Example
 
 The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The number of RAFT algorithms exposed in this package is continuing to grow from release to release.
 
-The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. Note that CuPy is not a required dependency for `pylibraft`.
+The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. Note that CuPy is not a required dependency for `pylibraft`.
+
+```python
+import cupy as cp
+
+from pylibraft.distance import pairwise_distance
+
+n_samples = 5000
+n_features = 50
+
+in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
+in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
+
+output = pairwise_distance(in1, in2, metric="euclidean")
+```
+
+The `output` array supports [`__cuda_array_interface__`](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2) so it is interoperable with other libraries like CuPy, Numba, and PyTorch that also support it.
+
+Below is an example of converting the output `pylibraft.device_ndarray` to a CuPy array:
+```python
+cupy_array = cp.asarray(output)
+```
+
+And converting to a PyTorch tensor:
+```python
+import torch
+
+torch_tensor = torch.as_tensor(output, device='cuda')
+```
+
+`pylibraft` also supports writing to a pre-allocated output array so any `__cuda_array_interface__` supported array can be written to in-place:
 
 ```python
 import cupy as cp
@@ -95,9 +157,10 @@ in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
 in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
 output = cp.empty((n_samples, n_samples), dtype=cp.float32)
 
-pairwise_distance(in1, in2, output, metric="euclidean")
+pairwise_distance(in1, in2, out=output, metric="euclidean")
 ```
 
+
 ## Installing
 
 RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for more a comprehensive guide on building RAFT and using it in downstream projects.
diff --git a/python/pylibraft/pylibraft/common/__init__.py b/python/pylibraft/pylibraft/common/__init__.py
index 7872599a78..4c6f0d686a 100644
--- a/python/pylibraft/pylibraft/common/__init__.py
+++ b/python/pylibraft/pylibraft/common/__init__.py
@@ -13,5 +13,7 @@
 # limitations under the License.
 #
 
+
 from .cuda import Stream
+from .device_ndarray import device_ndarray
 from .handle import Handle
diff --git a/python/pylibraft/pylibraft/common/device_ndarray.py b/python/pylibraft/pylibraft/common/device_ndarray.py
new file mode 100644
index 0000000000..eebbca2f06
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/device_ndarray.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+
+import rmm
+
+
+class device_ndarray:
+    """
+    pylibraft.common.device_ndarray is meant to be a very lightweight
+    __cuda_array_interface__ wrapper around a numpy.ndarray.
+    """
+
+    def __init__(self, np_ndarray):
+        """
+        Construct a pylibraft.common.device_ndarray wrapper around a
+        numpy.ndarray
+
+        Parameters
+        ----------
+        np_ndarray : A numpy.ndarray whose contents are copied to the device
+
+        Examples
+        --------
+        The device_ndarray is __cuda_array_interface__ compliant so it is
+        interoperable with other libraries that also support it, such as
+        CuPy and PyTorch.
+
+        The following usage example demonstrates
+        converting a pylibraft.common.device_ndarray to a cupy.ndarray:
+        .. code-block:: python
+
+            import cupy as cp
+            from pylibraft.common import device_ndarray
+
+            raft_array = device_ndarray.empty((100, 50))
+            cupy_array = cp.asarray(raft_array)
+
+        And converting a pylibraft.common.device_ndarray to a PyTorch tensor:
+        .. code-block:: python
+
+            import torch
+            from pylibraft.common import device_ndarray
+
+            raft_array = device_ndarray.empty((100, 50))
+            torch_tensor = torch.as_tensor(raft_array, device='cuda')
+        """
+        self.ndarray_ = np_ndarray
+        order = "C" if self.c_contiguous else "F"
+        self.device_buffer_ = rmm.DeviceBuffer.to_device(
+            self.ndarray_.tobytes(order=order)
+        )
+
+    @classmethod
+    def empty(cls, shape, dtype=np.float32, order="C"):
+        """
+        Return a new device_ndarray of given shape and type, without
+        initializing entries.
+
+        Parameters
+        ----------
+        shape : int or tuple of int
+                Shape of the empty array, e.g., (2, 3) or 2.
+        dtype : data-type, optional
+                Desired output data-type for the array, e.g, numpy.int8.
+                Default is numpy.float32.
+        order : {'C', 'F'}, optional (default: 'C')
+                Whether to store multi-dimensional data in row-major (C-style)
+                or column-major (Fortran-style) order in memory
+        """
+        arr = np.empty(shape, dtype=dtype, order=order)
+        return cls(arr)
+
+    @property
+    def c_contiguous(self):
+        """
+        Is the current device_ndarray laid out in row-major format?
+        """
+        array_interface = self.ndarray_.__array_interface__
+        strides = self.strides
+        return (
+            strides is None
+            or array_interface["strides"][1] == self.dtype.itemsize
+        )
+
+    @property
+    def f_contiguous(self):
+        """
+        Is the current device_ndarray laid out in column-major format?
+        """
+        return not self.c_contiguous
+
+    @property
+    def dtype(self):
+        """
+        Datatype of the current device_ndarray instance
+        """
+        array_interface = self.ndarray_.__array_interface__
+        return np.dtype(array_interface["typestr"])
+
+    @property
+    def shape(self):
+        """
+        Shape of the current device_ndarray instance
+        """
+        array_interface = self.ndarray_.__array_interface__
+        return array_interface["shape"]
+
+    @property
+    def strides(self):
+        """
+        Strides of the current device_ndarray instance
+        """
+        array_interface = self.ndarray_.__array_interface__
+        return (
+            None
+            if "strides" not in array_interface
+            else array_interface["strides"]
+        )
+
+    @property
+    def __cuda_array_interface__(self):
+        """
+        Returns the __cuda_array_interface__ compliant dict for
+        integrating with other device-enabled libraries using
+        zero-copy semantics.
+        """
+        device_cai = self.device_buffer_.__cuda_array_interface__
+        host_cai = self.ndarray_.__array_interface__.copy()
+        host_cai["data"] = (device_cai["data"][0], device_cai["data"][1])
+
+        return host_cai
+
+    def copy_to_host(self):
+        """
+        Returns a new numpy.ndarray object on host with the current contents of
+        this device_ndarray
+        """
+        ret = np.frombuffer(
+            self.device_buffer_.tobytes(),
+            dtype=self.dtype,
+            like=self.ndarray_,
+        ).astype(self.dtype)
+        ret = np.lib.stride_tricks.as_strided(ret, self.shape, self.strides)
+        return ret
diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
index 73cd60058f..9597e3906e 100644
--- a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
+++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
@@ -26,7 +26,7 @@ from libcpp cimport bool
 
 from .distance_type cimport DistanceType
 
-from pylibraft.common import Handle
+from pylibraft.common import Handle, device_ndarray
 from pylibraft.common.handle import auto_sync_handle
 from pylibraft.common.handle cimport handle_t
 
@@ -62,7 +62,7 @@ cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
 
 
 @auto_sync_handle
-def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None):
+def fused_l2_nn_argmin(X, Y, out=None, sqrt=True, handle=None):
     """
     Compute the 1-nearest neighbors between X and Y using the L2 distance
 
@@ -77,6 +77,35 @@ def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None):
     Examples
     --------
 
+    To compute the 1-nearest neighbors argmin:
+    .. code-block:: python
+
+        import cupy as cp
+
+        from pylibraft.common import Handle
+        from pylibraft.distance import fused_l2_nn_argmin
+
+        n_samples = 5000
+        n_clusters = 5
+        n_features = 50
+
+        in1 = cp.random.random_sample((n_samples, n_features),
+                                      dtype=cp.float32)
+        in2 = cp.random.random_sample((n_clusters, n_features),
+                                      dtype=cp.float32)
+
+        # A single RAFT handle can optionally be reused across
+        # pylibraft functions.
+        handle = Handle()
+        ...
+        output = fused_l2_nn_argmin(in1, in2, handle=handle)
+        ...
+        # pylibraft functions are often asynchronous so the
+        # handle needs to be explicitly synchronized
+        handle.sync()
+
+    The output can also be computed in-place on a preallocated
+    array:
     .. code-block:: python
 
         import cupy as cp
@@ -98,20 +127,30 @@ def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None):
         # pylibraft functions.
         handle = Handle()
         ...
-        fused_l2_nn_argmin(in1, in2, output, handle=handle)
+        fused_l2_nn_argmin(in1, in2, out=output, handle=handle)
         ...
         # pylibraft functions are often asynchronous so the
         # handle needs to be explicitly synchronized
         handle.sync()
+
    """
 
     x_cai = X.__cuda_array_interface__
     y_cai = Y.__cuda_array_interface__
-    output_cai = output.__cuda_array_interface__
+
+    x_dt = np.dtype(x_cai["typestr"])
+    y_dt = np.dtype(y_cai["typestr"])
 
     m = x_cai["shape"][0]
     n = y_cai["shape"][0]
 
+    if out is None:
+        output = device_ndarray.empty((m,), dtype="int32")
+    else:
+        output = out
+
+    output_cai = output.__cuda_array_interface__
+
     x_k = x_cai["shape"][1]
     y_k = y_cai["shape"][1]
 
@@ -127,8 +166,6 @@ def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None):
     handle = handle if handle is not None else Handle()
     cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
 
-    x_dt = np.dtype(x_cai["typestr"])
-    y_dt = np.dtype(y_cai["typestr"])
     d_dt = np.dtype(output_cai["typestr"])
 
     x_c_contiguous = is_c_cont(x_cai, x_dt)
@@ -162,3 +199,5 @@ def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None):
                             <bool>sqrt)
     else:
         raise ValueError("dtype %s not supported" % x_dt)
+
+    return output
diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
index 76cdf0b2d3..dc4bd982f9 100644
--- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
+++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
@@ -28,8 +28,11 @@ from .distance_type cimport DistanceType
 
 from pylibraft.common import Handle
 from pylibraft.common.handle import auto_sync_handle
+
 from pylibraft.common.handle cimport handle_t
 
+from pylibraft.common import device_ndarray
+
 
 def is_c_cont(cai, dt):
     return "strides" not in cai or \
@@ -92,7 +95,7 @@ SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product",
 
 
 @auto_sync_handle
-def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None):
+def distance(X, Y, out=None, metric="euclidean", p=2.0, handle=None):
     """
     Compute pairwise distances between X and Y
 
@@ -107,14 +110,20 @@ def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None):
 
     X : CUDA array interface compliant matrix shape (m, k)
     Y : CUDA array interface compliant matrix shape (n, k)
-    dists : Writable CUDA array interface matrix shape (m, n)
+    out : Optional writable CUDA array interface matrix shape (m, n)
     metric : string denoting the metric type (default="euclidean")
     p : metric parameter (currently used only for "minkowski")
     {handle_docstring}
 
+    Returns
+    -------
+
+    pairwise distances, written to `out` if given, else a new device_ndarray
+
     Examples
     --------
 
+    To compute pairwise distances on cupy arrays:
     .. code-block:: python
 
         import cupy as cp
@@ -129,29 +138,66 @@ def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None):
                                       dtype=cp.float32)
         in2 = cp.random.random_sample((n_samples, n_features),
                                       dtype=cp.float32)
-        output = cp.empty((n_samples, n_samples), dtype=cp.float32)
 
         # A single RAFT handle can optionally be reused across
         # pylibraft functions.
         handle = Handle()
         ...
-        pairwise_distance(in1, in2, output, metric="euclidean", handle=handle)
+        output = pairwise_distance(in1, in2, metric="euclidean", handle=handle)
         ...
         # pylibraft functions are often asynchronous so the
         # handle needs to be explicitly synchronized
         handle.sync()
+
+    It's also possible to write to a pre-allocated output array:
+    .. code-block:: python
+
+        import cupy as cp
+
+        from pylibraft.common import Handle
+        from pylibraft.distance import pairwise_distance
+
+        n_samples = 5000
+        n_features = 50
+
+        in1 = cp.random.random_sample((n_samples, n_features),
+                                      dtype=cp.float32)
+        in2 = cp.random.random_sample((n_samples, n_features),
+                                      dtype=cp.float32)
+        output = cp.empty((n_samples, n_samples), dtype=cp.float32)
+
+        # A single RAFT handle can optionally be reused across
+        # pylibraft functions.
+        handle = Handle()
+        ...
+        pairwise_distance(in1, in2, out=output,
+                          metric="euclidean", handle=handle)
+        ...
+        # pylibraft functions are often asynchronous so the
+        # handle needs to be explicitly synchronized
+        handle.sync()
+
    """
 
     x_cai = X.__cuda_array_interface__
     y_cai = Y.__cuda_array_interface__
-    dists_cai = dists.__cuda_array_interface__
 
     m = x_cai["shape"][0]
     n = y_cai["shape"][0]
 
+    x_dt = np.dtype(x_cai["typestr"])
+    y_dt = np.dtype(y_cai["typestr"])
+
+    if out is None:
+        dists = device_ndarray.empty((m, n), dtype=y_dt)
+    else:
+        dists = out
+
     x_k = x_cai["shape"][1]
     y_k = y_cai["shape"][1]
 
+    dists_cai = dists.__cuda_array_interface__
+
     if x_k != y_k:
         raise ValueError("Inputs must have same number of columns. "
                          "a=%s, b=%s" % (x_k, y_k))
@@ -163,8 +209,6 @@ def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None):
     handle = handle if handle is not None else Handle()
     cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
 
-    x_dt = np.dtype(x_cai["typestr"])
-    y_dt = np.dtype(y_cai["typestr"])
     d_dt = np.dtype(dists_cai["typestr"])
 
     x_c_contiguous = is_c_cont(x_cai, x_dt)
@@ -205,3 +249,5 @@ def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None):
                           <float>p)
     else:
         raise ValueError("dtype %s not supported" % x_dt)
+
+    return dists
diff --git a/python/pylibraft/pylibraft/test/test_device_ndarray.py b/python/pylibraft/pylibraft/test/test_device_ndarray.py
new file mode 100644
index 0000000000..ee96abe049
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_device_ndarray.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pytest
+
+from pylibraft.common import device_ndarray
+
+
+@pytest.mark.parametrize("order", ["F", "C"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_basic_attributes(order, dtype):
+
+    a = np.random.random((500, 2)).astype(dtype)
+
+    if order == "C":
+        a = np.ascontiguousarray(a)
+    else:
+        a = np.asfortranarray(a)
+
+    db = device_ndarray(a)
+    db_host = db.copy_to_host()
+
+    assert a.shape == db.shape
+    assert a.dtype == db.dtype
+    assert a.data.f_contiguous == db.f_contiguous
+    assert a.data.f_contiguous == db_host.data.f_contiguous
+    assert a.data.c_contiguous == db.c_contiguous
+    assert a.data.c_contiguous == db_host.data.c_contiguous
+    np.testing.assert_array_equal(a.tolist(), db_host.tolist())
+
+
+@pytest.mark.parametrize("order", ["F", "C"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_empty(order, dtype):
+
+    a = np.random.random((500, 2)).astype(dtype)
+    if order == "C":
+        a = np.ascontiguousarray(a)
+    else:
+        a = np.asfortranarray(a)
+
+    db = device_ndarray.empty(a.shape, dtype=dtype, order=order)
+    db_host = db.copy_to_host()
+
+    assert a.shape == db.shape
+    assert a.dtype == db.dtype
+    assert a.data.f_contiguous == db.f_contiguous
+    assert a.data.f_contiguous == db_host.data.f_contiguous
+    assert a.data.c_contiguous == db.c_contiguous
+    assert a.data.c_contiguous == db_host.data.c_contiguous
diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py
index 670beb156e..a08656d3aa 100644
--- a/python/pylibraft/pylibraft/test/test_distance.py
+++ b/python/pylibraft/pylibraft/test/test_distance.py
@@ -17,9 +17,8 @@
 import pytest
 from scipy.spatial.distance import cdist
 
-from pylibraft.common import Handle
+from pylibraft.common import Handle, device_ndarray
 from pylibraft.distance import pairwise_distance
-from pylibraft.testing.utils import TestDeviceBuffer
 
 
 @pytest.mark.parametrize("n_rows", [100])
@@ -39,9 +38,10 @@
         "sqeuclidean",
     ],
 )
+@pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("order", ["F", "C"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
-def test_distance(n_rows, n_cols, metric, order, dtype):
+def test_distance(n_rows, n_cols, inplace, metric, order, dtype):
     input1 = np.random.random_sample((n_rows, n_cols))
     input1 = np.asarray(input1, order=order).astype(dtype)
 
@@ -61,13 +61,17 @@ def test_distance(n_rows, n_cols, metric, order, dtype):
 
     expected[expected <= 1e-5] = 0.0
 
-    input1_device = TestDeviceBuffer(input1, order)
-    output_device = TestDeviceBuffer(output, order)
+    input1_device = device_ndarray(input1)
+    output_device = device_ndarray(output) if inplace else None
 
     handle = Handle()
-    pairwise_distance(input1_device, input1_device, output_device, metric)
+    ret_output = pairwise_distance(
+        input1_device, input1_device, output_device, metric
+    )
     handle.sync()
 
+    output_device = ret_output if not inplace else output_device
+
     actual = output_device.copy_to_host()
 
     actual[actual <= 1e-5] = 0.0
diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
index abe56f2b04..b05ad3d530 100644
--- a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
+++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
@@ -17,16 +17,16 @@
 import pytest
 from scipy.spatial.distance import cdist
 
-from pylibraft.common import Handle
+from pylibraft.common import Handle, device_ndarray
 from pylibraft.distance import fused_l2_nn_argmin
-from pylibraft.testing.utils import TestDeviceBuffer
 
 
+@pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("n_rows", [10, 100])
 @pytest.mark.parametrize("n_clusters", [5, 10])
 @pytest.mark.parametrize("n_cols", [3, 5])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
-def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
+def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype, inplace):
     input1 = np.random.random_sample((n_rows, n_cols))
     input1 = np.asarray(input1, order="C").astype(dtype)
 
@@ -38,15 +38,16 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
 
     expected = expected.argmin(axis=1)
 
-    input1_device = TestDeviceBuffer(input1, "C")
-    input2_device = TestDeviceBuffer(input2, "C")
-    output_device = TestDeviceBuffer(output, "C")
+    input1_device = device_ndarray(input1)
+    input2_device = device_ndarray(input2)
+    output_device = device_ndarray(output) if inplace else None
 
     handle = Handle()
-    fused_l2_nn_argmin(
+    ret_output = fused_l2_nn_argmin(
         input1_device, input2_device, output_device, True, handle=handle
     )
     handle.sync()
+    output_device = ret_output if not inplace else output_device
     actual = output_device.copy_to_host()
 
     assert np.allclose(expected, actual, rtol=1e-4)
diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py
index d198ac2f8f..58028e90e8 100644
--- a/python/pylibraft/pylibraft/test/test_kmeans.py
+++ b/python/pylibraft/pylibraft/test/test_kmeans.py
@@ -17,9 +17,8 @@
 import pytest
 
 from pylibraft.cluster.kmeans import compute_new_centroids
-from pylibraft.common import Handle
+from pylibraft.common import Handle, device_ndarray
 from pylibraft.distance import pairwise_distance
-from pylibraft.testing.utils import TestDeviceBuffer
 
 
 @pytest.mark.parametrize("n_rows", [100])
@@ -32,41 +31,37 @@ def test_compute_new_centroids(
     n_rows, n_cols, metric, n_clusters, dtype, additional_args
 ):
 
-    order = "C"
-
     # A single RAFT handle can optionally be reused across
     # pylibraft functions.
     handle = Handle()
 
     X = np.random.random_sample((n_rows, n_cols)).astype(dtype)
-    X_device = TestDeviceBuffer(X, order)
+    X_device = device_ndarray(X)
 
     centroids = X[:n_clusters]
-    centroids_device = TestDeviceBuffer(centroids, order)
+    centroids_device = device_ndarray(centroids)
 
     weight_per_cluster = np.zeros((n_clusters,), dtype=dtype)
     weight_per_cluster_device = (
-        TestDeviceBuffer(weight_per_cluster, order)
-        if additional_args
-        else None
+        device_ndarray(weight_per_cluster) if additional_args else None
     )
 
     new_centroids = np.zeros((n_clusters, n_cols), dtype=dtype)
-    new_centroids_device = TestDeviceBuffer(new_centroids, order)
+    new_centroids_device = device_ndarray(new_centroids)
 
     sample_weights = np.ones((n_rows,)).astype(dtype) / n_rows
     sample_weights_device = (
-        TestDeviceBuffer(sample_weights, order) if additional_args else None
+        device_ndarray(sample_weights) if additional_args else None
     )
 
     # Compute new centroids naively
     dists = np.zeros((n_rows, n_clusters), dtype=dtype)
-    dists_device = TestDeviceBuffer(dists, order)
+    dists_device = device_ndarray(dists)
     pairwise_distance(X_device, centroids_device, dists_device, metric=metric)
     handle.sync()
 
     labels = np.argmin(dists_device.copy_to_host(), axis=1).astype(np.int32)
-    labels_device = TestDeviceBuffer(labels, order)
+    labels_device = device_ndarray(labels)
 
     expected_centers = np.empty((n_clusters, n_cols), dtype=dtype)
     expected_wX = X * sample_weights.reshape((-1, 1))
diff --git a/python/pylibraft/pylibraft/test/test_random.py b/python/pylibraft/pylibraft/test/test_random.py
index 77494ea277..229baffff5 100644
--- a/python/pylibraft/pylibraft/test/test_random.py
+++ b/python/pylibraft/pylibraft/test/test_random.py
@@ -16,9 +16,8 @@
 import numpy as np
 import pytest
 
-from pylibraft.common import Handle
+from pylibraft.common import Handle, device_ndarray
 from pylibraft.random import rmat
-from pylibraft.testing.utils import TestDeviceBuffer
 
 
 def generate_theta(r_scale, c_scale):
@@ -34,7 +33,7 @@ def generate_theta(r_scale, c_scale):
         theta[4 * i + 1] = b / total
         theta[4 * i + 2] = c / total
         theta[4 * i + 3] = d / total
-    theta_device = TestDeviceBuffer(theta, "C")
+    theta_device = device_ndarray(theta)
     return theta, theta_device
 
 
@@ -45,7 +44,7 @@ def generate_theta(r_scale, c_scale):
 def test_rmat(n_edges, r_scale, c_scale, dtype):
     theta, theta_device = generate_theta(r_scale, c_scale)
     out_buff = np.empty((n_edges, 2), dtype=dtype)
-    output_device = TestDeviceBuffer(out_buff, "C")
+    output_device = device_ndarray(out_buff)
 
     handle = Handle()
     rmat(output_device, theta_device, r_scale, c_scale, 12345, handle=handle)
@@ -68,7 +67,7 @@ def test_rmat_exception():
     dtype = np.int32
     with pytest.raises(Exception) as exception:
         out_buff = np.empty((n_edges, 2), dtype=dtype)
-        output_device = TestDeviceBuffer(out_buff, "C")
+        output_device = device_ndarray(out_buff)
         rmat(output_device, None, r_scale, c_scale, 12345)
         assert exception is not None
         assert exception.message == "'theta' cannot be None!"
@@ -84,7 +83,7 @@ def test_rmat_valueerror():
     r_scale = c_scale = 16
     with pytest.raises(ValueError) as exception:
         out_buff = np.empty((n_edges, 2), dtype=np.int16)
-        output_device = TestDeviceBuffer(out_buff, "C")
+        output_device = device_ndarray(out_buff)
         theta, theta_device = generate_theta(r_scale, c_scale)
         rmat(output_device, theta_device, r_scale, c_scale, 12345)
         assert exception is not None
diff --git a/python/pylibraft/pylibraft/testing/__init__.py b/python/pylibraft/pylibraft/testing/__init__.py
deleted file mode 100644
index 273b4497cc..0000000000
--- a/python/pylibraft/pylibraft/testing/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
diff --git a/python/pylibraft/pylibraft/testing/utils.py b/python/pylibraft/pylibraft/testing/utils.py
deleted file mode 100644
index 86cf4558db..0000000000
--- a/python/pylibraft/pylibraft/testing/utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-
-import rmm
-
-
-class TestDeviceBuffer:
-    def __init__(self, ndarray, order):
-
-        self.ndarray_ = ndarray
-        self.device_buffer_ = rmm.DeviceBuffer.to_device(
-            ndarray.ravel(order=order).tobytes()
-        )
-
-    @property
-    def __cuda_array_interface__(self):
-        device_cai = self.device_buffer_.__cuda_array_interface__
-        host_cai = self.ndarray_.__array_interface__.copy()
-        host_cai["data"] = (device_cai["data"][0], device_cai["data"][1])
-
-        return host_cai
-
-    def copy_to_host(self):
-        return (
-            np.frombuffer(
-                self.device_buffer_.tobytes(),
-                dtype=self.ndarray_.dtype,
-                like=self.ndarray_,
-            )
-            .astype(self.ndarray_.dtype)
-            .reshape(self.ndarray_.shape)
-        )