From 6a7894fd62ae600064ff41620ea7c233ed09070b Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Tue, 9 Feb 2021 17:57:55 -0600
Subject: [PATCH 01/11] get_handle_from_internal_pool

---
 cpp/include/raft/handle.hpp | 23 ++++++++++++++++++++++-
 cpp/test/handle.cpp         | 11 +++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index af53968653..f38aec394c 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -67,9 +67,22 @@ class handle_t {
       host_allocator_(std::make_shared<mr::host::default_allocator>()) {
     create_resources();
   }
+  handle_t(const handle_t& h) : dev_id_(h.get_device()), num_streams_(0) {}
+  handle_t(const handle_t&& h) : dev_id_(h.get_device()), num_streams_(0) {}
+
+  handle_t& operator=(const handle_t& h) {
+    prop_ = h.get_device_properties();
+    device_prop_initialized_ = true;
+    device_allocator_ = get_device_allocator();
+    host_allocator_ = get_host_allocator();
+    return *this;
+  }
 
   /** Destroys all held-up resources */
-  virtual ~handle_t() { destroy_resources(); }
+  virtual ~handle_t() {
+    std::cout << "dtor" << std::endl;
+    destroy_resources();
+  }
 
   int get_device() const { return dev_id_; }
 
@@ -136,6 +149,14 @@ class handle_t {
     return int_streams_vec;
   }
 
+  handle_t get_handle_from_internal_pool(
+    int stream_id, int n_streams = kNumDefaultWorkerStreams) const {
+    handle_t handle(n_streams);
+    handle = *this;
+    handle.set_stream(this->get_internal_stream(stream_id));
+    return handle;
+  }
+
   void wait_on_user_stream() const {
     CUDA_CHECK(cudaEventRecord(event_, user_stream_));
     for (auto s : streams_) {
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index 5f6f3ceece..4c8b327e76 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <gtest/gtest.h>
+#include <cstddef>
 #include <iostream>
 #include <memory>
 #include <raft/handle.hpp>
@@ -49,4 +50,14 @@ TEST(Raft, GetInternalStreams) {
   ASSERT_EQ(4U, streams.size());
 }
 
+TEST(Raft, GetHandleFromPool) {
+  handle_t parent(4);
+  int sid = 2;
+  auto child = parent.get_handle_from_internal_pool(sid);
+  std::cout << "done" << std::endl;
+
+  ASSERT_EQ(parent.get_internal_stream(sid), child.get_stream());
+  ASSERT_EQ(0, child.get_num_internal_streams());
+  ASSERT_EQ(parent.get_device(), child.get_device());
+}
 }  // namespace raft

From d88bb146b71251a845b960db56958fca6c5855b7 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 10 Feb 2021 16:16:58 -0600
Subject: [PATCH 02/11] added rmm stream pool as backend

---
 cpp/include/raft/handle.hpp | 43 ++++++++++++++-----------------------
 cpp/test/handle.cpp         |  2 --
 2 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index f38aec394c..8b2aa58611 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -38,6 +38,7 @@
 #include <raft/comms/comms.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <raft/mr/host/allocator.hpp>
+#include <rmm/cuda_stream_pool.hpp>
 #include "cudart_utils.h"
 
 namespace raft {
@@ -62,13 +63,13 @@ class handle_t {
         CUDA_CHECK(cudaGetDevice(&cur_dev));
         return cur_dev;
       }()),
-      num_streams_(n_streams),
+      streams_(n_streams),
       device_allocator_(std::make_shared<mr::device::default_allocator>()),
       host_allocator_(std::make_shared<mr::host::default_allocator>()) {
     create_resources();
   }
-  handle_t(const handle_t& h) : dev_id_(h.get_device()), num_streams_(0) {}
-  handle_t(const handle_t&& h) : dev_id_(h.get_device()), num_streams_(0) {}
+  handle_t(const handle_t& h) : dev_id_(h.get_device()) {}
+  handle_t(const handle_t&& h) : dev_id_(h.get_device()) {}
 
   handle_t& operator=(const handle_t& h) {
     prop_ = h.get_device_properties();
@@ -79,10 +80,7 @@ class handle_t {
   }
 
   /** Destroys all held-up resources */
-  virtual ~handle_t() {
-    std::cout << "dtor" << std::endl;
-    destroy_resources();
-  }
+  virtual ~handle_t() { destroy_resources(); }
 
   int get_device() const { return dev_id_; }
 
@@ -139,12 +137,14 @@ class handle_t {
     return cusparse_handle_;
   }
 
-  cudaStream_t get_internal_stream(int sid) const { return streams_[sid]; }
-  int get_num_internal_streams() const { return num_streams_; }
+  cudaStream_t get_internal_stream(int sid) const {
+    return streams_.get_stream(sid).value();
+  }
+  int get_num_internal_streams() const { return streams_.get_pool_size(); }
   std::vector<cudaStream_t> get_internal_streams() const {
     std::vector<cudaStream_t> int_streams_vec;
-    for (auto s : streams_) {
-      int_streams_vec.push_back(s);
+    for (int i = 0; i < get_num_internal_streams(); i++) {
+      int_streams_vec.push_back(get_internal_stream(i));
     }
     return int_streams_vec;
   }
@@ -159,14 +159,14 @@ class handle_t {
 
   void wait_on_user_stream() const {
     CUDA_CHECK(cudaEventRecord(event_, user_stream_));
-    for (auto s : streams_) {
-      CUDA_CHECK(cudaStreamWaitEvent(s, event_, 0));
+    for (int i = 0; i < get_num_internal_streams(); i++) {
+      CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0));
     }
   }
 
   void wait_on_internal_streams() const {
-    for (auto s : streams_) {
-      CUDA_CHECK(cudaEventRecord(event_, s));
+    for (int i = 0; i < get_num_internal_streams(); i++) {
+      CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i)));
       CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0));
     }
   }
@@ -213,8 +213,7 @@ class handle_t {
   std::unordered_map<std::string, std::shared_ptr<comms::comms_t>> subcomms_;
 
   const int dev_id_;
-  const int num_streams_;
-  std::vector<cudaStream_t> streams_;
+  rmm::cuda_stream_pool streams_{0};
   mutable cublasHandle_t cublas_handle_;
   mutable bool cublas_initialized_{false};
   mutable cusolverDnHandle_t cusolver_dn_handle_;
@@ -232,11 +231,6 @@ class handle_t {
   mutable std::mutex mutex_;
 
   void create_resources() {
-    for (int i = 0; i < num_streams_; ++i) {
-      cudaStream_t stream;
-      CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-      streams_.push_back(stream);
-    }
     CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
   }
 
@@ -258,11 +252,6 @@ class handle_t {
       //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
       CUBLAS_CHECK(cublasDestroy(cublas_handle_));
     }
-    while (!streams_.empty()) {
-      //CUDA_CHECK_NO_THROW(cudaStreamDestroy(streams_.back()));
-      CUDA_CHECK(cudaStreamDestroy(streams_.back()));
-      streams_.pop_back();
-    }
     //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
     CUDA_CHECK(cudaEventDestroy(event_));
   }
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index 4c8b327e76..ee6d6d2a48 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -54,8 +54,6 @@ TEST(Raft, GetHandleFromPool) {
   handle_t parent(4);
   int sid = 2;
   auto child = parent.get_handle_from_internal_pool(sid);
-  std::cout << "done" << std::endl;
-
   ASSERT_EQ(parent.get_internal_stream(sid), child.get_stream());
   ASSERT_EQ(0, child.get_num_internal_streams());
   ASSERT_EQ(parent.get_device(), child.get_device());

From cf92c412371166cacd1ef262a0556d3df717581f Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 10 Feb 2021 16:27:31 -0600
Subject: [PATCH 03/11] added rmm stream pool as backend

---
 cpp/include/raft/handle.hpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index 8b2aa58611..a42fdd67b2 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -86,6 +86,9 @@ class handle_t {
 
   void set_stream(cudaStream_t stream) { user_stream_ = stream; }
   cudaStream_t get_stream() const { return user_stream_; }
+  rmm::cuda_stream_view get_stream_view() const {
+    return rmm::cuda_stream_view(user_stream_);
+  }
 
   void set_device_allocator(std::shared_ptr<mr::device::allocator> allocator) {
     device_allocator_ = allocator;
@@ -137,9 +140,15 @@ class handle_t {
     return cusparse_handle_;
   }
 
+  // legacy compatibility for cuML
   cudaStream_t get_internal_stream(int sid) const {
     return streams_.get_stream(sid).value();
   }
+  // new accessor return rmm::cuda_stream_view
+  rmm::cuda_stream_view get_internal_stream_view(int sid) const {
+    return streams_.get_stream(sid);
+  }
+
   int get_num_internal_streams() const { return streams_.get_pool_size(); }
   std::vector<cudaStream_t> get_internal_streams() const {
     std::vector<cudaStream_t> int_streams_vec;

From 4cebf2453eec8c48fadf5ff5cb12a4e21e914509 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 10 Feb 2021 16:54:23 -0600
Subject: [PATCH 04/11] exposed rmm::cuda_stream_view for streams access

---
 cpp/test/handle.cpp | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index ee6d6d2a48..8fef4ead61 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -52,10 +52,25 @@ TEST(Raft, GetInternalStreams) {
 
 TEST(Raft, GetHandleFromPool) {
   handle_t parent(4);
-  int sid = 2;
-  auto child = parent.get_handle_from_internal_pool(sid);
-  ASSERT_EQ(parent.get_internal_stream(sid), child.get_stream());
+
+  auto child = parent.get_handle_from_internal_pool(2);
+  ASSERT_EQ(parent.get_internal_stream(2), child.get_stream());
   ASSERT_EQ(0, child.get_num_internal_streams());
+
+  child.set_stream(parent.get_internal_stream(3));
+  ASSERT_EQ(parent.get_internal_stream(3), child.get_stream());
+  ASSERT_NE(parent.get_internal_stream(2), child.get_stream());
+
   ASSERT_EQ(parent.get_device(), child.get_device());
 }
+
+TEST(Raft, GetHandleStreamViews) {
+  handle_t parent(4);
+
+  auto child = parent.get_handle_from_internal_pool(2);
+  ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view());
+  ASSERT_EQ(parent.get_internal_stream_view(2).value(),
+            child.get_stream_view().value());
+  EXPECT_FALSE(child.get_stream_view().is_default());
+}
 }  // namespace raft

From 2c38896e63e5c726108385012b71c0b1b6beec80 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@users.noreply.github.com>
Date: Thu, 11 Feb 2021 15:14:10 -0500
Subject: [PATCH 05/11] Moving cuml sparse prims to raft (#139)

Moving cuml sparse prims and gtests to raft. The namespaces have already been adjusted in cuml, so this is a simple move for the most part. There are a few places where includes needed to be updated, and I needed to remove cuml debug logs.

Authors:
  - Corey J. Nolet (@cjnolet)

Approvers:
  - Divye Gala (@divyegala)
  - Dante Gama Dessavre (@dantegd)

URL: https://github.com/rapidsai/raft/pull/139
---
 cpp/CMakeLists.txt                            |  19 +-
 cpp/include/raft/sparse/convert/coo.cuh       |  74 ++
 cpp/include/raft/sparse/convert/csr.cuh       | 189 +++++
 cpp/include/raft/sparse/convert/dense.cuh     | 110 +++
 cpp/include/raft/sparse/coo.cuh               | 259 ++++++
 cpp/include/raft/sparse/csr.cuh               | 263 ++++++
 .../raft/sparse/distance/bin_distance.cuh     | 195 +++++
 cpp/include/raft/sparse/distance/common.h     |  59 ++
 cpp/include/raft/sparse/distance/coo_spmv.cuh | 350 ++++++++
 cpp/include/raft/sparse/distance/csr_spmv.cuh | 484 +++++++++++
 cpp/include/raft/sparse/distance/distance.cuh | 104 +++
 .../raft/sparse/distance/ip_distance.cuh      | 329 ++++++++
 .../raft/sparse/distance/l2_distance.cuh      | 262 ++++++
 .../raft/sparse/distance/lp_distance.cuh      | 196 +++++
 .../raft/sparse/distance/operators.cuh        |  88 ++
 cpp/include/raft/sparse/linalg/add.cuh        | 226 ++++++
 cpp/include/raft/sparse/linalg/degree.cuh     | 184 +++++
 cpp/include/raft/sparse/linalg/norm.cuh       | 169 ++++
 cpp/include/raft/sparse/linalg/spectral.cuh   | 103 +++
 cpp/include/raft/sparse/linalg/symmetrize.cuh | 309 +++++++
 cpp/include/raft/sparse/linalg/transpose.h    |  87 ++
 cpp/include/raft/sparse/op/filter.cuh         | 201 +++++
 cpp/include/raft/sparse/op/row_op.cuh         |  76 ++
 cpp/include/raft/sparse/op/slice.h            |  99 +++
 cpp/include/raft/sparse/op/sort.h             | 105 +++
 cpp/include/raft/sparse/selection/knn.cuh     | 483 +++++++++++
 .../raft/sparse/selection/selection.cuh       | 157 ++++
 cpp/include/raft/sparse/utils.h               | 114 +++
 cpp/include/raft/spatial/knn/knn.hpp          |   2 +-
 cpp/test/sparse/add.cu                        | 174 ++++
 cpp/test/sparse/convert_coo.cu                |  98 +++
 cpp/test/sparse/convert_csr.cu                | 180 +++++
 cpp/test/sparse/csr_row_slice.cu              | 184 +++++
 cpp/test/sparse/csr_to_dense.cu               | 140 ++++
 cpp/test/sparse/csr_transpose.cu              | 174 ++++
 cpp/test/sparse/degree.cu                     | 110 +++
 cpp/test/sparse/dist_coo_spmv.cu              | 628 ++++++++++++++
 cpp/test/sparse/dist_csr_spmv.cu              | 608 ++++++++++++++
 cpp/test/sparse/distance.cu                   | 764 ++++++++++++++++++
 cpp/test/sparse/filter.cu                     | 122 +++
 cpp/test/sparse/knn.cu                        | 192 +++++
 cpp/test/sparse/norm.cu                       | 127 +++
 cpp/test/sparse/row_op.cu                     | 111 +++
 cpp/test/sparse/selection.cu                  | 157 ++++
 cpp/test/sparse/sort.cu                       | 103 +++
 cpp/test/sparse/symmetrize.cu                 | 111 +++
 46 files changed, 9277 insertions(+), 2 deletions(-)
 create mode 100644 cpp/include/raft/sparse/convert/coo.cuh
 create mode 100644 cpp/include/raft/sparse/convert/csr.cuh
 create mode 100644 cpp/include/raft/sparse/convert/dense.cuh
 create mode 100644 cpp/include/raft/sparse/coo.cuh
 create mode 100644 cpp/include/raft/sparse/csr.cuh
 create mode 100644 cpp/include/raft/sparse/distance/bin_distance.cuh
 create mode 100644 cpp/include/raft/sparse/distance/common.h
 create mode 100644 cpp/include/raft/sparse/distance/coo_spmv.cuh
 create mode 100644 cpp/include/raft/sparse/distance/csr_spmv.cuh
 create mode 100644 cpp/include/raft/sparse/distance/distance.cuh
 create mode 100644 cpp/include/raft/sparse/distance/ip_distance.cuh
 create mode 100644 cpp/include/raft/sparse/distance/l2_distance.cuh
 create mode 100644 cpp/include/raft/sparse/distance/lp_distance.cuh
 create mode 100644 cpp/include/raft/sparse/distance/operators.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/add.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/degree.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/norm.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/spectral.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/symmetrize.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/transpose.h
 create mode 100644 cpp/include/raft/sparse/op/filter.cuh
 create mode 100644 cpp/include/raft/sparse/op/row_op.cuh
 create mode 100644 cpp/include/raft/sparse/op/slice.h
 create mode 100644 cpp/include/raft/sparse/op/sort.h
 create mode 100644 cpp/include/raft/sparse/selection/knn.cuh
 create mode 100644 cpp/include/raft/sparse/selection/selection.cuh
 create mode 100644 cpp/include/raft/sparse/utils.h
 create mode 100644 cpp/test/sparse/add.cu
 create mode 100644 cpp/test/sparse/convert_coo.cu
 create mode 100644 cpp/test/sparse/convert_csr.cu
 create mode 100644 cpp/test/sparse/csr_row_slice.cu
 create mode 100644 cpp/test/sparse/csr_to_dense.cu
 create mode 100644 cpp/test/sparse/csr_transpose.cu
 create mode 100644 cpp/test/sparse/degree.cu
 create mode 100644 cpp/test/sparse/dist_coo_spmv.cu
 create mode 100644 cpp/test/sparse/dist_csr_spmv.cu
 create mode 100644 cpp/test/sparse/distance.cu
 create mode 100644 cpp/test/sparse/filter.cu
 create mode 100644 cpp/test/sparse/knn.cu
 create mode 100644 cpp/test/sparse/norm.cu
 create mode 100644 cpp/test/sparse/row_op.cu
 create mode 100644 cpp/test/sparse/selection.cu
 create mode 100644 cpp/test/sparse/sort.cu
 create mode 100644 cpp/test/sparse/symmetrize.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3baee48a5f..ae91d75b31 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -277,6 +277,23 @@ if(BUILD_RAFT_TESTS)
     test/random/rng.cu
     test/random/rng_int.cu
     test/random/sample_without_replacement.cu
+    test/sparse/add.cu
+    test/sparse/convert_coo.cu
+    test/sparse/convert_csr.cu
+    test/sparse/csr_row_slice.cu
+    test/sparse/csr_to_dense.cu
+    test/sparse/csr_transpose.cu
+    test/sparse/degree.cu
+    test/sparse/dist_coo_spmv.cu
+    test/sparse/dist_csr_spmv.cu
+    test/sparse/distance.cu
+    test/sparse/filter.cu
+    test/sparse/knn.cu
+    test/sparse/norm.cu
+    test/sparse/row_op.cu
+    test/sparse/selection.cu
+    test/sparse/sort.cu
+    test/sparse/symmetrize.cu
     test/spatial/knn.cu
     test/stats/mean.cu
     test/stats/mean_center.cu
diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
new file mode 100644
index 0000000000..e367550060
--- /dev/null
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/coo.cuh>
+
+namespace raft {
+namespace sparse {
+namespace convert {
+
+template <typename value_idx = int, int TPB_X = 32>
+__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m,
+                                  value_idx *coo_rows, value_idx nnz) {
+  // row-based matrix 1 thread per row
+  value_idx row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row < m) {
+    value_idx start_idx = row_ind[row];
+    value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind);
+    for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row;
+  }
+}
+
+/**
+ * @brief Convert a CSR row_ind array to a COO rows array
+ * @param row_ind: Input CSR row_ind array
+ * @param m: size of row_ind array
+ * @param coo_rows: Output COO row array
+ * @param nnz: size of output COO row array
+ * @param stream: cuda stream to use
+ */
+template <typename value_idx = int, int TPB_X = 32>
+void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows,
+                value_idx nnz, cudaStream_t stream) {
+  // @TODO: Use cusparse for this.
+  dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1);
+  dim3 blk(TPB_X, 1, 1);
+
+  csr_to_coo_kernel<value_idx, TPB_X>
+    <<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
+
+  CUDA_CHECK(cudaGetLastError());
+}
+
+};  // end NAMESPACE convert
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
new file mode 100644
index 0000000000..a034bdbda8
--- /dev/null
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/handle.hpp>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/coo.cuh>
+#include <raft/sparse/linalg/degree.cuh>
+#include <raft/sparse/op/row_op.cuh>
+
+namespace raft {
+namespace sparse {
+namespace convert {
+
+template <typename value_t>
+void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
+                const int *srcCols, const value_t *srcVals, int nnz, int m,
+                int *dst_offsets, int *dstCols, value_t *dstVals) {
+  auto stream = handle.get_stream();
+  auto cusparseHandle = handle.get_cusparse_handle();
+  auto d_alloc = handle.get_device_allocator();
+  raft::mr::device::buffer<int> dstRows(d_alloc, stream, nnz);
+  CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz,
+                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz,
+                             cudaMemcpyDeviceToDevice, stream));
+  auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt(
+    cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
+  raft::mr::device::buffer<char> pBuffer(d_alloc, stream, buffSize);
+  raft::mr::device::buffer<int> P(d_alloc, stream, nnz);
+  CUSPARSE_CHECK(
+    cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
+  raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(),
+                                     dstCols, P.data(), pBuffer.data(), stream);
+  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(),
+                             stream);
+  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m,
+                                dst_offsets, stream);
+  CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+/**
+ * @brief Constructs an adjacency graph CSR row_ind_ptr array from
+ * a row_ind array and adjacency array.
+ * @tparam Index_ the numeric type of the index arrays
+ * @tparam TPB_X the number of threads to use per block for kernels
+ * @tparam Lambda function for fused operation in the adj_graph construction
+ * @param row_ind the input CSR row_ind array
+ * @param total_rows number of vertices in graph
+ * @param nnz number of non-zeros
+ * @param batchSize number of vertices in current batch
+ * @param adj an adjacency array (size batchSize x total_rows)
+ * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
+ * @param stream cuda stream to use
+ * @param fused_op: the fused operation
+ */
+template <typename Index_, int TPB_X = 32,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
+                           Index_ batchSize, const bool *adj,
+                           Index_ *row_ind_ptr, cudaStream_t stream,
+                           Lambda fused_op) {
+  op::csr_row_op<Index_, TPB_X>(
+    row_ind, batchSize, nnz,
+    [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__(
+      Index_ row, Index_ start_idx, Index_ stop_idx) {
+      fused_op(row, start_idx, stop_idx);
+      Index_ k = 0;
+      for (Index_ i = 0; i < total_rows; i++) {
+        // @todo: uncoalesced mem accesses!
+        if (adj[batchSize * i + row]) {
+          row_ind_ptr[start_idx + k] = i;
+          k += 1;
+        }
+      }
+    },
+    stream);
+}
+
+template <typename Index_, int TPB_X = 32,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
+                           Index_ batchSize, const bool *adj,
+                           Index_ *row_ind_ptr, cudaStream_t stream) {
+  csr_adj_graph_batched(
+    row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream,
+    [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
+}
+
+/**
+ * @brief Constructs an adjacency graph CSR row_ind_ptr array from
+ * a row_ind array and adjacency array.
+ * @tparam Index_ the numeric type of the index arrays
+ * @tparam TPB_X the number of threads to use per block for kernels
+ * @param row_ind the input CSR row_ind array
+ * @param total_rows number of total vertices in graph
+ * @param nnz number of non-zeros
+ * @param adj an adjacency array
+ * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph
+ * @param stream cuda stream to use
+ * @param fused_op the fused operation
+ */
+template <typename Index_, int TPB_X = 32,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
+                   const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream,
+                   Lambda fused_op) {
+  csr_adj_graph_batched<Index_, TPB_X>(row_ind, total_rows, nnz, total_rows,
+                                       adj, row_ind_ptr, stream, fused_op);
+}
+
+/**
+ * @brief Generate the row indices array for a sorted COO matrix
+ *
+ * @param rows: COO rows array
+ * @param nnz: size of COO rows array
+ * @param row_ind: output row indices array
+ * @param m: number of rows in dense matrix
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <typename T>
+void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
+                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                       cudaStream_t stream) {
+  raft::mr::device::buffer<T> row_counts(d_alloc, stream, m);
+
+  CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
+
+  linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream);
+
+  // create csr compressed row index from row counts
+  thrust::device_ptr<T> row_counts_d =
+    thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<T> c_ind_d = thrust::device_pointer_cast(row_ind);
+  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m,
+                 c_ind_d);
+}
+
+/**
+ * @brief Generate the row indices array for a sorted COO matrix
+ *
+ * @param coo: Input COO matrix
+ * @param row_ind: output row indices array
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <typename T>
+void sorted_coo_to_csr(COO<T> *coo, int *row_ind,
+                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                       cudaStream_t stream) {
+  sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc,
+                    stream);
+}
+
+};  // end NAMESPACE convert
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh
new file mode 100644
index 0000000000..299f9d36d4
--- /dev/null
+++ b/cpp/include/raft/sparse/convert/dense.cuh
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+namespace convert {
+
+template <typename value_t>
+__global__ void csr_to_dense_warp_per_row_kernel(int n_cols,
+                                                 const value_t *csrVal,
+                                                 const int *csrRowPtr,
+                                                 const int *csrColInd,
+                                                 value_t *a) {
+  int row = blockIdx.x;
+  int tid = threadIdx.x;
+
+  int colStart = csrRowPtr[row];
+  int colEnd = csrRowPtr[row + 1];
+  int rowNnz = colEnd - colStart;
+
+  for (int i = tid; i < rowNnz; i += blockDim.x) {
+    int colIdx = colStart + i;
+    if (colIdx < colEnd) {
+      int col = csrColInd[colIdx];
+      a[row * n_cols + col] = csrVal[colIdx];
+    }
+  }
+}
+
+/**
+ * Convert CSR arrays to a dense matrix in either row-
+ * or column-major format. A custom kernel is used when
+ * row-major output is desired since cusparse does not
+ * output row-major.
+ * @tparam value_idx : data type of the CSR index arrays
+ * @tparam value_t : data type of the CSR value array
+ * @param[in] handle : cusparse handle for conversion
+ * @param[in] nrows : number of rows in CSR
+ * @param[in] ncols : number of columns in CSR
+ * @param[in] csr_indptr : CSR row index pointer array
+ * @param[in] csr_indices : CSR column indices array
+ * @param[in] csr_data : CSR data array
+ * @param[in] lda : Leading dimension (used for col-major only)
+ * @param[out] out : Dense output array of size nrows * ncols
+ * @param[in] stream : Cuda stream for ordering events
+ * @param[in] row_major : Is row-major output desired?
+ */
+template <typename value_idx, typename value_t>
+void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols,
+                  const value_idx *csr_indptr, const value_idx *csr_indices,
+                  const value_t *csr_data, value_idx lda, value_t *out,
+                  cudaStream_t stream, bool row_major = true) {
+  if (!row_major) {
+    /**
+     * If we need col-major, use cusparse.
+     */
+    cusparseMatDescr_t out_mat;
+    CUSPARSE_CHECK(cusparseCreateMatDescr(&out_mat));
+    CUSPARSE_CHECK(cusparseSetMatIndexBase(out_mat, CUSPARSE_INDEX_BASE_ZERO));
+    CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL));
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense(
+      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out,
+      lda, stream));
+
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat));
+
+  } else {
+    int blockdim = block_dim(ncols);
+    CUDA_CHECK(
+      cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
+    csr_to_dense_warp_per_row_kernel<<<nrows, blockdim, 0, stream>>>(
+      ncols, csr_data, csr_indptr, csr_indices, out);
+  }
+}
+
+};  // end NAMESPACE convert
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh
new file mode 100644
index 0000000000..73120fea8c
--- /dev/null
+++ b/cpp/include/raft/sparse/coo.cuh
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <cusparse_v2.h>
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <raft/device_atomics.cuh>
+
+#include <iostream>
+#define restrict __restrict__
+
+#pragma once
+
+namespace raft {
+namespace sparse {
+
+/** @brief A container object for sparse matrices stored in coordinate (COO)
+ * format. There are two motivations behind using a container for COO arrays.
+ *
+ * The first motivation is that it simplifies code, rather than always having
+ * to pass three arrays as function arguments.
+ *
+ * The second is more subtle, but much more important. The size
+ * of the resulting COO from a sparse operation is often not known ahead of time,
+ * since it depends on the contents of the underlying graph. The COO object can
+ * allocate the underlying arrays lazily so that the object can be created by the
+ * user and passed as an output argument in a sparse primitive. The sparse primitive
+ * would have the responsibility for allocating and populating the output arrays,
+ * while the original caller still maintains ownership of the underlying memory.
+ *
+ * @tparam T: the type of the value array.
+ * @tparam Index_Type: the type of index array
+ *
+ */
+template <typename T, typename Index_Type = int>
+class COO {
+ protected:
+  raft::mr::device::buffer<Index_Type> rows_arr;
+  raft::mr::device::buffer<Index_Type> cols_arr;
+  raft::mr::device::buffer<T> vals_arr;
+
+ public:
+  Index_Type nnz;
+  Index_Type n_rows;
+  Index_Type n_cols;
+
+  /**
+    * @param d_alloc: the device allocator to use for the underlying buffers
+    * @param stream: CUDA stream to use
+    */
+  COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream)
+    : rows_arr(d_alloc, stream, 0),
+      cols_arr(d_alloc, stream, 0),
+      vals_arr(d_alloc, stream, 0),
+      nnz(0),
+      n_rows(0),
+      n_cols(0) {}
+
+  /**
+    * @param rows: coo rows array
+    * @param cols: coo cols array
+    * @param vals: coo vals array
+    * @param nnz: size of the rows/cols/vals arrays
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of cols in the dense matrix
+    */
+  COO(raft::mr::device::buffer<Index_Type> &rows,
+      raft::mr::device::buffer<Index_Type> &cols,
+      raft::mr::device::buffer<T> &vals, Index_Type nnz, Index_Type n_rows = 0,
+      Index_Type n_cols = 0)
+    : rows_arr(rows),
+      cols_arr(cols),
+      vals_arr(vals),
+      nnz(nnz),
+      n_rows(n_rows),
+      n_cols(n_cols) {}
+
+  /**
+    * @param d_alloc: the device allocator to use
+    * @param stream: CUDA stream to use
+    * @param nnz: size of the rows/cols/vals arrays
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of cols in the dense matrix
+    * @param init: initialize arrays with zeros
+    */
+  COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream,
+      Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0,
+      bool init = true)
+    : rows_arr(d_alloc, stream, nnz),
+      cols_arr(d_alloc, stream, nnz),
+      vals_arr(d_alloc, stream, nnz),
+      nnz(nnz),
+      n_rows(n_rows),
+      n_cols(n_cols) {
+    if (init) init_arrays(stream);
+  }
+  /** @brief Zero the rows/cols/vals arrays (nnz elements each) on stream */
+  void init_arrays(cudaStream_t stream) {
+    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0,
+                               this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0,
+                               this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(
+      cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
+  }
+
+  ~COO() {}
+
+  /**
+    * @brief Returns false if nnz, n_rows or n_cols is negative,
+    * and true otherwise (zero is accepted for all three).
+    */
+  bool validate_size() const {
+    if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false;
+    return true;
+  }
+
+  /**
+    * @brief If the underlying arrays have not been set,
+    * return false. Otherwise true.
+    */
+  bool validate_mem() const {
+    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 ||
+        this->vals_arr.size() == 0) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * @brief Returns the rows array
+   */
+  Index_Type *rows() { return this->rows_arr.data(); }
+
+  /**
+   * @brief Returns the cols array
+   */
+  Index_Type *cols() { return this->cols_arr.data(); }
+
+  /**
+   * @brief Returns the vals array
+   */
+  T *vals() { return this->vals_arr.data(); }
+
+  /**
+    * @brief Send human-readable state information to output stream
+    */
+  friend std::ostream &operator<<(std::ostream &out,
+                                  const COO<T, Index_Type> &c) {
+    if (c.validate_size() && c.validate_mem()) {
+      cudaStream_t stream;
+      CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream)
+          << std::endl;
+      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream)
+          << std::endl;
+      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream)
+          << std::endl;
+      out << "nnz=" << c.nnz << std::endl;
+      out << "n_rows=" << c.n_rows << std::endl;
+      out << "n_cols=" << c.n_cols << std::endl;
+
+      CUDA_CHECK(cudaStreamDestroy(stream));
+    } else {
+      out << "Cannot print COO object: Uninitialized or invalid." << std::endl;
+    }
+
+    return out;
+  }
+
+  /**
+    * @brief Set the number of rows and cols
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of columns in the dense matrix
+    */
+  void setSize(Index_Type n_rows, Index_Type n_cols) {
+    this->n_rows = n_rows;
+    this->n_cols = n_cols;
+  }
+
+  /**
+    * @brief Set the number of rows and cols for a square dense matrix
+    * @param n: number of rows and cols
+    */
+  void setSize(Index_Type n) {
+    this->n_rows = n;
+    this->n_cols = n;
+  }
+
+  /**
+    * @brief Allocate the underlying arrays
+    * @param nnz: size of underlying row/col/val arrays
+    * @param init: should values be initialized to 0?
+    * @param stream: CUDA stream to use
+    */
+  void allocate(Index_Type nnz, bool init, cudaStream_t stream) {
+    this->allocate(nnz, 0, init, stream);
+  }
+
+  /**
+    * @brief Allocate the underlying arrays
+    * @param nnz: size of the underlying row/col/val arrays
+    * @param size: the number of rows/cols in a square dense matrix
+    * @param init: should values be initialized to 0?
+    * @param stream: CUDA stream to use
+    */
+  void allocate(Index_Type nnz, Index_Type size, bool init, cudaStream_t stream) {
+    this->allocate(nnz, size, size, init, stream);
+  }
+
+  /**
+    * @brief Allocate the underlying arrays
+    * @param nnz: size of the underlying row/col/val arrays
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of columns in the dense matrix
+    * @param init: should values be initialized to 0?
+    * @param stream: stream to use for init
+    */
+  void allocate(Index_Type nnz, Index_Type n_rows, Index_Type n_cols,
+                bool init, cudaStream_t stream) {
+    this->n_rows = n_rows;
+    this->n_cols = n_cols;
+    this->nnz = nnz;
+
+    this->rows_arr.resize(this->nnz, stream);
+    this->cols_arr.resize(this->nnz, stream);
+    this->vals_arr.resize(this->nnz, stream);
+
+    if (init) init_arrays(stream);
+  }
+};
+
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh
new file mode 100644
index 0000000000..bc4a68d296
--- /dev/null
+++ b/cpp/include/raft/sparse/csr.cuh
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+
+//@TODO: Pull this out into a separate file
+
+struct WeakCCState {
+  // Non-owning holder of the flag that weak_cc_batched resets and polls.
+  bool *m;
+  WeakCCState(bool *m) : m(m) {}
+};
+
+template <typename Index_, int TPB_X = 256, typename Lambda>
+__global__ void weak_cc_label_device(Index_ *__restrict__ labels,
+                                     const Index_ *__restrict__ row_ind,
+                                     const Index_ *__restrict__ row_ind_ptr,
+                                     Index_ nnz, bool *__restrict__ m,
+                                     Index_ start_vertex_id, Index_ batch_size,
+                                     Index_ N, Lambda filter_op) {
+  // One thread per batch row; global_id is that row's vertex in `labels`.
+  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
+  Index_ global_id = tid + start_vertex_id;
+  if (tid >= batch_size || global_id >= N) return;
+
+  Index_ row_begin = __ldg(row_ind + tid);
+  Index_ own_label = labels[global_id];
+  bool own_changed = false;
+  bool own_allowed = filter_op(global_id);
+
+  /// TODO: add one element to row_ind and avoid get_stop_idx
+  Index_ row_end = get_stop_idx(tid, batch_size, nnz, row_ind);
+  for (Index_ j = row_begin; j < row_end; j++) {
+    Index_ nbr = __ldg(row_ind_ptr + j);
+    Index_ nbr_label = labels[nbr];
+    bool nbr_allowed = filter_op(nbr);
+    if (own_label < nbr_label && own_allowed) {
+      if (sizeof(Index_) == 4)
+        atomicMin((int *)(labels + nbr), own_label);
+      else if (sizeof(Index_) == 8)
+        atomicMin((long long int *)(labels + nbr), own_label);
+      if (nbr_allowed) *m = true;
+    } else if (own_label > nbr_label && nbr_allowed) {
+      // Adopt the smaller label locally; it is written back after the loop.
+      own_label = nbr_label;
+      own_changed = true;
+    }
+  }
+  if (!own_changed) return;
+
+  if (sizeof(Index_) == 4)
+    atomicMin((int *)(labels + global_id), own_label);
+  else if (sizeof(Index_) == 8)
+    atomicMin((long long int *)(labels + global_id), own_label);
+  if (own_allowed) *m = true;
+}
+
+template <typename Index_, int TPB_X = 256, typename Lambda>
+__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N,
+                                        Index_ MAX_LABEL, Lambda filter_op) {
+  // Each thread initialises one label: vertices accepted by filter_op start
+  // with the 1-based id (tid + 1), all other vertices start at MAX_LABEL.
+  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
+  if (tid >= N) return;
+
+  Index_ initial = filter_op(tid) ? Index_(tid + 1) : MAX_LABEL;
+  labels[tid] = initial;
+}
+
+/**
+ * @brief Partial calculation of the weakly connected components in the
+ * context of a batched algorithm: the labels are computed wrt the sub-graph
+ * represented by the given CSR matrix of dimensions batch_size * N.
+ * Note that this overwrites the labels array and it is the responsibility of
+ * the caller to combine the results from different batches
+ * (cf label/merge_labels.cuh)
+ *
+ * @tparam Index_ the numeric type of non-floating point elements
+ * @tparam TPB_X the threads to use per block when configuring the kernel
+ * @param labels an array for the output labels
+ * @param row_ind the compressed row index of the CSR array
+ * @param row_ind_ptr the row index pointer of the CSR array
+ * @param nnz the size of row_ind_ptr array
+ * @param N number of vertices
+ * @param start_vertex_id the starting vertex index for the current batch
+ * @param batch_size number of vertices for current batch
+ * @param state instance of inter-batch state management
+ * @param stream the cuda stream to use
+ * @param filter_op an optional filtering function to determine which points
+ * should get considered for labeling. It gets global indexes (not batch-wide!)
+ */
+template <typename Index_, int TPB_X = 256,
+          typename Lambda = auto(Index_)->bool>
+void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
+                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
+                     Index_ start_vertex_id, Index_ batch_size,
+                     WeakCCState *state, cudaStream_t stream,
+                     Lambda filter_op) {
+  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8,
+         "Index_ should be 4 or 8 bytes");
+
+  // Seed all N labels: weak_cc_init_all_kernel writes tid + 1 where
+  // filter_op accepts the vertex and MAX_LABEL everywhere else.
+  Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
+  weak_cc_init_all_kernel<Index_, TPB_X>
+    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(
+      labels, N, MAX_LABEL, filter_op);
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  // Run labelling passes until one completes without any thread raising
+  // the device-side flag state->m.
+  bool host_m = false;
+  do {
+    CUDA_CHECK(cudaMemsetAsync(state->m, false, sizeof(bool), stream));
+
+    weak_cc_label_device<Index_, TPB_X>
+      <<<raft::ceildiv(batch_size, Index_(TPB_X)), TPB_X, 0, stream>>>(
+        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id,
+        batch_size, N, filter_op);
+    CUDA_CHECK(cudaPeekAtLastError());
+
+    // Copy the flag to the host and wait for it before testing the loop.
+    raft::update_host(&host_m, state->m, 1, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  } while (host_m);
+}
+
+/**
+ * @brief Partial calculation of the weakly connected components in the
+ * context of a batched algorithm: the labels are computed wrt the sub-graph
+ * represented by the given CSR matrix of dimensions batch_size * N.
+ * Note that this overwrites the labels array and it is the responsibility of
+ * the caller to combine the results from different batches
+ * (cf label/merge_labels.cuh)
+ *
+ * @tparam Index_ the numeric type of non-floating point elements
+ * @tparam TPB_X the threads to use per block when configuring the kernel
+ * @param labels an array for the output labels
+ * @param row_ind the compressed row index of the CSR array
+ * @param row_ind_ptr the row index pointer of the CSR array
+ * @param nnz the size of row_ind_ptr array
+ * @param N number of vertices
+ * @param start_vertex_id the starting vertex index for the current batch
+ * @param batch_size number of vertices for current batch
+ * @param state instance of inter-batch state management
+ * @param stream the cuda stream to use
+ */
+template <typename Index_, int TPB_X = 256>
+void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
+                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
+                     Index_ start_vertex_id, Index_ batch_size,
+                     WeakCCState *state, cudaStream_t stream) {
+  weak_cc_batched<Index_, TPB_X>(  // forward TPB_X; the filter accepts all
+    labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, batch_size, state,
+    stream, [] __device__(Index_ tid) { return true; });
+}
+
+/**
+ * @brief Compute weakly connected components. Note that the resulting labels
+ * may not be taken from a monotonically increasing set (eg. numbers may be
+ * skipped). The MLCommon::Label package contains a primitive `make_monotonic`,
+ * which will make a monotonically increasing set of labels.
+ *
+ * This implementation comes from [1] and solves component labeling problem in
+ * parallel on CSR-indexes based upon the vertex degree and adjacency graph.
+ *
+ * [1] Hawick, K.A et al, 2010. "Parallel graph component labelling with GPUs and CUDA"
+ *
+ * @tparam Index_ the numeric type of non-floating point elements
+ * @tparam TPB_X the threads to use per block when configuring the kernel
+ * @tparam Lambda the type of an optional filter function (int)->bool
+ * @param labels an array for the output labels
+ * @param row_ind the compressed row index of the CSR array
+ * @param row_ind_ptr the row index pointer of the CSR array
+ * @param nnz the size of row_ind_ptr array
+ * @param N number of vertices
+ * @param d_alloc: deviceAllocator to use for temp memory
+ * @param stream the cuda stream to use
+ * @param filter_op an optional filtering function to determine which points
+ * should get considered for labeling. It gets global indexes (not batch-wide!)
+ */
+template <typename Index_ = int, int TPB_X = 256,
+          typename Lambda = auto(Index_)->bool>
+void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
+             Index_ nnz, Index_ N,
+             std::shared_ptr<raft::mr::device::allocator> d_alloc,
+             cudaStream_t stream, Lambda filter_op) {
+  // One-element device flag that weak_cc_batched resets and reads each pass.
+  raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
+  WeakCCState state(m.data());
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
+                                 &state, stream, filter_op);
+}
+
+/**
+ * @brief Compute weakly connected components without a vertex filter. Note
+ * that the resulting labels may not be taken from a monotonically increasing
+ * set (eg. numbers may be skipped). The MLCommon::Label package contains a
+ * primitive `make_monotonic`, which will make a monotonically increasing set.
+ *
+ * This implementation comes from [1] and solves component labeling problem in
+ * parallel on CSR-indexes based upon the vertex degree and adjacency graph.
+ *
+ * [1] Hawick, K.A et al, 2010. "Parallel graph component labelling with GPUs and CUDA"
+ *
+ * @tparam Index_ the numeric type of non-floating point elements
+ * @tparam TPB_X the threads to use per block when configuring the kernel
+ * @param labels an array for the output labels
+ * @param row_ind the compressed row index of the CSR array
+ * @param row_ind_ptr the row index pointer of the CSR array
+ * @param nnz the size of row_ind_ptr array
+ * @param N number of vertices
+ * @param d_alloc: deviceAllocator to use for temp memory
+ * @param stream the cuda stream to use
+ */
+template <typename Index_, int TPB_X = 256>
+void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
+             Index_ nnz, Index_ N,
+             std::shared_ptr<raft::mr::device::allocator> d_alloc,
+             cudaStream_t stream) {
+  raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
+  WeakCCState state(m.data());
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
+                                 &state, stream,
+                                 [] __device__(Index_) { return true; });
+}
+
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh
new file mode 100644
index 0000000000..a0467b9566
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/bin_distance.cuh
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <limits.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/distance/common.h>
+#include <raft/sparse/utils.h>
+#include <raft/sparse/distance/ip_distance.cuh>
+
+#include <nvfunctional>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+// @TODO: Move this into sparse prims (coo_norm)
+template <typename value_idx, typename value_t>
+__global__ void compute_binary_row_norm_kernel(
+  value_t *out, const value_idx *__restrict__ coo_rows,
+  const value_t *__restrict__ data, value_idx nnz) {
+  value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i >= nnz) return;
+
+  // The comparison is kept only because the sparse structure may hold
+  // some stray zeros, and stripping them beforehand would cost more.
+  // Each stored element adds 1 to its row's counter when it equals 1.0,
+  // and 0 otherwise.
+  atomicAdd(&out[coo_rows[i]], data[i] == 1.0);
+}
+
+template <typename value_idx, typename value_t, typename expansion_f>
+__global__ void compute_binary_warp_kernel(value_t *__restrict__ C,
+                                           const value_t *__restrict__ Q_norms,
+                                           const value_t *__restrict__ R_norms,
+                                           value_idx n_rows, value_idx n_cols,
+                                           expansion_f expansion_func) {
+  // The flat element index is kept in 64 bits: the host sizes the grid from
+  // (size_t)n_rows * n_cols, which can exceed the range of value_idx.
+  size_t tid = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
+  value_idx i = tid / n_cols;
+  value_idx j = tid % n_cols;
+  if (i >= n_rows || j >= n_cols) return;
+
+  // Replace the dot product stored in C[i, j] by expansion_func's result.
+  size_t idx = (size_t)i * n_cols + j;
+  C[idx] = expansion_func(C[idx], Q_norms[i], R_norms[j]);
+}
+
+template <typename value_idx, typename value_t, typename expansion_f,
+          int tpb = 1024>  // tpb: threads per block used for the launch
+void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms,
+                    value_idx n_rows, value_idx n_cols,
+                    expansion_f expansion_func, cudaStream_t stream) {
+  int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
+  compute_binary_warp_kernel<<<blocks, tpb, 0, stream>>>(  // 1 thread per C entry
+    C, Q_norms, R_norms, n_rows, n_cols, expansion_func);
+}
+
+template <typename value_idx, typename value_t, typename expansion_f,
+          int tpb = 1024>
+void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
+                          const value_t *Q_data, value_idx Q_nnz,
+                          const value_idx *R_coo_rows, const value_t *R_data,
+                          value_idx R_nnz, value_idx m, value_idx n,
+                          cusparseHandle_t handle,
+                          std::shared_ptr<raft::mr::device::allocator> alloc,
+                          cudaStream_t stream, expansion_f expansion_func) {
+  raft::mr::device::buffer<value_t> Q_norms(alloc, stream, m);
+  raft::mr::device::buffer<value_t> R_norms(alloc, stream, n);
+  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0,
+                             Q_norms.size() * sizeof(value_t), stream));
+  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0,
+                             R_norms.size() * sizeof(value_t), stream));
+  // Zeroing is on `stream`, so it is ordered before the accumulations below.
+  compute_binary_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
+    Q_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_binary_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
+    R_norms.data(), R_coo_rows, R_data, R_nnz);
+  // Apply expansion_func to every (out[i, j], Q_norms[i], R_norms[j]) triple.
+  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func,
+                 stream);
+}
+
+/**
+ * Jaccard distance using the expanded form:
+ * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k)))
+ */
+template <typename value_idx = int, typename value_t = float>
+class jaccard_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit jaccard_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  void compute(value_t *out_dists) {
+    // out_dists first receives the output of ip_dists.compute().
+    ip_dists.compute(out_dists);
+    value_idx *b_rows = ip_dists.b_rows_coo();
+    value_t *b_vals = ip_dists.b_data_coo();
+    // csr_to_coo fills a_coo_rows (a_nnz entries) from A's CSR row pointer.
+    raft::mr::device::buffer<value_idx> a_coo_rows(
+      config_->allocator, config_->stream, config_->a_nnz);
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      a_coo_rows.data(), config_->a_nnz,
+                                      config_->stream);
+
+    // Rewrite each entry of out_dists as 1 - dot / (q_norm + r_norm - dot).
+    compute_bin_distance(
+      out_dists, a_coo_rows.data(), config_->a_data, config_->a_nnz,
+      b_rows, b_vals, config_->b_nnz, config_->a_nrows, config_->b_nrows,
+      config_->handle, config_->allocator, config_->stream,
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        return 1 - (dot / ((q_norm + r_norm) - dot));
+      });
+  }
+
+  ~jaccard_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  raft::mr::device::buffer<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * Dice distance using the expanded form:
+ * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k)^2 + sum(y_k)^2))
+ */
+template <typename value_idx = int, typename value_t = float>
+class dice_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit dice_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  void compute(value_t *out_dists) {
+    // out_dists first receives the output of ip_dists.compute().
+    ip_dists.compute(out_dists);
+    value_idx *b_rows = ip_dists.b_rows_coo();
+    value_t *b_vals = ip_dists.b_data_coo();
+    // csr_to_coo fills a_coo_rows (a_nnz entries) from A's CSR row pointer.
+    raft::mr::device::buffer<value_idx> a_coo_rows(
+      config_->allocator, config_->stream, config_->a_nnz);
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      a_coo_rows.data(), config_->a_nnz,
+                                      config_->stream);
+    // NOTE(review): returns 2*dot/(q^2+r^2), no "1 -" as documented; confirm.
+    compute_bin_distance(
+      out_dists, a_coo_rows.data(), config_->a_data, config_->a_nnz,
+      b_rows, b_vals, config_->b_nnz, config_->a_nrows, config_->b_nrows,
+      config_->handle, config_->allocator, config_->stream,
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        value_t norms_sq = (q_norm * q_norm) + (r_norm * r_norm);
+        return (2 * dot) / norms_sq;
+      });
+  }
+
+  ~dice_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  raft::mr::device::buffer<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+};  // END namespace distance
+};  // END namespace sparse
+};  // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h
new file mode 100644
index 0000000000..712d2c52bd
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/common.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/mr/device/allocator.hpp>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+template <typename value_idx, typename value_t>
+struct distances_config_t {  // inputs handed to the sparse distance classes
+  // left side
+  value_idx a_nrows;     // number of rows
+  value_idx a_ncols;     // number of columns
+  value_idx a_nnz;       // number of stored entries
+  value_idx *a_indptr;   // presumably CSR row offsets - TODO confirm
+  value_idx *a_indices;  // presumably CSR column indices - TODO confirm
+  value_t *a_data;       // stored values
+
+  // right side
+  value_idx b_nrows;     // number of rows
+  value_idx b_ncols;     // number of columns
+  value_idx b_nnz;       // number of stored entries
+  value_idx *b_indptr;   // presumably CSR row offsets - TODO confirm
+  value_idx *b_indices;  // presumably CSR column indices - TODO confirm
+  value_t *b_data;       // stored values
+
+  cusparseHandle_t handle;  // cusparse handle forwarded to the computations
+
+  std::shared_ptr<raft::mr::device::allocator> allocator;  // temp buffers
+  cudaStream_t stream;  // stream the buffers and kernels are issued on
+};
+
+template <typename value_t>
+class distances_t {  // common base of the sparse distance implementations
+ public:
+  virtual void compute(value_t *out) {}  // no-op here; derived classes fill out
+  virtual ~distances_t() = default;      // virtual so derived classes clean up
+};
+
+};  // namespace distance
+}  // namespace sparse
+};  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh
new file mode 100644
index 0000000000..d596c6b852
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/distance/common.h>
+#include <raft/sparse/utils.h>
+#include <raft/sparse/csr.cuh>
+
+#include <limits.h>
+
+#include <nvfunctional>
+
+#include <cusparse_v2.h>
+
+#include <cub/cub.cuh>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
+ * sparse-matrix-sparse-vector multiplication layout (SPMV).
+ * This is intended to be scheduled n_chunks_b times for each row of a.
+ * The steps are as follows:
+ *
+ * 1. Load row from A into dense vector in shared memory.
+ *    This can be further chunked in the future if necessary to support larger
+ *    column sizes.
+ * 2. Threads of block all step through chunks of B in parallel.
+ *    When a new row is encountered in row_indices_b, a segmented
+ *    reduction is performed across the warps and then across the
+ *    block and the final value is written out to global memory.
+ *
+ * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb threads per block configured on launch
+ * @tparam rev if this is true, the reduce/accumulate functions are only
+ *         executed when A[col] == 0.0. when executed before/after !rev
+ *         and A & B are reversed, this allows the full symmetric difference
+ *         and intersection to be computed.
+ * @tparam kv_t data type stored in shared mem cache
+ * @tparam product_f reduce function type (semiring product() function).
+ *                  accepts two arguments of value_t and returns a value_t
+ * @tparam accum_f accumulation function type (semiring sum() function).
+ *                 accepts two arguments of value_t and returns a value_t
+ * @tparam write_f function to write value out. this should be mathematically
+ *                 equivalent to the accumulate function but implemented as
+ *                 an atomic operation on global memory. Accepts two arguments
+ *                 of value_t* and value_t and updates the value given by the
+ *                 pointer.
+ * @param[in] indptrA row offsets (CSR indptr) array for A
+ * @param[in] indicesA column indices array for A
+ * @param[in] dataA data array for A
+ * @param[in] rowsB coo row array for B
+ * @param[in] indicesB column indices array for B
+ * @param[in] dataB data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[in] dim number of features
+ * @param[in] nnz_b number of nonzeros in B
+ * @param[out] out array of size m*n
+ * @param[in] n_blocks_per_row number of blocks of B per row of A
+ * @param[in] chunk_size nnz of B handled per thread; each block covers up to
+ *            chunk_size * tpb nonzeros of B (smem size is set at launch)
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ */
+template <typename value_idx, typename value_t, int tpb, bool rev,
+          typename kv_t, typename product_f, typename accum_f, typename write_f>
+__global__ void balanced_coo_generalized_spmv_kernel(
+  value_idx *indptrA, value_idx *indicesA, value_t *dataA, value_idx *rowsB,
+  value_idx *indicesB, value_t *dataB, value_idx m, value_idx n, value_idx dim,
+  value_idx nnz_b, value_t *out, int n_blocks_per_row, int chunk_size,
+  product_f product_func, accum_f accum_func, write_f write_func) {
+  typedef cub::WarpReduce<value_t> warp_reduce;
+
+  value_idx cur_row_a = blockIdx.x / n_blocks_per_row;
+  value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row;
+  // block-uniform bounds check, done before indptrA is read or any barrier
+  if (cur_row_a >= m || cur_chunk_offset >= n_blocks_per_row) return;
+
+  // chunk starting offset
+  value_idx ind_offset = cur_chunk_offset * chunk_size * tpb;
+  // how many total cols will be processed by this block (should be <= chunk_size * n_threads)
+  value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset);
+
+  int tid = threadIdx.x;
+  int warp_id = tid / raft::warp_size();
+
+  // compute id relative to current warp
+  unsigned int lane_id = tid & (raft::warp_size() - 1);
+  value_idx ind = ind_offset + threadIdx.x;
+  // smem layout: [2 offsets of A | dense row of A (dim) | warp-reduce storage]
+  extern __shared__ char smem[];
+
+  value_idx *offsets_a = (value_idx *)smem;
+  kv_t *A = (kv_t *)(offsets_a + 2);
+  typename warp_reduce::TempStorage *temp_storage =
+    (typename warp_reduce::TempStorage *)(A + dim);
+
+  // Create dense vector A and populate with 0s
+  for (int k = tid; k < dim; k += blockDim.x) A[k] = 0;
+
+  if (tid == 0) {
+    offsets_a[0] = indptrA[cur_row_a];
+    offsets_a[1] = indptrA[cur_row_a + 1];
+  }
+  __syncthreads();
+
+  value_idx start_offset_a = offsets_a[0];
+  value_idx stop_offset_a = offsets_a[1];
+
+  // Convert current row vector in A to dense
+  for (int i = tid; i < (stop_offset_a - start_offset_a); i += blockDim.x) {
+    A[indicesA[start_offset_a + i]] = dataA[start_offset_a + i];
+  }
+
+  __syncthreads();
+
+  if (ind >= nnz_b) return;
+
+  value_idx cur_row_b = -1;
+  value_t c = 0.0;
+
+  auto warp_red = warp_reduce(*(temp_storage + warp_id));
+
+  // coalesced reads from B
+  if (tid < active_chunk_size) {
+    cur_row_b = rowsB[ind];
+    value_t a_col = A[indicesB[ind]];
+    if (!rev || a_col == 0.0) c = product_func(a_col, dataB[ind]);
+  }
+
+  // loop through chunks in parallel, reducing when a new row is
+  // encountered by each thread
+  for (int i = tid; i < active_chunk_size; i += blockDim.x) {
+    value_idx ind_next = ind + blockDim.x;
+    value_idx next_row_b = -1;
+
+    if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next];
+
+    bool diff_rows = next_row_b != cur_row_b;
+
+    if (__any_sync(0xffffffff, diff_rows)) {
+      // grab the threads currently participating in loops.
+      // because any other threads should have returned already.
+      unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b);
+      bool is_leader = get_lowest_peer(peer_group) == lane_id;
+      value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
+
+      // thread with lowest lane id among peers writes out
+      if (is_leader && v != 0.0) {
+        // this conditional should be uniform, since rev is constant
+        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b
+                          : (size_t)cur_row_b * m + cur_row_a;
+        write_func(out + idx, v);
+      }
+
+      c = 0.0;
+    }
+
+    if (next_row_b != -1) {
+      ind = ind_next;
+      value_t a_col = A[indicesB[ind]];
+      if (!rev || a_col == 0.0)
+        c = accum_func(c, product_func(a_col, dataB[ind]));
+      cur_row_b = next_row_b;
+    }
+  }
+}
+
+/**
+ * Computes the maximum number of columns that can be stored
+ * in shared memory in dense form with the given block size
+ * and precision.
+ * @return the maximum number of columns that can be stored in smem
+ */
+template <typename value_idx, typename value_t, int tpb = 1024>
+inline int max_cols_per_block() {
+  // max cols = (total smem - offsets for A - cub reduction smem) / sizeof value
+  const size_t reserved_bytes =
+    (2 * sizeof(value_idx)) + ((tpb / raft::warp_size()) * sizeof(value_t));
+  return (raft::getSharedMemPerBlock() - reserved_bytes) / sizeof(value_t);
+}
+
+template <typename value_idx, typename value_t, int tpb = 1024>
+inline int smem_per_block(int n_cols) {
+  const int max_cols = max_cols_per_block<value_idx, value_t, tpb>();
+  ASSERT(n_cols <= max_cols, "COO SPMV Requires max dimensionality of %d",
+         max_cols);
+  const size_t warp_bytes = (tpb / raft::warp_size()) * sizeof(value_t);
+  return (n_cols * sizeof(value_t)) + (2 * sizeof(value_idx)) + warp_bytes;
+}
+
+/**
+ * Performs generalized sparse-matrix-sparse-matrix multiplication via a
+ * sparse-matrix-sparse-vector layout `out=A*B` where generalized product()
+ * and sum() operations can be used in place of the standard sum and product:
+ *
+ * out_ij = sum_k(product(A_ik, B_jk)), where the sum goes through the values
+ * of k=0..n_cols-1 for which B_jk is nonzero.
+ *
+ * The product and sum operations shall form a semiring algebra with the
+ * following properties:
+ * 1. {+, 0} is a commutative sum reduction monoid with identity element 0
+ * 2. {*, 1} is a product monoid with identity element 1
+ * 3. Multiplication by 0 annihilates x. e.g. product(x, 0) = 0
+ *
+ * Each vector of A is loaded into shared memory in dense form and the
+ * non-zeros of B load balanced across the threads of each block.
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam threads_per_block block size
+ * @tparam chunk_size number of nonzeros of B to process for each row of A
+ *         this value was found through profiling and represents a reasonable
+ *         setting for both large and small densities
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @tparam write_f atomic semiring sum() function
+ * @param[out] out_dists dense array of out distances of size m * n in row-major
+ *             format.
+ * @param[in] config_ distance config object
+ * @param[in] coo_rows_b coo row array for B
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ */
+template <typename value_idx, typename value_t, int threads_per_block = 1024,
+          int chunk_size = 500000, typename product_f, typename accum_f,
+          typename write_f>
+inline void balanced_coo_pairwise_generalized_spmv(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
+  write_f write_func) {
+  CUDA_CHECK(cudaMemsetAsync(
+    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+    config_.stream));
+  int n_blocks_per_row =
+    raft::ceildiv(config_.b_nnz, chunk_size * threads_per_block);
+  int n_blocks = config_.a_nrows * n_blocks_per_row;
+  // empty input: output stays zeroed, and a zero-block launch is invalid
+  if (n_blocks <= 0) return;
+
+  // the kernel's dense row must hold every column either matrix can index
+  value_idx n_cols =
+    config_.a_ncols > config_.b_ncols ? config_.a_ncols : config_.b_ncols;
+  int smem = smem_per_block<value_idx, value_t, threads_per_block>(n_cols);
+  auto kernel = balanced_coo_generalized_spmv_kernel<
+    value_idx, value_t, threads_per_block, false, value_t, product_f, accum_f,
+    write_f>;
+  CUDA_CHECK(cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared));
+  kernel<<<n_blocks, threads_per_block, smem, config_.stream>>>(
+    config_.a_indptr, config_.a_indices, config_.a_data, coo_rows_b,
+    config_.b_indices, config_.b_data, config_.a_nrows, config_.b_nrows, n_cols,
+    config_.b_nnz, out_dists, n_blocks_per_row, chunk_size, product_func,
+    accum_func, write_func);
+  CUDA_CHECK(cudaPeekAtLastError());  // surface launch-configuration errors
+}
+
+/**
+ * Used for computing distances where the reduction (e.g. product()) function
+ * requires an implicit union (product(x, 0) = x) to capture the difference A-B.
+ * This is necessary in some applications because the standard semiring algebra
+ * endowed with the default multiplication product monoid will only
+ * compute the intersection & B-A.
+ *
+ * This particular function is meant to accompany the function
+ * `balanced_coo_pairwise_generalized_spmv` and executes the product operation
+ * on only those columns that exist in B and not A.
+ *
+ * The product and sum operations shall enable the computation of a
+ * non-annihilating semiring algebra with the following properties:
+ * 1. {+, 0} is a commutative sum reduction monoid with identity element 0
+ * 2. {*, 0} is a product monoid with identity element 0
+ * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x
+ *
+ * Manhattan distance sum(abs(x_k-y_k)) is a great example of when this type of
+ * execution pattern is necessary.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam threads_per_block block size
+ * @tparam chunk_size number of nonzeros of B to process for each row of A
+ *         this value was found through profiling and represents a reasonable
+ *         setting for both large and small densities
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @tparam write_f atomic semiring sum() function
+ * @param[out] out_dists dense array of out distances of size m * n
+ * @param[in] config_ distance config object
+ * @param[in] coo_rows_a coo row array for A
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ */
+template <typename value_idx, typename value_t, int threads_per_block = 1024,
+          int chunk_size = 500000, typename product_f, typename accum_f,
+          typename write_f>
+inline void balanced_coo_pairwise_generalized_spmv_rev(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
+  write_f write_func) {
+  int n_blocks_per_row =
+    raft::ceildiv(config_.a_nnz, chunk_size * threads_per_block);
+  int n_blocks = config_.b_nrows * n_blocks_per_row;
+  // empty input: nothing to add to out_dists; a zero-block launch is invalid
+  if (n_blocks <= 0) return;
+
+  // the kernel's dense row must hold every column either matrix can index
+  value_idx n_cols =
+    config_.a_ncols > config_.b_ncols ? config_.a_ncols : config_.b_ncols;
+  int smem = smem_per_block<value_idx, value_t, threads_per_block>(n_cols);
+  auto kernel = balanced_coo_generalized_spmv_kernel<
+    value_idx, value_t, threads_per_block, true, value_t, product_f, accum_f,
+    write_f>;
+  CUDA_CHECK(cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared));
+  kernel<<<n_blocks, threads_per_block, smem, config_.stream>>>(
+    config_.b_indptr, config_.b_indices, config_.b_data, coo_rows_a,
+    config_.a_indices, config_.a_data, config_.b_nrows, config_.a_nrows, n_cols,
+    config_.a_nnz, out_dists, n_blocks_per_row, chunk_size, product_func,
+    accum_func, write_func);
+  CUDA_CHECK(cudaPeekAtLastError());  // surface launch-configuration errors
+}
+}  // namespace distance
+}  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/csr_spmv.cuh b/cpp/include/raft/sparse/distance/csr_spmv.cuh
new file mode 100644
index 0000000000..eff8f9281e
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/csr_spmv.cuh
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/distance/common.h>
+#include <raft/sparse/utils.h>
+#include <raft/sparse/csr.cuh>
+#include <raft/sparse/distance/operators.cuh>
+
+#include <limits.h>
+
+#include <nvfunctional>
+
+#include <cub/cub.cuh>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Semiring which schedules each row of B in a different thread.
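+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb threads per block (number of rows of B handled per batch)
+ * @tparam product_f semiring product() function type
+ * @tparam accum_f semiring sum() function type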
+ */
+template <typename value_idx, typename value_t, int tpb, typename product_f,
+          typename accum_f>
+struct BlockSemiring {
+  __device__ inline BlockSemiring(value_idx n_, value_idx *shared_cols_,
+                                  value_t *shared_vals_, value_idx *offsets_a_)
+    : n(n_),
+      a_cols(shared_cols_),
+      a_vals(shared_vals_),
+      offsets_a(offsets_a_),
+      done(false),
+      a_idx(0),
+      b_row_count(0),
+      cur_sum(0.0) {}
+
+  /**
+   * Load columns and values for a single row of A into a_cols / a_vals.
+   * Calls __syncthreads(), so every thread of the block must call it.
+   * @param row row of A to load
+   * @param indptrA row offsets of A (indptrA[row] .. indptrA[row + 1])
+   * @param indicesA column indices of A
+   * @param dataA values of A
+   */
+  __device__ inline void load_a_shared(value_idx row, value_idx *indptrA,
+                                       value_idx *indicesA, value_t *dataA) {
+    if (threadIdx.x == 0) {
+      offsets_a[0] = indptrA[row];
+      offsets_a[1] = indptrA[row + 1];
+    }
+    __syncthreads();
+
+    value_idx start_offset_a = offsets_a[0];
+    value_idx stop_offset_a = offsets_a[1];
+
+    a_size = stop_offset_a - start_offset_a;
+
+    // Coalesce reads of row from matrix A into shared memory
+    for (int i = threadIdx.x; i < a_size; i += blockDim.x) {
+      a_cols[i] = indicesA[start_offset_a + i];
+      a_vals[i] = dataA[start_offset_a + i];
+    }
+
+    __syncthreads();
+
+    row_a = row;
+  }
+
+  /**
+   * Sets the head for A's pointers so they can be
+   * iterated in each thread. This is used for the
+   * case when the maximum degree of any row in A
+   * is too large to fit into shared memory, so we
+   * default to increasing the size of the L1 cache
+   * and suffering the uncoalesced memory accesses
+   * for both A and B.
+   * @param row row of A to iterate
+   * @param indptrA row offsets of A (indptrA[row] .. indptrA[row + 1])
+   * @param indicesA column indices of A; a_cols points into this array
+   * @param dataA values of A; a_vals points into this array
+   */
+  __device__ inline void load_a(value_idx row, value_idx *indptrA,
+                                value_idx *indicesA, value_t *dataA) {
+    offsets_a[0] = indptrA[row];
+    offsets_a[1] = indptrA[row + 1];
+
+    value_idx start_offset_a = offsets_a[0];
+    value_idx stop_offset_a = offsets_a[1];
+
+    a_size = stop_offset_a - start_offset_a;
+
+    a_cols = indicesA + start_offset_a;
+    a_vals = dataA + start_offset_a;
+
+    row_a = row;
+  }
+
+  /**
+   * Prepare index & offsets for looping through rows of B
+   * @param start_row first row of B in the batch; thread t takes start_row + t
+   * @param indptrB row offsets of B
+   */
+  __device__ inline void load_b(value_idx start_row, value_idx *indptrB) {
+    done = false;
+    a_idx = 0;
+    cur_sum = 0.0;
+
+    value_idx start_row_b = start_row;
+    value_idx stop_row_b = min(start_row_b + tpb, n);
+
+    n_rows_b = stop_row_b - start_row_b;
+
+    if (threadIdx.x < n_rows_b) {
+      row_b = start_row_b + threadIdx.x;
+      value_idx start_offset_b = indptrB[row_b];
+      b_row_count = indptrB[row_b + 1] - start_offset_b;
+      b_idx = start_offset_b;
+      b_idx_stop = start_offset_b + b_row_count;
+    }
+  }
+
+  /**
+   * Perform a single column intersection/union step for A & B, based on
+   * the row of A held by this object and the row of B mapped to this thread.
+   * @param b_cols column indices of B; b_vals holds the matching values
+   * @param product_func semiring product() function
+   * @param accum_func semiring sum() function
+   */
+  __device__ inline void step(value_idx *b_cols, value_t *b_vals,
+                              product_f product_func, accum_f accum_func) {
+    if (threadIdx.x < n_rows_b) {
+      bool local_idx_in_bounds = b_idx < b_idx_stop && b_row_count > 0;
+
+      value_idx b = local_idx_in_bounds ? b_cols[b_idx] : -1;
+      value_t bv = local_idx_in_bounds ? b_vals[b_idx] : 0.0;
+
+      bool a_idx_in_bounds = a_idx < a_size;
+
+      value_idx a = a_idx_in_bounds ? a_cols[a_idx] : -1;
+      value_t av = a_idx_in_bounds ? a_vals[a_idx] : 0.0;
+      // advance B when its column is <= A's, or when A is exhausted (-1)
+      bool run_b = ((b <= a && b != -1) || (b != -1 && a == -1));
+      b_idx += 1 * run_b;
+      value_t b_side = bv * run_b;
+      // advance A under the mirrored conditions; a side not advanced yields 0
+      bool run_a = ((a <= b && a != -1) || (b == -1 && a != -1));
+      a_idx += 1 * run_a;
+      value_t a_side = av * run_a;
+
+      // Apply semiring "sum" & "product" functions locally
+      cur_sum = accum_func(cur_sum, product_func(b_side, a_side));
+
+      // finished when all items in chunk have been
+      // processed
+      done = b == -1 && a == -1;
+
+    } else {
+      done = true;
+    }
+  }
+  // true once this thread exhausted both rows, or owns no row of B
+  __device__ inline bool isdone() { return done; }
+  // store this thread's accumulated value at out[row_a * n + row_b]
+  __device__ inline void write(value_t *out) {
+    if (threadIdx.x < n_rows_b) {
+      out[(size_t)row_a * n + row_b] = cur_sum;
+    }
+  }
+
+ private:
+  bool done;
+
+  int a_size;
+
+  value_idx n_rows_b;
+
+  value_idx b_idx;
+  value_idx b_idx_stop;
+  value_idx a_idx;
+
+  value_t cur_sum;
+
+  value_idx n;
+
+  value_idx row_a;
+  value_idx row_b;
+
+  value_idx *offsets_a;
+  // b_row_count: nnz in this thread's row of B. a_cols / a_vals: row of A,
+  // either staged by load_a_shared() or pointed into indicesA/dataA by load_a()
+  value_idx b_row_count;
+  value_idx *a_cols;
+  value_t *a_vals;
+};
+
+/**
+ * Optimized for large numbers of rows but small enough numbers of columns
+ * that each thread can process their rows in parallel.
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb block size
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @param[in] indptrA csr column index pointer array for A
+ * @param[in] indicesA csr column indices array for A
+ * @param[in] dataA csr data array for A
+ * @param[in] indptrB csr column index pointer array for B
+ * @param[in] indicesB csr column indices array for B
+ * @param[in] dataB csr data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[out] out dense output array of size m * n in row-major layout
+ * @param[in] n_blocks_per_row number of blocks of B scheduled per row of A
+ * @param[in] n_rows_per_block number of rows of B processed by each block
+ * @param[in] buffer_size number of nonzeros to store in smem
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ */
+template <typename value_idx, typename value_t, int tpb, typename product_f,
+          typename accum_f>
+__global__ void classic_csr_semiring_spmv_smem_kernel(
+  value_idx *indptrA, value_idx *indicesA, value_t *dataA, value_idx *indptrB,
+  value_idx *indicesB, value_t *dataB, value_idx m, value_idx n, value_t *out,
+  int n_blocks_per_row, int n_rows_per_block, int buffer_size,
+  product_f product_func, accum_f accum_func) {
+  value_idx out_row = blockIdx.x / n_blocks_per_row;
+  value_idx out_col_start = blockIdx.x % n_blocks_per_row;
+
+  value_idx row_b_start = out_col_start * n_rows_per_block;
+
+  // block-uniform bounds check, done before indptrA is read or any barrier
+  if (out_row >= m || row_b_start >= n) return;
+
+  extern __shared__ char smem[];
+  // smem layout: [2 offsets of A | buffer_size columns | values of A]
+  value_idx *offsets_a = (value_idx *)smem;
+  value_idx *a_cols = offsets_a + 2;
+  value_t *a_vals = (value_t *)(a_cols + buffer_size);
+
+  BlockSemiring<value_idx, value_t, tpb, product_f, accum_f> semiring(
+    n, a_cols, a_vals, offsets_a);
+  semiring.load_a_shared(out_row, indptrA, indicesA, dataA);
+
+  // for each batch, parallelize the resulting rows across threads
+  for (int i = 0; i < n_rows_per_block; i += blockDim.x) {
+    semiring.load_b(row_b_start + i, indptrB);
+    do {
+      semiring.step(indicesB, dataB, product_func, accum_func);
+    } while (!semiring.isdone());
+
+    semiring.write(out);
+  }
+}
+
+template <typename value_idx, typename value_t, int tpb, typename product_f,
+          typename accum_f>
+__global__ void classic_csr_semiring_spmv_kernel(
+  value_idx *indptrA, value_idx *indicesA, value_t *dataA, value_idx *indptrB,
+  value_idx *indicesB, value_t *dataB, value_idx m, value_idx n, value_t *out,
+  int n_blocks_per_row, int n_rows_per_block, product_f product_func,
+  accum_f accum_func) {
+  value_idx out_row = blockIdx.x / n_blocks_per_row;
+  value_idx out_col_start = blockIdx.x % n_blocks_per_row;
+
+  value_idx row_b_start = out_col_start * n_rows_per_block;
+
+  // bounds check first: load_a() below reads indptrA[out_row + 1]
+  if (out_row >= m || row_b_start >= n) return;
+
+  value_idx offsets_a[2];
+
+  BlockSemiring<value_idx, value_t, tpb, product_f, accum_f> semiring(
+    n, indicesA, dataA, offsets_a);
+  semiring.load_a(out_row, indptrA, indicesA, dataA);
+
+  // for each batch, parallel the resulting rows across threads
+  for (int i = 0; i < n_rows_per_block; i += blockDim.x) {
+    semiring.load_b(row_b_start + i, indptrB);
+    do {
+      semiring.step(indicesB, dataB, product_func, accum_func);
+    } while (!semiring.isdone());
+
+    semiring.write(out);
+  }
+}
+
+/**
+ * Compute the maximum number of nonzeros that can be stored in shared
+ * memory per block with the given index and value precision
+ * @return max nnz that can be stored in smem per block
+ */
+template <typename value_idx, typename value_t>
+inline value_idx max_nnz_per_block() {
+  // smem left after the two offsets of A, split across (col, val) pairs
+  const size_t offsets_bytes = 2 * sizeof(value_idx);
+  const size_t bytes_per_nnz = sizeof(value_t) + sizeof(value_idx);
+  return (raft::getSharedMemPerBlock() - offsets_bytes) / bytes_per_nnz;
+}
+
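+/**
+ * @tparam value_idx index type
+ * @param[in,out] out folded (atomicMax) with max(in[i] - in[i - 1]), i in [0, n)
+ * @param[in] in array to difference; in[-1] is read and must be valid
+ * @param[in] n number of elements of `in` to process
+ */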
+template <typename value_idx>
+__global__ void max_kernel(value_idx *out, value_idx *in, value_idx n) {
+  using block_reduce_t = cub::BlockReduce<value_idx, 256>;
+  __shared__ typename block_reduce_t::TempStorage reduce_storage;
+
+  const int gid = blockDim.x * blockIdx.x + threadIdx.x;
+  // in-range threads contribute in[gid] - in[gid - 1]; the rest contribute 0
+  value_idx gap = 0;
+  if (gid < n) gap = in[gid] - in[gid - 1];
+  value_idx agg = block_reduce_t(reduce_storage).Reduce(gap, cub::Max());
+  if (threadIdx.x == 0) atomicMax(out, agg);
+}
+
+template <typename value_idx>
+inline value_idx max_degree(
+  value_idx *indptr, value_idx n_rows,
+  std::shared_ptr<raft::mr::device::allocator> allocator, cudaStream_t stream) {
+  // An empty matrix has no rows to scan, and a zero-block launch is invalid.
+  if (n_rows <= 0) return 0;
+
+  raft::mr::device::buffer<value_idx> max_d(allocator, stream, 1);
+  CUDA_CHECK(cudaMemsetAsync(max_d.data(), 0, sizeof(value_idx), stream));
+
+  // Custom max reduction until https://github.com/rapidsai/cuml/issues/3431
+  // is fixed; the block size must match the BlockReduce width in max_kernel.
+  max_kernel<<<raft::ceildiv(n_rows, 256), 256, 0, stream>>>(
+    max_d.data(), indptr + 1, n_rows);
+  CUDA_CHECK(cudaPeekAtLastError());  // surface launch-configuration errors
+
+  value_idx max_h;
+  raft::update_host(&max_h, max_d.data(), 1, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));  // max_h is valid only after this
+  return max_h;
+}
+
+template <typename value_idx = int, typename value_t = float,
+          int threads_per_block = 64, typename product_f, typename accum_f>
+void _generalized_csr_pairwise_semiring(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  product_f product_func, accum_f accum_func) {
+  // empty input: nothing to do, and ceildiv() below would divide by zero
+  if (config_.a_nrows <= 0 || config_.b_nrows <= 0) return;
+  int n_chunks = 1;
+  int n_rows_per_block = min(n_chunks * threads_per_block, config_.b_nrows);
+  int n_blocks_per_row = raft::ceildiv(config_.b_nrows, n_rows_per_block);
+  int n_blocks = config_.a_nrows * n_blocks_per_row;
+
+  // no shared memory is used on this path, so prefer a larger L1 cache
+  auto kernel =
+    classic_csr_semiring_spmv_kernel<value_idx, value_t, threads_per_block,
+                                     product_f, accum_f>;
+  CUDA_CHECK(cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1));
+  kernel<<<n_blocks, threads_per_block, 0, config_.stream>>>(
+    config_.a_indptr, config_.a_indices, config_.a_data, config_.b_indptr,
+    config_.b_indices, config_.b_data, config_.a_nrows, config_.b_nrows,
+    out_dists, n_blocks_per_row, n_rows_per_block, product_func, accum_func);
+  CUDA_CHECK(cudaPeekAtLastError());  // surface launch-configuration errors
+}
+
+template <typename value_idx = int, typename value_t = float,
+          int threads_per_block = 32, typename product_f, typename accum_f>
+void _generalized_csr_pairwise_smem_semiring(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  product_f product_func, accum_f accum_func, value_idx max_nnz) {
+  // empty input: nothing to do, and ceildiv() below would divide by zero
+  if (config_.a_nrows <= 0 || config_.b_nrows <= 0) return;
+  int n_chunks = 10000;
+  int n_rows_per_block = min(n_chunks * threads_per_block, config_.b_nrows);
+  int n_blocks_per_row = raft::ceildiv(config_.b_nrows, n_rows_per_block);
+  int n_blocks = config_.a_nrows * n_blocks_per_row;
+
+  // TODO: Figure out why performance is worse with smaller smem sizes
+  int smem_size = raft::getSharedMemPerBlock();
+
+  auto kernel =
+    classic_csr_semiring_spmv_smem_kernel<value_idx, value_t, threads_per_block,
+                                          product_f, accum_f>;
+  CUDA_CHECK(cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared));
+  kernel<<<n_blocks, threads_per_block, smem_size, config_.stream>>>(
+    config_.a_indptr, config_.a_indices, config_.a_data, config_.b_indptr,
+    config_.b_indices, config_.b_data, config_.a_nrows, config_.b_nrows,
+    out_dists, n_blocks_per_row, n_rows_per_block, max_nnz, product_func,
+    accum_func);
+  CUDA_CHECK(cudaPeekAtLastError());  // surface launch-configuration errors
+}
+
+/**
+ * Perform generalized sparse-matrix-sparse-vector multiply in
+ * a semiring algebra by allowing the product and sum operations
+ * to be defined. This approach saves the most memory as it can
+ * work directly on a CSR w/o the need for conversion to another
+ * sparse format, does not require any transposition, nor loading
+ * any vectors in dense form. The major drawback to this kernel
+ * is that the non-uniform memory access pattern dominates performance.
+ * When the shared memory option is used, bank conflicts also dominate
+ * performance, making it slower than other options but guaranteeing
+ * that the product() operation will be executed across every column
+ * in A and B.
+ *
+ * This is primarily useful in cases where the product() operation
+ * is non-annihilating (e.g. product(x, 0) = x).
+ *
+ * There are two potential code paths for this primitive- if the largest
+ * degree of any row is small enough to fit in shared memory then shared
+ * memory is used to coalesce the reads from the vectors of A, otherwise
+ * no shared memory is used and all loads from A and B happen independently
+ * in separate threads.
+ *
+ * Iterators are maintained for the vectors from both A and B and each
+ * thread iterates to a maximum of |a|+|b| (which will happen only when
+ * the sets of columns for vectors a and b are completely disjoint).
+ *
+ * TODO: Some potential things to try for future optimizations:
+ *  - Always iterating for n_cols so that each warp is iterating
+ *    a uniform number of times.
+ *  - Computing an argsort() of B based on the number of columns
+ *    in each row to attempt to load balance the warps naturally
+ *  - Finding a way to coalesce the reads
+ *
+ *  Ref: https://github.com/rapidsai/cuml/issues/3371
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @param[out] out_dists dense array of output distances size m * n in row-major layout
+ * @param[in] config_ distance config object
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ */
+template <typename value_idx = int, typename value_t = float,
+          typename product_f, typename accum_f>
+void generalized_csr_pairwise_semiring(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  product_f product_func, accum_f accum_func) {
+  int nnz_upper_bound = max_nnz_per_block<value_idx, value_t>();
+
+  // max_nnz is max(diff(indptrA)) plus one
+  value_idx longest_row = max_degree<value_idx>(
+    config_.a_indptr, config_.a_nrows, config_.allocator, config_.stream);
+  value_idx max_nnz = longest_row + 1;
+
+  if (max_nnz > nnz_upper_bound) {
+    // too large for smem: load each row of A separately
+    _generalized_csr_pairwise_semiring<value_idx, value_t>(
+      out_dists, config_, product_func, accum_func);
+  } else {
+    // use smem
+    _generalized_csr_pairwise_smem_semiring<value_idx, value_t>(
+      out_dists, config_, product_func, accum_func, max_nnz);
+  }
+}
+
+}  // namespace distance
+}  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
new file mode 100644
index 0000000000..1559e9776f
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/linalg/transpose.h>
+#include <raft/sparse/utils.h>
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/convert/dense.cuh>
+#include <raft/sparse/csr.cuh>
+
+#include <raft/sparse/distance/bin_distance.cuh>
+#include <raft/sparse/distance/ip_distance.cuh>
+#include <raft/sparse/distance/l2_distance.cuh>
+#include <raft/sparse/distance/lp_distance.cuh>
+
+#include <cusparse_v2.h>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Compute pairwise distances between A and B, using the provided
+ * input configuration and distance function.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @param[out] out dense output array (size A.nrows * B.nrows)
+ * @param[in] input_config input argument configuration
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg extra metric parameter; only read for
+ *            DistanceType::LpUnexpanded, where it is forwarded as the
+ *            second constructor argument of lp_unexpanded_distances_t
+ *
+ * Throws (via THROW) when `metric` is not one of the cases handled below.
+ */
+template <typename value_idx = int, typename value_t = float>
+void pairwiseDistance(value_t *out,
+                      distances_config_t<value_idx, value_t> input_config,
+                      raft::distance::DistanceType metric, float metric_arg) {
+  switch (metric) {
+    case raft::distance::DistanceType::L2Expanded:
+      l2_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::InnerProduct:
+      ip_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::L2Unexpanded:
+      l2_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::L1:
+      l1_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case raft::distance::DistanceType::LpUnexpanded:
+      lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::Linf:
+      linf_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::Canberra:
+      canberra_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::JaccardExpanded:
+      jaccard_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::CosineExpanded:
+      cosine_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case raft::distance::DistanceType::HellingerExpanded:
+      hellinger_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+
+    default:
+      // Cast explicitly: the enum goes through a printf-style "%d" format,
+      // which expects a plain int.
+      THROW("Unsupported distance: %d", static_cast<int>(metric));
+  }
+}
+
+};  // namespace distance
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh
new file mode 100644
index 0000000000..a832c2b6a9
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/ip_distance.cuh
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <limits.h>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/distance/common.h>
+#include <raft/sparse/linalg/transpose.h>
+#include <raft/sparse/utils.h>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/convert/dense.cuh>
+#include <raft/sparse/distance/coo_spmv.cuh>
+#include <raft/sparse/distance/operators.cuh>
+
+#include <nvfunctional>
+
+#include <cusparse_v2.h>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Common interface for the interchangeable inner product
+ * implementations. Two exist at the moment: one built on the
+ * cusparse gemm and one built on our own semiring spmv.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx, typename value_t>
+class ip_trans_getters_t : public distances_t<value_t> {
+ public:
+  virtual ~ip_trans_getters_t() = default;
+
+  /**
+   * Row index of each stored element of B (coo format). Downstream
+   * distances can use this together with b_data_coo() to compute a
+   * norm rather than point-wise products.
+   * @return
+   */
+  virtual value_idx *b_rows_coo() = 0;
+
+  /**
+   * Values of B's stored elements (coo format). Downstream distances
+   * can use this together with b_rows_coo() to compute a norm rather
+   * than point-wise products.
+   * @return
+   */
+  virtual value_t *b_data_coo() = 0;
+};
+
+/**
+ * Simple inner product distance with sparse matrix multiply. This
+ * uses cusparse and requires both B to be transposed as well as
+ * the output to be explicitly converted to dense form (which requires
+ * 3 copies of the dense data- 2 for the cusparse csr output and
+ * 1 for the final m*n dense matrix.)
+ *
+ * The constructor switches the cusparse handle to host pointer mode;
+ * the destructor restores the previous mode and releases every
+ * cusparse object created by this class.
+ */
+template <typename value_idx, typename value_t>
+class ip_distances_gemm_t : public ip_trans_getters_t<value_idx, value_t> {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_k * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes. Only its
+   *            address is stored, so it must outlive this object.
+   *
+   * TODO: Remove this once we have a semiring SPGEMM
+   * Ref: https://github.com/rapidsai/cuml/issues/3371
+   */
+  explicit ip_distances_gemm_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : alpha(1.0),
+      workspace(config.allocator, config.stream, 0),
+      csc_indptr(config.allocator, config.stream, 0),
+      csc_indices(config.allocator, config.stream, 0),
+      csc_data(config.allocator, config.stream, 0),
+      config_(&config) {
+    init_mat_descriptor(matA);
+    init_mat_descriptor(matB);
+    init_mat_descriptor(matC);
+    init_mat_descriptor(matD);
+
+    CUSPARSE_CHECK(cusparseCreateCsrgemm2Info(&info));
+
+    // Remember the caller's pointer mode so the destructor can restore it.
+    CUSPARSE_CHECK(cusparseGetPointerMode(config.handle, &orig_ptr_mode));
+
+    CUSPARSE_CHECK(
+      cusparseSetPointerMode(config.handle, CUSPARSE_POINTER_MODE_HOST));
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t *out_distances) {
+    // CSR arrays for the sparse product. The indices/data buffers start
+    // empty and are sized once the product's nnz is known.
+    raft::mr::device::buffer<value_idx> out_batch_indptr(
+      config_->allocator, config_->stream, config_->a_nrows + 1);
+    raft::mr::device::buffer<value_idx> out_batch_indices(config_->allocator,
+                                                          config_->stream, 0);
+    raft::mr::device::buffer<value_t> out_batch_data(config_->allocator,
+                                                     config_->stream, 0);
+
+    // Transposes B, sizes the workspace and fills out_batch_indptr.
+    value_idx out_batch_nnz = get_nnz(out_batch_indptr.data());
+
+    out_batch_indices.resize(out_batch_nnz, config_->stream);
+    out_batch_data.resize(out_batch_nnz, config_->stream);
+
+    compute_gemm(out_batch_indptr.data(), out_batch_indices.data(),
+                 out_batch_data.data());
+
+    // Densify the CSR product into the caller's output array.
+    raft::sparse::convert::csr_to_dense(
+      config_->handle, config_->a_nrows, config_->b_nrows,
+      out_batch_indptr.data(), out_batch_indices.data(), out_batch_data.data(),
+      config_->a_nrows, out_distances, config_->stream, true);
+  }
+
+  /** Indices array of the transposed B built by transpose_b(). */
+  value_idx *b_rows_coo() override { return csc_indices.data(); }
+
+  /** Data array of the transposed B built by transpose_b(). */
+  value_t *b_data_coo() override { return csc_data.data(); }
+
+  ~ip_distances_gemm_t() override {
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matA));
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matB));
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matC));
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(matD));
+
+    // Pairs with cusparseCreateCsrgemm2Info() in the constructor.
+    CUSPARSE_CHECK_NO_THROW(cusparseDestroyCsrgemm2Info(info));
+
+    CUSPARSE_CHECK_NO_THROW(
+      cusparseSetPointerMode(config_->handle, orig_ptr_mode));
+  }
+
+ private:
+  /** Creates a zero-based, general-type cusparse matrix descriptor. */
+  void init_mat_descriptor(cusparseMatDescr_t &mat) {
+    CUSPARSE_CHECK(cusparseCreateMatDescr(&mat));
+    CUSPARSE_CHECK(cusparseSetMatIndexBase(mat, CUSPARSE_INDEX_BASE_ZERO));
+    CUSPARSE_CHECK(cusparseSetMatType(mat, CUSPARSE_MATRIX_TYPE_GENERAL));
+  }
+
+  /**
+   * Transposes B, resizes the csrgemm2 workspace and runs the nnz pass.
+   * @param[out] csr_out_indptr indptr of the product (written by cusparse)
+   * @return number of nonzeros reported for the product
+   */
+  value_idx get_nnz(value_idx *csr_out_indptr) {
+    value_idx m = config_->a_nrows, n = config_->b_nrows, k = config_->a_ncols;
+
+    transpose_b();
+
+    size_t workspace_size;
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsrgemm2_buffersizeext<value_t>(
+      config_->handle, m, n, k, &alpha, NULL, matA, config_->a_nnz,
+      config_->a_indptr, config_->a_indices, matB, config_->b_nnz,
+      csc_indptr.data(), csc_indices.data(), matD, 0, NULL, NULL, info,
+      &workspace_size, config_->stream));
+
+    workspace.resize(workspace_size, config_->stream);
+
+    value_idx out_nnz = 0;
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsrgemm2nnz(
+      config_->handle, m, n, k, matA, config_->a_nnz, config_->a_indptr,
+      config_->a_indices, matB, config_->b_nnz, csc_indptr.data(),
+      csc_indices.data(), matD, 0, NULL, NULL, matC, csr_out_indptr, &out_nnz,
+      info, workspace.data(), config_->stream));
+
+    return out_nnz;
+  }
+
+  /**
+   * Runs csrgemm2 to fill the product's indices and data. Must be called
+   * after get_nnz(), which prepares the transposed B and the workspace.
+   * The stream is synchronized before and after the cusparse call.
+   */
+  void compute_gemm(const value_idx *csr_out_indptr, value_idx *csr_out_indices,
+                    value_t *csr_out_data) {
+    value_idx m = config_->a_nrows, n = config_->b_nrows, k = config_->a_ncols;
+
+    CUDA_CHECK(cudaStreamSynchronize(config_->stream));
+
+    CUSPARSE_CHECK(raft::sparse::cusparsecsrgemm2<value_t>(
+      config_->handle, m, n, k, &alpha, matA, config_->a_nnz, config_->a_data,
+      config_->a_indptr, config_->a_indices, matB, config_->b_nnz,
+      csc_data.data(), csc_indptr.data(), csc_indices.data(), NULL, matD, 0,
+      NULL, NULL, NULL, matC, csr_out_data, csr_out_indptr, csr_out_indices,
+      info, workspace.data(), config_->stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(config_->stream));
+  }
+
+  void transpose_b() {
+    /**
+     * Transpose index array into csc
+     */
+    csc_indptr.resize(config_->b_ncols + 1, config_->stream);
+    csc_indices.resize(config_->b_nnz, config_->stream);
+    csc_data.resize(config_->b_nnz, config_->stream);
+
+    raft::sparse::linalg::csr_transpose(
+      config_->handle, config_->b_indptr, config_->b_indices, config_->b_data,
+      csc_indptr.data(), csc_indices.data(), csc_data.data(), config_->b_nrows,
+      config_->b_ncols, config_->b_nnz, config_->allocator, config_->stream);
+  }
+
+  value_t alpha;                        // scalar passed to csrgemm2 (1.0)
+  csrgemm2Info_t info;                  // created in ctor, destroyed in dtor
+  cusparseMatDescr_t matA;
+  cusparseMatDescr_t matB;
+  cusparseMatDescr_t matC;
+  cusparseMatDescr_t matD;
+  cusparsePointerMode_t orig_ptr_mode;  // handle's mode before the ctor ran
+  raft::mr::device::buffer<char> workspace;
+  raft::mr::device::buffer<value_idx> csc_indptr;
+  raft::mr::device::buffer<value_idx> csc_indices;
+  raft::mr::device::buffer<value_t> csc_data;
+  const distances_config_t<value_idx, value_t> *config_;
+};
+
+/**
+ * Inner product distance computed with the balanced coo semiring spmv
+ * (Product / Sum / AtomicAdd). B's row indices are expanded to coo
+ * form once, in the constructor.
+ */
+template <typename value_idx, typename value_t>
+class ip_distances_spmv_t : public ip_trans_getters_t<value_idx, value_t> {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_k * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes. Only its
+   *            address is stored, so it must outlive this object.
+   */
+  explicit ip_distances_spmv_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      coo_rows_b(config.allocator, config.stream, config.b_nnz) {
+    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                      coo_rows_b.data(), config_->b_nnz,
+                                      config_->stream);
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t *out_distances) {
+    /**
+     * Compute pairwise distances and return dense matrix in row-major format
+     */
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+      out_distances, *config_, coo_rows_b.data(), Product(), Sum(),
+      AtomicAdd());
+  }
+
+  /** Coo row indices of B computed in the constructor. */
+  value_idx *b_rows_coo() override { return coo_rows_b.data(); }
+
+  /** B's data array as given in the config (not a copy). */
+  value_t *b_data_coo() override { return config_->b_data; }
+
+  ~ip_distances_spmv_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  raft::mr::device::buffer<value_idx> coo_rows_b;
+};
+
+/**
+ * Inner product distance front-end. Picks the spmv implementation when
+ * A's column count is below max_cols_per_block(), and the cusparse gemm
+ * implementation otherwise, then forwards every call to it.
+ */
+template <typename value_idx = int, typename value_t = float>
+class ip_distances_t : public distances_t<value_t> {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_k * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes
+   */
+  explicit ip_distances_t(const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {
+    const bool needs_gemm =
+      config_->a_ncols >= max_cols_per_block<value_idx, value_t>();
+
+    if (needs_gemm) {
+      internal_ip_dist =
+        std::make_unique<ip_distances_gemm_t<value_idx, value_t>>(*config_);
+    } else {
+      internal_ip_dist =
+        std::make_unique<ip_distances_spmv_t<value_idx, value_t>>(*config_);
+    }
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t *out_distances) {
+    // Delegate to whichever implementation the constructor selected.
+    internal_ip_dist->compute(out_distances);
+  }
+
+  /** Forwards to the selected implementation's b_rows_coo(). */
+  virtual value_idx *b_rows_coo() const {
+    return internal_ip_dist->b_rows_coo();
+  }
+
+  /** Forwards to the selected implementation's b_data_coo(). */
+  virtual value_t *b_data_coo() const { return internal_ip_dist->b_data_coo(); }
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  std::unique_ptr<ip_trans_getters_t<value_idx, value_t>> internal_ip_dist;
+};
+
+/**
+ * Explicit instantiations of ip_distances_t and distances_config_t
+ * for the <int, float> index / value type combination.
+ */
+template class ip_distances_t<int, float>;
+template class distances_config_t<int, float>;
+
+};  // END namespace distance
+};  // END namespace sparse
+};  // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh
new file mode 100644
index 0000000000..9d481e34ef
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/l2_distance.cuh
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <limits.h>
+#include <cmath>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/distance/common.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/unary_op.cuh>
+
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/csr.cuh>
+
+#include <raft/sparse/distance/common.h>
+#include <raft/sparse/distance/ip_distance.cuh>
+
+#include <nvfunctional>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+// @TODO: Move this into sparse prims (coo_norm)
+/**
+ * For each of the nnz stored elements, atomically adds data[i]^2 into
+ * out[coo_rows[i]]. One thread handles one element; `out` is only
+ * accumulated into, never cleared here.
+ */
+template <typename value_idx, typename value_t>
+__global__ void compute_row_norm_kernel(value_t *out,
+                                        const value_idx *__restrict__ coo_rows,
+                                        const value_t *__restrict__ data,
+                                        value_idx nnz) {
+  value_idx elem = blockDim.x * blockIdx.x + threadIdx.x;
+  if (elem >= nnz) return;
+
+  atomicAdd(&out[coo_rows[elem]], data[elem] * data[elem]);
+}
+
+/**
+ * Rewrites each element of the dense n_rows x n_cols matrix C in place:
+ * C[i, j] = expansion_func(C[i, j], Q_sq_norms[i], R_sq_norms[j]).
+ * Results with magnitude below 1e-4 are snapped to 0. One thread per
+ * element.
+ *
+ * The flat thread id is computed in size_t: the host sizes the grid
+ * from (size_t)n_rows * n_cols, which can exceed the range of value_idx.
+ */
+template <typename value_idx, typename value_t, typename expansion_f>
+__global__ void compute_euclidean_warp_kernel(
+  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
+  const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols,
+  expansion_f expansion_func) {
+  const size_t tid = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
+  const size_t cols = (size_t)n_cols;
+  const size_t i = tid / cols;
+  const size_t j = tid % cols;
+
+  // j < cols by construction, so only the row bound needs checking.
+  if (i >= (size_t)n_rows) return;
+
+  // i * cols + j == tid
+  value_t dot = C[tid];
+
+  // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm
+  value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]);
+
+  // correct for small instabilities
+  if (fabs(val) < 0.0001) val = 0.0;
+
+  C[tid] = val;
+}
+
+/**
+ * Launches compute_euclidean_warp_kernel on `stream` with one thread
+ * per element of the n_rows x n_cols matrix C and tpb threads per block.
+ */
+template <typename value_idx, typename value_t, int tpb = 1024,
+          typename expansion_f>
+void compute_euclidean(value_t *C, const value_t *Q_sq_norms,
+                       const value_t *R_sq_norms, value_idx n_rows,
+                       value_idx n_cols, cudaStream_t stream,
+                       expansion_f expansion_func) {
+  // element count is formed in size_t before dividing into blocks
+  const size_t n_elems = (size_t)n_rows * n_cols;
+  int n_blocks = raft::ceildiv<size_t>(n_elems, tpb);
+
+  compute_euclidean_warp_kernel<<<n_blocks, tpb, 0, stream>>>(
+    C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func);
+}
+
+/**
+ * Turns a dense matrix of inner products into expanded-form distances.
+ *
+ * Computes the per-row sums of squares of Q (m rows) and R (n rows)
+ * from their coo rows / data arrays, then rewrites each element of
+ * `out` with expansion_func(dot, q_norm, r_norm). All work is enqueued
+ * on `stream`.
+ *
+ * @param[inout] out dense m x n matrix of inner products, overwritten
+ * @param[in] Q_coo_rows row index of each stored element of Q
+ * @param[in] Q_data value of each stored element of Q
+ * @param[in] Q_nnz number of stored elements of Q
+ * @param[in] R_coo_rows row index of each stored element of R
+ * @param[in] R_data value of each stored element of R
+ * @param[in] R_nnz number of stored elements of R
+ * @param[in] m number of rows of Q
+ * @param[in] n number of rows of R
+ * @param[in] handle cusparse handle (not used by this function)
+ * @param[in] alloc allocator for the temporary norm buffers
+ * @param[in] stream cuda stream to order all work on
+ * @param[in] expansion_func callable (dot, q_norm, r_norm) -> distance
+ */
+template <typename value_idx, typename value_t, int tpb = 1024,
+          typename expansion_f>
+void compute_l2(value_t *out, const value_idx *Q_coo_rows,
+                const value_t *Q_data, value_idx Q_nnz,
+                const value_idx *R_coo_rows, const value_t *R_data,
+                value_idx R_nnz, value_idx m, value_idx n,
+                cusparseHandle_t handle,
+                std::shared_ptr<raft::mr::device::allocator> alloc,
+                cudaStream_t stream, expansion_f expansion_func) {
+  raft::mr::device::buffer<value_t> Q_sq_norms(alloc, stream, m);
+  raft::mr::device::buffer<value_t> R_sq_norms(alloc, stream, n);
+
+  // Zero the accumulators on the same stream as the kernels below, so the
+  // memsets are ordered before the atomicAdd accumulation.
+  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0,
+                             Q_sq_norms.size() * sizeof(value_t), stream));
+  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0,
+                             R_sq_norms.size() * sizeof(value_t), stream));
+
+  compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
+    Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
+    R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
+
+  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream,
+                    expansion_func);
+}
+
+/**
+ * L2 distance using the expanded form:
+ *   sum(x_k^2) + sum(y_k^2) - 2 * sum(x_k * y_k)
+ * The expanded form is more efficient for sparse data.
+ */
+template <typename value_idx = int, typename value_t = float>
+class l2_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit l2_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  void compute(value_t *out_dists) {
+    // Step 1: out_dists <- pairwise inner products.
+    ip_dists.compute(out_dists);
+
+    value_idx *b_coo_rows = ip_dists.b_rows_coo();
+    value_t *b_coo_data = ip_dists.b_data_coo();
+
+    // Step 2: expand A's indptr into one row index per stored element.
+    raft::mr::device::buffer<value_idx> a_coo_rows(
+      config_->allocator, config_->stream, config_->a_nnz);
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      a_coo_rows.data(), config_->a_nnz,
+                                      config_->stream);
+
+    // Step 3: fold the squared row norms into the inner products.
+    auto l2_expansion = [] __device__ __host__(value_t dot, value_t q_norm,
+                                               value_t r_norm) {
+      return -2 * dot + q_norm + r_norm;
+    };
+
+    compute_l2(out_dists, a_coo_rows.data(), config_->a_data, config_->a_nnz,
+               b_coo_rows, b_coo_data, config_->b_nnz, config_->a_nrows,
+               config_->b_nrows, config_->handle, config_->allocator,
+               config_->stream, l2_expansion);
+  }
+
+  ~l2_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  raft::mr::device::buffer<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * Cosine distance using the expanded form:
+ *   1 - ( sum(x_k * y_k) / (sqrt(sum(x_k^2)) * sqrt(sum(y_k^2))) )
+ * The expanded form is more efficient for sparse data.
+ */
+template <typename value_idx = int, typename value_t = float>
+class cosine_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit cosine_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  void compute(value_t *out_dists) {
+    // Step 1: out_dists <- pairwise inner products.
+    ip_dists.compute(out_dists);
+
+    value_idx *b_coo_rows = ip_dists.b_rows_coo();
+    value_t *b_coo_data = ip_dists.b_data_coo();
+
+    // Step 2: expand A's indptr into one row index per stored element.
+    raft::mr::device::buffer<value_idx> a_coo_rows(
+      config_->allocator, config_->stream, config_->a_nnz);
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      a_coo_rows.data(), config_->a_nnz,
+                                      config_->stream);
+
+    // Step 3: normalise each inner product by the two row norms.
+    auto cosine_expansion = [] __device__ __host__(value_t dot, value_t q_norm,
+                                                   value_t r_norm) {
+      value_t norms = sqrt(q_norm) * sqrt(r_norm);
+      // a zero denominator is avoided by evaluating 0/1 in that case
+      value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
+      return 1 - cos;
+    };
+
+    compute_l2(out_dists, a_coo_rows.data(), config_->a_data, config_->a_nnz,
+               b_coo_rows, b_coo_data, config_->b_nnz, config_->a_nrows,
+               config_->b_nrows, config_->handle, config_->allocator,
+               config_->stream, cosine_expansion);
+  }
+
+  ~cosine_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  raft::mr::device::buffer<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k)))
+ * The expanded form is more efficient for sparse data.
+ *
+ * This distance computation modifies A and B in place by taking a sqrt
+ * of every value and then squaring it to convert it back. Because of
+ * this, it is possible that the values in A and B might differ slightly
+ * after this is invoked.
+ */
+template <typename value_idx = int, typename value_t = float>
+class hellinger_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit hellinger_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(config.allocator, config.stream, 0),
+      ip_dists(config) {}
+
+  void compute(value_t *out_dists) {
+    // First sqrt A and B
+    raft::linalg::unaryOp<value_t>(
+      config_->a_data, config_->a_data, config_->a_nnz,
+      [=] __device__(value_t input) { return sqrt(input); }, config_->stream);
+
+    // When A and B share one data array it must only be transformed once.
+    if (config_->a_data != config_->b_data) {
+      raft::linalg::unaryOp<value_t>(
+        config_->b_data, config_->b_data, config_->b_nnz,
+        [=] __device__(value_t input) { return sqrt(input); }, config_->stream);
+    }
+
+    ip_dists.compute(out_dists);
+
+    // Revert sqrt of A and B
+    raft::linalg::unaryOp<value_t>(
+      config_->a_data, config_->a_data, config_->a_nnz,
+      [=] __device__(value_t input) { return input * input; }, config_->stream);
+    if (config_->a_data != config_->b_data) {
+      raft::linalg::unaryOp<value_t>(
+        config_->b_data, config_->b_data, config_->b_nnz,
+        [=] __device__(value_t input) { return input * input; },
+        config_->stream);
+    }
+
+    raft::linalg::unaryOp<value_t>(
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      [=] __device__(value_t input) {
+        // Rounding can push the inner product slightly above 1, which
+        // would make sqrt() return NaN; clamp the radicand at 0. A NaN
+        // input still propagates because the comparison is false for it.
+        value_t radicand = 1 - input;
+        return sqrt(radicand < 0 ? value_t(0) : radicand);
+      },
+      config_->stream);
+  }
+
+  ~hellinger_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  raft::mr::device::buffer<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+};  // END namespace distance
+};  // END namespace sparse
+};  // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh
new file mode 100644
index 0000000000..e991224f1b
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/lp_distance.cuh
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <limits.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/csr.cuh>
+
+#include <raft/sparse/distance/common.h>
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/distance/csr_spmv.cuh>
+#include <raft/sparse/distance/operators.cuh>
+
+#include <nvfunctional>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+/**
+ * Shared driver for the unexpanded distances. Chooses between the
+ * balanced coo spmv (forward pass over B, then reverse pass over A) and
+ * the generalized csr semiring kernel, based on A's column count.
+ */
+template <typename value_idx = int, typename value_t = float,
+          typename product_f, typename accum_f, typename write_f>
+void unexpanded_lp_distances(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> *config_,
+  product_f product_func, accum_f accum_func, write_f write_func) {
+  /**
+   * @TODO: Main logic here:
+   *
+   *  - if n_cols < available smem, just use dense conversion for rows of A
+   *  - if n_cols > available smem but max nnz < available smem, use hashing
+   *    (not yet available)
+   *  - if n_cols > available smem & max_nnz > available smem,
+   *              use batching + hashing only for those large cols
+   *  Ref: https://github.com/rapidsai/cuml/issues/3371
+   */
+
+  if (config_->a_ncols >= max_cols_per_block<value_idx, value_t>()) {
+    // TODO: Find max nnz and set smem based on this value.
+    // Ref: https://github.com/rapidsai/cuml/issues/3371
+    generalized_csr_pairwise_semiring<value_idx, value_t>(
+      out_dists, *config_, product_func, accum_func);
+    return;
+  }
+
+  // TODO: Use n_cols to set shared memory and threads per block
+  // for max occupancy.
+  // Ref: https://github.com/rapidsai/cuml/issues/3371
+
+  // One scratch buffer, sized for the larger of the two inputs, is
+  // reused for B's coo rows first and A's coo rows second.
+  raft::mr::device::buffer<value_idx> scratch_rows(
+    config_->allocator, config_->stream, max(config_->b_nnz, config_->a_nnz));
+
+  // Forward pass, driven by B's coo rows.
+  raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                    scratch_rows.data(), config_->b_nnz,
+                                    config_->stream);
+  balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+    out_dists, *config_, scratch_rows.data(), product_func, accum_func,
+    write_func);
+
+  // Reverse pass, driven by A's coo rows.
+  raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                    scratch_rows.data(), config_->a_nnz,
+                                    config_->stream);
+  balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
+    out_dists, *config_, scratch_rows.data(), product_func, accum_func,
+    write_func);
+}
+
+/**
+ * Computes L1 distances for sparse input. This does not have
+ * an equivalent expanded form, so it is only executed in
+ * an unexpanded form.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx = int, typename value_t = float>
+class l1_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  /**
+   * @param[in] config specifies inputs, outputs, and sizes. Only its
+   *            address is stored, so it must outlive this object.
+   */
+  explicit l1_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
+
+  /**
+   * Runs the unexpanded driver with AbsDiff as product, Sum as
+   * accumulator and AtomicAdd as the write functor.
+   * @param out_dists output distances, passed through to the driver
+   */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
+                                                Sum(), AtomicAdd());
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+};
+
+/**
+ * Computes unexpanded L2 distances for sparse input by summing
+ * squared element-wise differences (SqDiff / Sum / AtomicAdd).
+ * No square root is applied to the result here.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx = int, typename value_t = float>
+class l2_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  /**
+   * @param[in] config specifies inputs, outputs, and sizes. Only its
+   *            address is stored, so it must outlive this object.
+   */
+  explicit l2_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
+
+  /**
+   * Runs the unexpanded driver with SqDiff as product, Sum as
+   * accumulator and AtomicAdd as the write functor.
+   * @param out_dists output distances, passed through to the driver
+   */
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(),
+                                                Sum(), AtomicAdd());
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+};
+
+/**
+ * Computes L-infinity distances for sparse input: absolute
+ * element-wise differences combined with a max instead of a sum.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx = int, typename value_t = float>
+class linf_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit linf_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
+
+  void compute(value_t *out_dists) {
+    AbsDiff product_op;
+    Max accum_op;
+    AtomicMax write_op;
+
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, product_op,
+                                                accum_op, write_op);
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+};
+
+/**
+ * Computes Canberra distances for sparse input: the sum over
+ * elements of |a - b| / (|a| + |b|), where a term whose denominator
+ * is zero contributes zero.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx = int, typename value_t = float>
+class canberra_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit canberra_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
+
+  void compute(value_t *out_dists) {
+    auto canberra_term = [] __device__(value_t a, value_t b) {
+      value_t d = fabs(a) + fabs(b);
+
+      // when the denominator is 0 the term is evaluated as 0/1,
+      // which avoids the division by zero
+      return ((d != 0) * fabs(a - b)) / (d + (d == 0));
+    };
+
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_,
+                                                canberra_term, Sum(),
+                                                AtomicAdd());
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+};
+
+/**
+ * Computes Lp distances for sparse input: accumulates PDiff(p) terms
+ * with the unexpanded driver, then raises every output element to the
+ * power 1/p.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx = int, typename value_t = float>
+class lp_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit lp_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config, value_t p_)
+    : config_(&config), p(p_) {}
+
+  void compute(value_t *out_dists) {
+    // Pass 1: accumulate the per-element PDiff(p) terms.
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p),
+                                                Sum(), AtomicAdd());
+
+    // Pass 2: apply the 1/p power to each of the a_nrows * b_nrows outputs.
+    float inv_p = 1.0f / p;
+    auto take_root = [=] __device__(value_t input) {
+      return pow(input, inv_p);
+    };
+
+    raft::linalg::unaryOp<value_t>(out_dists, out_dists,
+                                   config_->a_nrows * config_->b_nrows,
+                                   take_root, config_->stream);
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t> *config_;
+  value_t p;
+};
+
+};  // END namespace distance
+};  // END namespace sparse
+};  // END namespace raft
diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh
new file mode 100644
index 0000000000..d14a42b407
--- /dev/null
+++ b/cpp/include/raft/sparse/distance/operators.cuh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/device_atomics.cuh>
+
+namespace raft {
+namespace sparse {
+namespace distance {
+/** Binary functor: returns a + b. */
+struct Sum {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return a + b;
+  }
+};
+/** Binary functor: returns the squared difference (a - b)^2. */
+struct SqDiff {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return (a - b) * (a - b);
+  }
+};
+/** Per-element L-p term |a - b|^p; fabs avoids negative/NaN results from pow. */
+struct PDiff {
+  float p;
+
+  PDiff(float p_) : p(p_) {}
+
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return pow(fabs(a - b), p);
+  }
+};
+/** Binary functor: returns fmax(a, b). */
+struct Max {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return fmax(a, b);
+  }
+};
+/** Functor forwarding to atomicAdd(a, b); returns whatever atomicAdd returns. */
+struct AtomicAdd {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
+                                                         value_t b) {
+    return atomicAdd(a, b);
+  }
+};
+/** Functor forwarding to atomicMax(a, b); returns whatever atomicMax returns. */
+struct AtomicMax {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
+                                                         value_t b) {
+    return atomicMax(a, b);
+  }
+};
+/** Binary functor: returns a * b. */
+struct Product {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return a * b;
+  }
+};
+/** Binary functor: returns the absolute difference |a - b|. */
+struct AbsDiff {
+  template <typename value_t>
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+    return fabs(a - b);
+  }
+};
+}  // namespace distance
+}  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
new file mode 100644
index 0000000000..bf3e93a06f
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @brief Count, for each row, the columns in the union of that row of A and B
+ *
+ * One thread handles one row. The row's count is written to
+ * out_rowcounts[row] and atomically added to out_rowcounts[m], so
+ * out_rowcounts needs m + 1 entries with entry m zeroed before the launch.
+ * Assumes column indices are unique within each row of A and of B.
+ *
+ * @param a_ind, b_ind: per-row start offsets into a_indptr / b_indptr
+ * @param a_indptr, b_indptr: column indices of A / B
+ * @param nnz1, nnz2: number of entries in a_indptr / b_indptr
+ * @param m: number of rows (a_val and b_val are not read)
+ * @param out_rowcounts: output counts, m + 1 entries as described above
+ */
+template <typename T, int TPB_X = 128>
+__global__ void csr_add_calc_row_counts_kernel(
+  const int *a_ind, const int *a_indptr, const T *a_val, int nnz1,
+  const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m,
+  int *out_rowcounts) {
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (row < m) {
+    int a_start_idx = a_ind[row];
+    int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
+
+    int b_start_idx = b_ind[row];
+    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+
+    // Every entry of A's row is part of the union.
+    int final_size = a_stop_idx - a_start_idx;
+
+    // Add the entries of B's row whose column does not occur in A's row.
+    // A's columns are searched in place, so no per-thread scratch buffer
+    // (and therefore no device-heap allocation) is needed.
+    for (int j = b_start_idx; j < b_stop_idx; j++) {
+      int cur_col = b_indptr[j];
+      bool found = false;
+      for (int k = a_start_idx; k < a_stop_idx; k++) {
+        if (a_indptr[k] == cur_col) {
+          found = true;
+          break;
+        }
+      }
+
+      if (!found) {
+        final_size++;
+      }
+    }
+
+    out_rowcounts[row] = final_size;
+    raft::myAtomicAdd(out_rowcounts + m, final_size);
+  }
+}
+
+/**
+ * @brief Write the column indices and values of A + B, one thread per row
+ *
+ * Row r of the output starts at out_ind[r]. The entries of A's row are
+ * copied first; each entry of B's row is then added to the output entry
+ * with the same column, or appended when no such entry exists yet.
+ *
+ * @param a_ind, b_ind, out_ind: per-row start offsets of A, B and the output
+ * @param a_indptr, b_indptr, out_indptr: column indices of A, B and output
+ * @param a_val, b_val, out_val: values of A, B and the output
+ * @param nnz1, nnz2: number of entries in A / B
+ * @param m: number of rows
+ */
+template <typename T, int TPB_X = 128>
+__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
+                               const T *a_val, int nnz1, const int *b_ind,
+                               const int *b_indptr, const T *b_val, int nnz2,
+                               int m, int *out_ind, int *out_indptr,
+                               T *out_val) {
+  const int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row >= m) return;
+
+  const int a_begin = a_ind[row];
+  const int a_end = get_stop_idx(row, m, nnz1, a_ind);
+  const int b_begin = b_ind[row];
+  const int b_end = get_stop_idx(row, m, nnz2, b_ind);
+
+  // copy A's row verbatim to the start of the output row
+  const int out_begin = out_ind[row];
+  int out_end = out_begin;
+  for (int j = a_begin; j < a_end; ++j, ++out_end) {
+    out_indptr[out_end] = a_indptr[j];
+    out_val[out_end] = a_val[j];
+  }
+
+  // merge B's row: accumulate on a matching column, otherwise append
+  for (int j = b_begin; j < b_end; ++j) {
+    const int col = b_indptr[j];
+    int k = out_begin;
+    while (k < out_end && out_indptr[k] != col) ++k;
+    if (k < out_end) {
+      out_val[k] += b_val[j];
+    } else {
+      out_indptr[out_end] = col;
+      out_val[out_end] = b_val[j];
+      ++out_end;
+    }
+  }
+}
+
+/**
+ * @brief Compute the row_ind array of the sum of two CSR matrices
+ * @param a_ind: left hand row_ind array
+ * @param a_indptr: left hand index_ptr array
+ * @param a_val: left hand data array
+ * @param nnz1: size of left hand index_ptr and val arrays
+ * @param b_ind: right hand row_ind array
+ * @param b_indptr: right hand index_ptr array
+ * @param b_val: right hand data array
+ * @param nnz2: size of right hand index_ptr and val arrays
+ * @param m: size of output array (number of rows in final matrix)
+ * @param out_ind: output row_ind array (m entries are written)
+ * @param d_alloc: device allocator to use for temp memory
+ * @param stream: cuda stream to use
+ * @return total of the per-row counts, i.e. the nnz of the summed matrix
+ */
+template <typename T, int TPB_X = 128>
+size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
+                         int nnz1, const int *b_ind, const int *b_indptr,
+                         const T *b_val, int nnz2, int m, int *out_ind,
+                         std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                         cudaStream_t stream) {
+  dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
+  dim3 blk(TPB_X, 1, 1);
+
+  raft::mr::device::buffer<int> row_counts(d_alloc, stream, m + 1);
+  CUDA_CHECK(
+    cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
+
+  csr_add_calc_row_counts_kernel<T, TPB_X>
+    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
+                               b_val, nnz2, m, row_counts.data());
+  CUDA_CHECK(cudaPeekAtLastError());  // report launch failures here
+
+  int cnnz = 0;
+  raft::update_host(&cnnz, row_counts.data() + m, 1, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  // create csr compressed row index from row counts
+  auto row_counts_d = thrust::device_pointer_cast(row_counts.data());
+  auto c_ind_d = thrust::device_pointer_cast(out_ind);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), row_counts_d,
+                         row_counts_d + m, c_ind_d);
+
+  return cnnz;
+}
+
+/**
+ * @brief Fill the index_ptr and data arrays of the sum of two CSR matrices,
+ * given the output row_ind array (e.g. as produced by csr_add_calc_inds)
+ * @param a_ind: left hand row_ind array
+ * @param a_indptr: left hand index_ptr array
+ * @param a_val: left hand data array
+ * @param nnz1: size of left hand index_ptr and val arrays
+ * @param b_ind: right hand row_ind array
+ * @param b_indptr: right hand index_ptr array
+ * @param b_val: right hand data array
+ * @param nnz2: size of right hand index_ptr and val arrays
+ * @param m: size of output array (number of rows in final matrix)
+ * @param c_ind: row_ind array of the output (only read by the kernel)
+ * @param c_indptr: output ind_ptr array
+ * @param c_val: output data array
+ * @param stream: cuda stream to use
+ */
+template <typename T, int TPB_X = 128>
+void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val,
+                      int nnz1, const int *b_ind, const int *b_indptr,
+                      const T *b_val, int nnz2, int m, int *c_ind,
+                      int *c_indptr, T *c_val, cudaStream_t stream) {
+  dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
+  dim3 blk(TPB_X, 1, 1);
+
+  csr_add_kernel<T, TPB_X>
+    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
+                               b_val, nnz2, m, c_ind, c_indptr, c_val);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
new file mode 100644
index 0000000000..081fbbe841
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/coo.cuh>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @brief Histogram the row ids of a COO matrix: one thread per entry adds 1
+ * to results[rows[i]].
+ *
+ * @tparam TPB_X: number of threads to use per block
+ * @param rows the rows array of the coo matrix
+ * @param nnz the size of the rows array
+ * @param results array to place results (counts are added to its contents)
+ */
+template <int TPB_X = 64>
+__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) {
+  const int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (idx >= nnz) return;
+
+  raft::myAtomicAdd(results + rows[idx], 1);
+}
+
+/**
+ * @brief Count the number of values for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @param rows: rows array of the COO matrix
+ * @param nnz: size of the rows array
+ * @param results: output array; each row's count is added to results[row]
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64>
+void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) {
+  // one thread per COO entry
+  const dim3 grid(raft::ceildiv(nnz, TPB_X), 1, 1);
+  const dim3 block(TPB_X, 1, 1);
+  coo_degree_kernel<TPB_X><<<grid, block, 0, stream>>>(rows, nnz, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Count the number of values for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: type name of underlying values array
+ * @param in: input COO object for counting rows
+ * @param results: output array with row counts (size=in->n_rows)
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void coo_degree(COO<T> *in, int *results, cudaStream_t stream) {
+  // one thread per COO entry
+  const dim3 grid(raft::ceildiv(in->nnz, TPB_X), 1, 1);
+  const dim3 block(TPB_X, 1, 1);
+  coo_degree_kernel<TPB_X>
+    <<<grid, block, 0, stream>>>(in->rows(), in->nnz, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+/** One thread per COO entry: adds 1 to results[rows[i]] when vals[i] != 0. */
+template <int TPB_X = 64, typename T>
+__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz,
+                                     int *results) {
+  const int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (idx >= nnz || vals[idx] == 0.0) return;
+
+  raft::myAtomicAdd(results + rows[idx], 1);
+}
+/** One thread per COO entry: adds 1 to results[rows[i]] when vals[i] != scalar. */
+template <int TPB_X = 64, typename T>
+__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
+                                         int nnz, T scalar, int *results) {
+  const int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (idx >= nnz || vals[idx] == scalar) return;
+
+  raft::myAtomicAdd(results + rows[idx], 1);
+}
+
+/**
+ * @brief Count, for each row, the values that differ from a given scalar
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param in: Input COO array
+ * @param scalar: entries equal to this value are excluded from the counts
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void coo_degree_scalar(COO<T> *in, T scalar, int *results,
+                       cudaStream_t stream) {
+  const dim3 grid(raft::ceildiv(in->nnz, TPB_X), 1, 1);
+  const dim3 block(TPB_X, 1, 1);
+  coo_degree_scalar_kernel<TPB_X, T><<<grid, block, 0, stream>>>(
+    in->rows(), in->vals(), in->nnz, scalar, results);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Count, for each row, the values that differ from a given scalar
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param rows: Input COO row array
+ * @param vals: Input COO val arrays
+ * @param nnz: size of input COO arrays
+ * @param scalar: entries equal to this value are excluded from the counts
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
+                       int *results, cudaStream_t stream = 0) {
+  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  coo_degree_scalar_kernel<TPB_X, T>
+    <<<grid_rc, TPB_X, 0, stream>>>(rows, vals, nnz, scalar, results);
+  CUDA_CHECK(cudaGetLastError());  // report launch failures here
+}
+
+/**
+ * @brief Count the number of nonzeros for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param rows: Input COO row array
+ * @param vals: Input COO val arrays
+ * @param nnz: size of input COO arrays
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
+                   cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  coo_degree_nz_kernel<TPB_X, T>
+    <<<grid_rc, TPB_X, 0, stream>>>(rows, vals, nnz, results);
+  CUDA_CHECK(cudaGetLastError());  // report launch failures here
+}
+
+/**
+ * @brief Count the number of nonzero values for each row
+ * @tparam TPB_X: number of threads to use per block
+ * @tparam T: the type name of the underlying value arrays
+ * @param in: Input COO array
+ * @param results: output row counts
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void coo_degree_nz(COO<T> *in, int *results, cudaStream_t stream) {
+  dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
+  dim3 blk_rc(TPB_X, 1, 1);
+  coo_degree_nz_kernel<TPB_X, T>
+    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->vals(), in->nnz, results);
+  CUDA_CHECK(cudaGetLastError());  // report launch failures here
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
new file mode 100644
index 0000000000..bfcd3fd592
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <iostream>
+#include <limits>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @brief L1-normalize each CSR row: result[j] = vals[j] / sum(|vals| in row)
+ *
+ * One thread per row. Rows whose absolute sum is zero are written as zeros.
+ * @TODO: This can be done much more parallel by having threads in a warp
+ * compute the sum in parallel over each row and then divide the values in
+ * parallel.
+ *
+ * @tparam TPB_X: number of threads per block (must match the launch config)
+ */
+template <int TPB_X = 64, typename T>
+__global__ void csr_row_normalize_l1_kernel(
+  const int *ia,           // csr row ex_scan (sorted by row)
+  const T *vals, int nnz,  // array of values and number of non-zeros
+  int m,                   // num rows in csr
+  T *result) {             // output array
+  const int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row >= m) return;
+
+  // the last row ends at nnz; every other row ends where the next one begins
+  const int first = ia[row];
+  const int last = (row < m - 1) ? ia[row + 1] : nnz;
+
+  T abs_sum = T(0.0);
+  for (int j = first; j < last; j++) {
+    abs_sum = abs_sum + fabs(vals[j]);
+  }
+
+  const bool has_mass = (abs_sum != 0.0);
+  for (int j = first; j < last; j++) {
+    if (has_mass) {
+      result[j] = vals[j] / abs_sum;
+    } else {
+      result[j] = 0.0;
+    }
+  }
+}
+
+/**
+ * @brief Perform L1 normalization on the rows of a given CSR-formatted sparse matrix
+ *
+ * @param ia: row_ind array
+ * @param vals: data array
+ * @param nnz: size of data array
+ * @param m: size of row_ind array
+ * @param result: l1 normalized data array
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void csr_row_normalize_l1(const int *ia,  // csr row ex_scan (sorted by row)
+                          const T *vals,
+                          int nnz,  // array of values and number of non-zeros
+                          int m,    // num rows in csr
+                          T *result,  // output array
+                          cudaStream_t stream) {
+  // one thread per row
+  const dim3 n_blocks(raft::ceildiv(m, TPB_X), 1, 1);
+  const dim3 n_threads(TPB_X, 1, 1);
+
+  csr_row_normalize_l1_kernel<TPB_X, T>
+    <<<n_blocks, n_threads, 0, stream>>>(ia, vals, nnz, m, result);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Max-normalize each CSR row: result[j] = vals[j] / max(|vals| in row)
+ *
+ * One thread per row. The row maximum is taken over magnitudes (the L_inf
+ * norm, mirroring the fabs() in the L1 kernel), so rows with negative values
+ * are scaled instead of zeroed. Rows whose maximum is zero are written as
+ * zeros.
+ * @TODO: a warp could cooperate on the max and the division of a single row
+ */
+template <int TPB_X = 64, typename T>
+__global__ void csr_row_normalize_max_kernel(
+  const int *ia,           // csr row ind array (sorted by row)
+  const T *vals, int nnz,  // array of values and number of non-zeros
+  int m,                   // num total rows in csr
+  T *result) {             // output array
+
+  // row-based matrix 1 thread per row
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (row < m) {
+    int start_idx = ia[row];
+    // the last row ends at nnz; every other row ends where the next begins
+    int stop_idx = (row < m - 1) ? ia[row + 1] : nnz;
+
+    // Start from 0 rather than numeric_limits<float>::min(): min() is the
+    // smallest positive float, not the most negative value.
+    T max_abs = T(0.0);
+    for (int j = start_idx; j < stop_idx; j++) {
+      T mag = fabs(vals[j]);
+      if (mag > max_abs) max_abs = mag;
+    }
+
+    // divide nonzeros in current row by the row's largest magnitude
+    for (int j = start_idx; j < stop_idx; j++) {
+      result[j] = (max_abs != 0.0) ? vals[j] / max_abs : T(0.0);
+    }
+  }
+}
+
+/**
+ * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix
+ *
+ * @tparam TPB_X: number of threads to use per block
+ * @param ia: row_ind array
+ * @param vals: data array
+ * @param nnz: size of data array
+ * @param m: size of row_ind array
+ * @param result: max normalized data array
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 64, typename T>
+void csr_row_normalize_max(const int *ia,  // csr row ind array (sorted by row)
+                           const T *vals,
+                           int nnz,  // array of values and number of non-zeros
+                           int m,    // num total rows in csr
+                           T *result, cudaStream_t stream) {
+  const dim3 n_blocks(raft::ceildiv(m, TPB_X), 1, 1);
+  const dim3 n_threads(TPB_X, 1, 1);
+
+  csr_row_normalize_max_kernel<TPB_X, T>
+    <<<n_blocks, n_threads, 0, stream>>>(ia, vals, nnz, m, result);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
new file mode 100644
index 0000000000..43638471ad
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+#include <raft/spectral/partition.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/coo.cuh>
+#include <selection/knn.cuh>
+
+namespace raft {
+namespace sparse {
+namespace spectral {
+/** Spectral embedding of a COO matrix; writes n * n_components values to out. */
+template <typename T>
+void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
+                   int nnz, int n, int n_components, T *out) {
+  auto stream = handle.get_stream();
+  auto d_alloc = handle.get_device_allocator();
+  raft::mr::device::buffer<int> src_offsets(d_alloc, stream, n + 1);
+  raft::mr::device::buffer<int> dst_cols(d_alloc, stream, nnz);
+  raft::mr::device::buffer<T> dst_vals(d_alloc, stream, nnz);
+  convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
+                      dst_cols.data(), dst_vals.data());
+  // temporaries handed to raft::spectral::partition() below
+  raft::mr::device::buffer<T> eigVals(d_alloc, stream, n_components + 1);
+  raft::mr::device::buffer<T> eigVecs(d_alloc, stream, n * (n_components + 1));
+  raft::mr::device::buffer<int> labels(d_alloc, stream, n);
+
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  /**
+   * Raft spectral clustering
+   */
+  using index_type = int;
+  using value_type = T;
+
+  index_type *ro = src_offsets.data();
+  index_type *ci = dst_cols.data();
+  value_type *vs = dst_vals.data();
+
+  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
+    handle, ro, ci, vs, n, nnz};
+  // eigen-solver settings; one more eigenpair than n_components is requested
+  index_type neigvs = n_components + 1;
+  index_type maxiter = 4000;  //default reset value (when set to 0);
+  value_type tol = 0.01;
+  index_type restart_iter = 15 + neigvs;  //what cugraph is using
+  auto t_exe_p = thrust::cuda::par.on(stream);
+  using thrust_exe_policy_t = decltype(t_exe_p);
+
+  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter,
+                                                          restart_iter, tol};
+
+  raft::lanczos_solver_t<index_type, value_type> eig_solver{cfg};
+
+  //cluster computation here is irrelevant,
+  //hence define a no-op such solver to
+  //feed partition():
+  //
+  struct no_op_cluster_solver_t {
+    using index_type_t = index_type;
+    using size_type_t = index_type;
+    using value_type_t = value_type;
+
+    std::pair<value_type_t, index_type_t> solve(
+      handle_t const &handle, thrust_exe_policy_t t_exe_policy,
+      size_type_t n_obs_vecs, size_type_t dim,
+      value_type_t const *__restrict__ obs,
+      index_type_t *__restrict__ codes) const {
+      return std::make_pair<value_type_t, index_type_t>(0, 0);
+    }
+  };
+
+  raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver,
+                            no_op_cluster_solver_t{}, labels.data(),
+                            eigVals.data(), eigVecs.data());
+  // skip the first n entries of eigVecs; copy the next n * n_components to out
+  raft::copy<T>(out, eigVecs.data() + n, n * n_components, stream);
+
+  CUDA_CHECK(cudaGetLastError());
+}
+};  // namespace spectral
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
new file mode 100644
index 0000000000..bb298008b7
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/device_atomics.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/coo.cuh>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @brief One thread per row of a row-sorted COO matrix: emits every stored
+ * entry combined with its transpose, plus the mirrored entry when the
+ * transpose is not stored. A row writes at most two entries per stored entry,
+ * starting at output offset 2 * row_ind[row]; other slots are left untouched.
+ *
+ * TODO: value_idx param needs to be used for this once FAISS is updated to
+ * use float32 for indices so that the index types can be uniform
+ *
+ * @param row_ind: start offset of each row in rows/cols/vals (size n)
+ * @param rows, cols, vals: input COO arrays (size cnnz)
+ * @param orows, ocols, ovals: output COO arrays (room for 2 * cnnz entries)
+ * @param n: number of rows
+ * @param cnnz: number of stored entries in the input
+ * @param reduction_op: callable (row, col, value, transposed value) -> T
+ */
+template <int TPB_X = 128, typename T, typename Lambda>
+__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
+                                      T *vals, int *orows, int *ocols, T *ovals,
+                                      int n, int cnnz, Lambda reduction_op) {
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (row >= n) return;
+
+  int start_idx = row_ind[row];
+  int stop_idx = get_stop_idx(row, n, cnnz, row_ind);
+  int out_idx = start_idx * 2;  // next free output slot of this row
+
+  for (int idx = start_idx; idx < stop_idx; idx++) {
+    int cur_row = rows[idx];
+    int cur_col = cols[idx];
+    T cur_val = vals[idx];
+    // Scan row `cur_col` for the transposed entry (cur_col, cur_row).
+    int t_start = row_ind[cur_col];
+    int t_stop = get_stop_idx(cur_col, n, cnnz, row_ind);
+    T transpose = 0.0;
+    bool found_match = false;
+    for (int t_idx = t_start; t_idx < t_stop; t_idx++) {
+      if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) {
+        transpose = vals[t_idx];
+        found_match = true;
+        break;
+      }
+    }
+
+    // Custom reduction on the value and its transpose (0 when not stored);
+    // a plain X + X.T would simply sum the two.
+    T res = reduction_op(cur_row, cur_col, cur_val, transpose);
+
+    // No stored transpose: also emit the mirrored entry to guarantee
+    // symmetry. When the transpose is stored, the thread owning that row
+    // emits it. The test must use this entry's own value (cur_val): the
+    // former vals[idx], with idx relative to the row, read another entry.
+    if (!found_match && cur_val != 0.0) {
+      orows[out_idx] = cur_col;
+      ocols[out_idx] = cur_row;
+      ovals[out_idx] = res;
+      ++out_idx;
+    }
+
+    if (res != 0.0) {
+      orows[out_idx] = cur_row;
+      ocols[out_idx] = cur_col;
+      ovals[out_idx] = res;
+      ++out_idx;
+    }
+  }
+}
+
+/**
+ * @brief Symmetrizes a COO matrix that may not be symmetric by combining
+ * each stored value with its transposed counterpart through a custom
+ * reduction function.
+ *
+ * @param in: Input COO matrix; it is passed to sorted_coo_to_csr
+ * @param out: Output symmetrized COO matrix; must be unallocated on entry
+ * and is allocated here with 2 * in->nnz entries
+ * @param reduction_op: callable (row, col, value, transposed value) -> T
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X = 128, typename T, typename Lambda>
+void coo_symmetrize(COO<T> *in, COO<T> *out,
+                    Lambda reduction_op,  // combines a value with its transpose
+                    std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                    cudaStream_t stream) {
+  ASSERT(!out->validate_mem(), "Expecting unallocated COO for output");
+
+  // Row start offsets of the input, needed by the kernel to locate each row.
+  raft::mr::device::buffer<int> row_offsets(d_alloc, stream, in->n_rows);
+  convert::sorted_coo_to_csr(in, row_offsets.data(), d_alloc, stream);
+
+  out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream);
+
+  const dim3 n_threads(TPB_X, 1, 1);
+  const dim3 n_blocks(raft::ceildiv(in->n_rows, TPB_X), 1, 1);
+  coo_symmetrize_kernel<TPB_X, T><<<n_blocks, n_threads, 0, stream>>>(
+    row_offsets.data(), in->rows(), in->cols(), in->vals(), out->rows(),
+    out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Find how much space needed in each row.
+ * We look through all datapoints and increment the count of the row that
+ * each neighbor index refers to, alternating between two count arrays.
+ *
+ * @param data: Input knn distances(n, k); not read by this kernel
+ * @param indices: Input knn indices(n, k)
+ * @param n: Number of rows
+ * @param k: Number of n_neighbors
+ * @param row_sizes: Input empty row sum 1 array(n), used for odd j
+ * @param row_sizes2: Input empty row sum 2 array(n), used for even j
+ */
+template <typename value_idx = int64_t, typename value_t = float>
+__global__ static void symmetric_find_size(const value_t *restrict data,
+                                           const value_idx *restrict indices,
+                                           const value_idx n, const int k,
+                                           value_idx *restrict row_sizes,
+                                           value_idx *restrict row_sizes2) {
+  const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
+  const auto j =
+    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  if (row >= n || j >= k) return;
+
+  // Index in value_idx: the unsigned 32-bit `row * k` wraps for large n * k.
+  const auto col = indices[static_cast<value_idx>(row) * k + j];
+  // Splitting the counts over two arrays spreads the atomic updates.
+  atomicAdd((j % 2) ? &row_sizes[col] : &row_sizes2[col], value_idx(1));
+}
+
+/**
+ * @brief Reduce sum(row_sizes) + k
+ * Folds the two partial count arrays into row_sizes and adds k to every
+ * row. Reduction for symmetric_find_size kernel.
+ *
+ * @param n: Number of rows
+ * @param k: Number of n_neighbors
+ * @param row_sizes: In/out row sum 1 array(n); receives the total
+ * @param row_sizes2: Input row sum 2 array(n)
+ */
+template <typename value_idx>
+__global__ static void reduce_find_size(const value_idx n, const int k,
+                                        value_idx *restrict row_sizes,
+                                        const value_idx *restrict row_sizes2) {
+  const auto tid = threadIdx.x + (blockDim.x * blockIdx.x);
+  if (tid < n) row_sizes[tid] = row_sizes[tid] + (row_sizes2[tid] + k);
+}
+
+/**
+ * @brief Perform data + data.T operation.
+ * Can only run once row_sizes from the CSR matrix of data + data.T has been
+ * determined.
+ *
+ * @param edges: Next free output slot per row(n); advanced atomically here
+ * @param data: Input knn distances(n, k)
+ * @param indices: Input knn indices(n, k)
+ * @param VAL: Output values for data + data.T
+ * @param COL: Output column indices for data + data.T
+ * @param ROW: Output row indices for data + data.T
+ * @param n: Number of rows
+ * @param k: Number of n_neighbors
+ */
+template <typename value_idx = int64_t, typename value_t = float>
+__global__ static void symmetric_sum(value_idx *restrict edges,
+                                     const value_t *restrict data,
+                                     const value_idx *restrict indices,
+                                     value_t *restrict VAL,
+                                     value_idx *restrict COL,
+                                     value_idx *restrict ROW, const value_idx n,
+                                     const int k) {
+  const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
+  const auto j =
+    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  if (row >= n || j >= k) return;
+
+  // Index in value_idx: the unsigned 32-bit `row * k` wraps for large n * k.
+  const value_idx offset = static_cast<value_idx>(row) * k + j;
+  const auto col = indices[offset];
+  const auto original = atomicAdd(&edges[row], value_idx(1));
+  const auto transpose = atomicAdd(&edges[col], value_idx(1));
+  VAL[transpose] = VAL[original] = data[offset];
+  ROW[original] = row;
+  COL[original] = col;
+  // Notice swapped ROW, COL since transpose
+  ROW[transpose] = col;
+  COL[transpose] = row;
+}
+
+/**
+ * @brief Perform data + data.T on raw KNN data.
+ * The following steps are invoked:
+ * (1) Find how much space needed in each row
+ * (2) Compute final space needed: every (row, neighbor) pair is stored once
+ *     as is and once transposed, hence 2*n*k entries
+ * (3) Allocate new space
+ * (4) Prepare edges for each new row
+ * (5) Perform final data + data.T operation into out's VAL, COL, ROW arrays
+ *
+ * @tparam value_idx: type of the knn indices
+ * @tparam value_t: type of the knn distances
+ * @tparam TPB_X: threads per block over the rows
+ * @tparam TPB_Y: threads per block over the neighbors of a row
+ * @param knn_indices: Input knn indices(n, k)
+ * @param knn_dists: Input knn distances(n, k)
+ * @param n: Number of rows
+ * @param k: Number of n_neighbors
+ * @param out: Output COO Matrix class, allocated here with 2*n*k entries
+ * @param stream: Input cuda stream
+ * @param d_alloc device allocator for temporary buffers
+ */
+template <typename value_idx = int64_t, typename value_t = float,
+          int TPB_X = 32, int TPB_Y = 32>
+void from_knn_symmetrize_matrix(
+  const value_idx *restrict knn_indices, const value_t *restrict knn_dists,
+  const value_idx n, const int k, COO<value_t, value_idx> *out,
+  cudaStream_t stream, std::shared_ptr<raft::mr::device::allocator> d_alloc) {
+  const dim3 block_dims(TPB_X, TPB_Y);
+  const dim3 grid_dims(raft::ceildiv(n, (value_idx)TPB_X),
+                       raft::ceildiv(k, TPB_Y));
+
+  // (1) Find how much space needed in each row
+  // Two zeroed per-row counters that symmetric_find_size fills in.
+  raft::mr::device::buffer<value_idx> row_sizes(d_alloc, stream, n);
+  raft::mr::device::buffer<value_idx> row_sizes2(d_alloc, stream, n);
+  const size_t counter_bytes = sizeof(value_idx) * n;
+  CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, counter_bytes, stream));
+  CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, counter_bytes, stream));
+
+  symmetric_find_size<<<grid_dims, block_dims, 0, stream>>>(
+    knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data());
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  // Merge both counters into row_sizes, adding k to every row.
+  constexpr int reduce_tpb = 1024;
+  reduce_find_size<<<raft::ceildiv(n, (value_idx)reduce_tpb), reduce_tpb, 0,
+                     stream>>>(n, k, row_sizes.data(), row_sizes2.data());
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  // (2) Compute final space needed
+  // No merging of duplicates is done, so the result keeps 2*n*k entries.
+  const auto out_nnz = 2 * n * k;
+
+  // (3) Allocate new space
+  out->allocate(out_nnz, n, n, true, stream);
+
+  // (4) Prepare edges for each new row
+  // The exclusive scan of row_sizes gives every row its first output slot,
+  // like a CSR row pointer. row_sizes2 is no longer needed at this point
+  // and is reused to hold the scan result.
+  value_idx *edges = row_sizes2.data();
+  thrust::device_ptr<value_idx> sizes_begin =
+    thrust::device_pointer_cast(row_sizes.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), sizes_begin,
+                         sizes_begin + n, thrust::device_pointer_cast(edges));
+
+  // (5) Perform final data + data.T operation in tandem with memcpying
+  symmetric_sum<<<grid_dims, block_dims, 0, stream>>>(
+    edges, knn_dists, knn_indices, out->vals(), out->cols(), out->rows(), n, k);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h
new file mode 100644
index 0000000000..6afe4ca8f6
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/transpose.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * Transpose a set of CSR arrays into a set of CSC arrays with cusparse.
+ * @tparam value_idx : data type of the CSR index arrays
+ * @tparam value_t : data type of the CSR data array
+ * @param[in] handle : used for invoking cusparse
+ * @param[in] csr_indptr : CSR indptr array
+ * @param[in] csr_indices : CSR indices array
+ * @param[in] csr_data : CSR data array
+ * @param[out] csc_indptr : CSC indptr array
+ * @param[out] csc_indices : CSC indices array
+ * @param[out] csc_data : CSC data array
+ * @param[in] csr_nrows : Number of rows in CSR
+ * @param[in] csr_ncols : Number of columns in CSR
+ * @param[in] nnz : Number of nonzeros of CSR
+ * @param[in] allocator : Allocator for the conversion workspace
+ * @param[in] stream : Cuda stream for ordering events
+ */
+template <typename value_idx, typename value_t>
+void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr,
+                   const value_idx *csr_indices, const value_t *csr_data,
+                   value_idx *csc_indptr, value_idx *csc_indices,
+                   value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols,
+                   value_idx nnz,
+                   std::shared_ptr<raft::mr::device::allocator> allocator,
+                   cudaStream_t stream) {
+  // Conversion settings shared by the size query and the conversion itself.
+  const auto action = CUSPARSE_ACTION_NUMERIC;
+  const auto idx_base = CUSPARSE_INDEX_BASE_ZERO;
+  const auto algo = CUSPARSE_CSR2CSC_ALG1;
+
+  size_t workspace_bytes = 0;
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(
+    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
+    csc_data, csc_indptr, csc_indices, action, idx_base, algo,
+    &workspace_bytes, stream));
+
+  raft::mr::device::buffer<char> workspace(allocator, stream, workspace_bytes);
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(
+    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
+    csc_data, csc_indptr, csc_indices, action, idx_base, algo,
+    workspace.data(), stream));
+}
+
+};  // end NAMESPACE linalg
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
new file mode 100644
index 0000000000..53359be57c
--- /dev/null
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/coo.cuh>
+#include <raft/sparse/linalg/degree.cuh>
+
+namespace raft {
+namespace sparse {
+namespace op {
+
+// One thread per row: copies the row's entries whose value is non-zero to the
+// output, starting at ex_scan[row]; the input row starts at cur_ex_scan[row].
+template <int TPB_X, typename T>
+__global__ void coo_remove_zeros_kernel(const int *rows, const int *cols,
+                                        const T *vals, int nnz, int *crows,
+                                        int *ccols, T *cvals, int *ex_scan,
+                                        int *cur_ex_scan, int m) {
+  int row = threadIdx.x + (blockIdx.x * TPB_X);
+  if (row >= m) return;
+  const int first = cur_ex_scan[row];
+  const int last = get_stop_idx(row, m, nnz, cur_ex_scan);
+  int write_pos = ex_scan[row];
+
+  for (int src = first; src < last; ++src) {
+    const T value = vals[src];
+    if (value == 0.0) continue;
+    crows[write_pos] = rows[src];
+    ccols[write_pos] = cols[src];
+    cvals[write_pos] = value;
+    ++write_pos;
+  }
+}
+
+// One thread per row: copies the row's entries whose value differs from
+// `scalar` to the output at ex_scan[row]; input row starts at cur_ex_scan[row].
+template <int TPB_X, typename T>
+__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols,
+                                         const T *vals, int nnz, int *crows,
+                                         int *ccols, T *cvals, int *ex_scan,
+                                         int *cur_ex_scan, int m, T scalar) {
+  int row = threadIdx.x + (blockIdx.x * TPB_X);
+  if (row >= m) return;
+  const int first = cur_ex_scan[row];
+  const int last = get_stop_idx(row, m, nnz, cur_ex_scan);
+  int write_pos = ex_scan[row];
+
+  for (int src = first; src < last; ++src) {
+    const T value = vals[src];
+    if (value == scalar) continue;
+    crows[write_pos] = rows[src];
+    ccols[write_pos] = cols[src];
+    cvals[write_pos] = value;
+    ++write_pos;
+  }
+}
+
+/**
+ * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix.
+ *
+ * @param rows: input array of rows (size nnz)
+ * @param cols: input array of cols (size nnz)
+ * @param vals: input array of vals (size nnz)
+ * @param nnz: size of current rows/cols/vals arrays
+ * @param crows: compressed array of rows
+ * @param ccols: compressed array of cols
+ * @param cvals: compressed array of vals
+ * @param cnnz: array of per-row counts of the entries to keep (size n)
+ * @param cur_cnnz array of per-row counts of the input entries (size n)
+ * @param scalar: scalar to remove from arrays
+ * @param n: number of rows in dense matrix
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X, typename T>
+void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz,
+                       int *crows, int *ccols, T *cvals, int *cnnz,
+                       int *cur_cnnz, T scalar, int n,
+                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                       cudaStream_t stream) {
+  raft::mr::device::buffer<int> ex_scan(d_alloc, stream, n);
+  raft::mr::device::buffer<int> cur_ex_scan(d_alloc, stream, n);
+  int *out_offsets = ex_scan.data();
+  int *in_offsets = cur_ex_scan.data();
+  const size_t scan_bytes = n * sizeof(int);
+  CUDA_CHECK(cudaMemsetAsync(out_offsets, 0, scan_bytes, stream));
+  CUDA_CHECK(cudaMemsetAsync(in_offsets, 0, scan_bytes, stream));
+
+  const auto exec_policy = thrust::cuda::par.on(stream);
+  // Where each row starts in the compressed output.
+  thrust::exclusive_scan(exec_policy, thrust::device_pointer_cast(cnnz),
+                         thrust::device_pointer_cast(cnnz) + n,
+                         thrust::device_pointer_cast(out_offsets));
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  // Where each row starts in the input.
+  thrust::exclusive_scan(exec_policy, thrust::device_pointer_cast(cur_cnnz),
+                         thrust::device_pointer_cast(cur_cnnz) + n,
+                         thrust::device_pointer_cast(in_offsets));
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  const dim3 blk(TPB_X, 1, 1);
+  const dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
+  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(
+    rows, cols, vals, nnz, crows, ccols, cvals, out_offsets, in_offsets, n,
+    scalar);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix.
+ *
+ * @param in: input COO matrix
+ * @param out: output COO matrix; allocated here to hold the kept entries
+ * @param scalar: scalar to remove from arrays
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X, typename T>
+void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar,
+                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                       cudaStream_t stream) {
+  const auto n_rows = in->n_rows;
+  raft::mr::device::buffer<int> row_count_nz(d_alloc, stream, n_rows);
+  raft::mr::device::buffer<int> row_count(d_alloc, stream, n_rows);
+  const size_t count_bytes = n_rows * sizeof(int);
+  CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, count_bytes, stream));
+  CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, count_bytes, stream));
+
+  // Per-row entry counts of the input.
+  linalg::coo_degree<TPB_X>(in->rows(), in->nnz, row_count.data(), stream);
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  // Per-row counts of the entries to keep (presumably those != scalar).
+  linalg::coo_degree_scalar<TPB_X>(in->rows(), in->vals(), in->nnz, scalar,
+                                   row_count_nz.data(), stream);
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  thrust::device_ptr<int> kept_begin =
+    thrust::device_pointer_cast(row_count_nz.data());
+  const int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), kept_begin,
+                                     kept_begin + n_rows);
+  out->allocate(out_nnz, n_rows, in->n_cols, false, stream);
+
+  coo_remove_scalar<TPB_X, T>(in->rows(), in->cols(), in->vals(), in->nnz,
+                              out->rows(), out->cols(), out->vals(),
+                              row_count_nz.data(), row_count.data(), scalar,
+                              n_rows, d_alloc, stream);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Removes zeros from a COO formatted sparse matrix by forwarding to
+ * coo_remove_scalar with T(0.0) as the scalar to remove.
+ * @param in: input COO matrix
+ * @param out: output COO matrix
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <int TPB_X, typename T>
+void coo_remove_zeros(COO<T> *in, COO<T> *out,
+                      std::shared_ptr<raft::mr::device::allocator> d_alloc,
+                      cudaStream_t stream) {
+  coo_remove_scalar<TPB_X, T>(in, out, T(0.0), d_alloc, stream);
+}
+
+};  // namespace op
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
new file mode 100644
index 0000000000..9e5034dc28
--- /dev/null
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+namespace op {
+
+// One thread per row: calls op(row, start, stop); the last row stops at nnz.
+template <typename T, int TPB_X = 256, typename Lambda = auto(T, T, T)->void>
+__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz,
+                                  Lambda op) {
+  T row = blockIdx.x * TPB_X + threadIdx.x;
+  if (row >= n_rows) return;
+  T start_idx = row_ind[row];
+  T stop_idx = (row >= n_rows - 1) ? nnz : row_ind[row + 1];
+  op(row, start_idx, stop_idx);
+}
+
+/**
+ * @brief Launch one thread per CSR row and run a custom operation on it.
+ * @tparam Index_ numerical type of row_ind array
+ * @tparam TPB_X number of threads per block to use for underlying kernel
+ * @tparam Lambda type of custom operation function
+ * @param row_ind the CSR row_ind array to perform parallel operations over
+ * @param n_rows total number vertices in graph
+ * @param nnz number of non-zeros
+ * @param op custom row operation functor accepting (row, start, stop index)
+ * @param stream cuda stream to use
+ */
+template <typename Index_, int TPB_X = 256,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op,
+                cudaStream_t stream) {
+  const dim3 blk(TPB_X, 1, 1);
+  const dim3 grid(raft::ceildiv(n_rows, static_cast<Index_>(TPB_X)), 1, 1);
+  // Enough blocks of TPB_X threads to give every row its own thread.
+  csr_row_op_kernel<Index_, TPB_X>
+    <<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // namespace op
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h
new file mode 100644
index 0000000000..46f4f41879
--- /dev/null
+++ b/cpp/include/raft/sparse/op/slice.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/unary_op.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+
+namespace raft {
+namespace sparse {
+namespace op {
+
+/**
+ * Slice consecutive rows from a CSR array and populate newly sliced indptr array
+ * @tparam value_idx : data type of the CSR index arrays
+ * @param[in] start_row : beginning row to slice
+ * @param[in] stop_row : ending row to slice (inclusive)
+ * @param[in] indptr : indptr of input CSR to slice
+ * @param[out] indptr_out : output sliced indptr to populate
+ * @param[out] start_offset : host pointer; receives indptr[start_row]
+ * @param[out] stop_offset : host pointer; receives indptr[stop_row + 1]
+ * @param[in] stream : cuda stream for ordering events
+ */
+template <typename value_idx>
+void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
+                          const value_idx *indptr, value_idx *indptr_out,
+                          value_idx *start_offset, value_idx *stop_offset,
+                          cudaStream_t stream) {
+  // Fetch the offsets that bound the slice in the indices/data arrays.
+  raft::update_host(start_offset, indptr + start_row, 1, stream);
+  raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));  // *start_offset is read below
+
+  // Rows are 0-based and stop_row is inclusive, so the slice has
+  // stop_row - start_row + 1 rows and its indptr one entry more.
+  const value_idx n_entries = (stop_row + 2) - start_row;
+  raft::copy_async(indptr_out, indptr + start_row, n_entries, stream);
+
+  // Rebase the copied offsets so that the slice starts at 0.
+  const value_idx base = *start_offset;
+  raft::linalg::unaryOp<value_idx>(
+    indptr_out, indptr_out, n_entries,
+    [base] __device__(value_idx input) { return input - base; },
+    stream);
+}
+
+/**
+ * Slice rows from a CSR, populate column and data arrays
+ * @tparam value_idx : data type of CSR index arrays
+ * @tparam value_t : data type of CSR data array
+ * @param[in] start_offset : beginning column offset to slice
+ * @param[in] stop_offset : ending column offset to slice (exclusive)
+ * @param[in] indices : column indices array from input CSR
+ * @param[in] data : data array from input CSR
+ * @param[out] indices_out : output column indices array
+ * @param[out] data_out : output data array
+ * @param[in] stream : cuda stream for ordering events
+ */
+template <typename value_idx, typename value_t>
+void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset,
+                            const value_idx *indices, const value_t *data,
+                            value_idx *indices_out, value_t *data_out,
+                            cudaStream_t stream) {
+  const value_idx n_sliced = stop_offset - start_offset;
+  raft::copy(indices_out, indices + start_offset, n_sliced, stream);
+  raft::copy(data_out, data + start_offset, n_sliced, stream);
+}
+
+};  // namespace op
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h
new file mode 100644
index 0000000000..b039e52517
--- /dev/null
+++ b/cpp/include/raft/sparse/op/sort.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/coo.cuh>
+
+namespace raft {
+namespace sparse {
+namespace op {
+
+/**
+ * @brief Sorts the arrays that comprise the coo matrix by row, in place.
+ * The temporary cuSPARSE handle is destroyed on every exit path.
+ *
+ * @param m number of rows in coo matrix
+ * @param n number of cols in coo matrix
+ * @param nnz number of non-zeros
+ * @param rows rows array from coo matrix
+ * @param cols cols array from coo matrix
+ * @param vals vals array from coo matrix
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: cuda stream to use
+ */
+template <typename T>
+void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
+              std::shared_ptr<raft::mr::device::allocator> d_alloc,
+              cudaStream_t stream) {
+  // Owns the cuSPARSE handle so it is also destroyed when a check below
+  // bails out (presumably by throwing); before, that path leaked the handle.
+  struct handle_guard_t {
+    cusparseHandle_t handle = nullptr;
+    ~handle_guard_t() { if (handle != nullptr) cusparseDestroy(handle); }
+  } guard;
+  CUSPARSE_CHECK(cusparseCreate(&guard.handle));
+  cusparseHandle_t handle = guard.handle;
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+
+  size_t pBufferSizeInBytes = 0;
+  CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, rows, cols,
+                                                &pBufferSizeInBytes));
+
+  // d_P records the permutation applied to rows/cols so vals can follow it.
+  raft::mr::device::buffer<int> d_P(d_alloc, stream, nnz);
+  raft::mr::device::buffer<char> pBuffer(d_alloc, stream, pBufferSizeInBytes);
+  CUSPARSE_CHECK(cusparseCreateIdentityPermutation(handle, nnz, d_P.data()));
+  CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, rows, cols,
+                                       d_P.data(), pBuffer.data()));
+
+  // Gather vals into sorted order, then copy the result back in place.
+  raft::mr::device::buffer<T> vals_sorted(d_alloc, stream, nnz);
+  CUSPARSE_CHECK(raft::sparse::cusparsegthr<T>(
+    handle, nnz, vals, vals_sorted.data(), d_P.data(), stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  raft::copy(vals, vals_sorted.data(), nnz, stream);
+}
+
+/**
+ * @brief Sort the underlying COO arrays by row (forwards to the raw overload)
+ * @tparam T: the type name of the underlying value array
+ * @param in: COO to sort by row; its rows/cols/vals arrays are passed along
+ * @param d_alloc device allocator for temporary buffers
+ * @param stream: the cuda stream to use
+ */
+template <typename T>
+void coo_sort(COO<T> *const in,
+              std::shared_ptr<raft::mr::device::allocator> d_alloc,
+              cudaStream_t stream) {
+  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(),
+              in->vals(), d_alloc, stream);
+}
+};  // namespace op
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
new file mode 100644
index 0000000000..3e8fa2bd6f
--- /dev/null
+++ b/cpp/include/raft/sparse/selection/knn.cuh
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/matrix.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/op/slice.h>
+#include <raft/sparse/utils.h>
+#include <raft/sparse/coo.cuh>
+#include <raft/sparse/csr.cuh>
+#include <raft/sparse/distance/distance.cuh>
+#include <raft/sparse/selection/selection.cuh>
+#include <raft/spatial/knn/detail/brute_force_knn.hpp>
+#include <raft/spatial/knn/knn.hpp>
+
+#include <raft/linalg/distance_type.h>
+
+#include <raft/cudart_utils.h>
+
+#include <raft/cuda_utils.cuh>
+
+#include <raft/sparse/cusparse_wrappers.h>
+
+#include <cusparse_v2.h>
+
+namespace raft {
+namespace sparse {
+namespace selection {
+
+// Iterates a CSR matrix in row batches of at most batch_size rows:
+// set_batch() selects the row window, the get_batch_* calls slice that window.
+template <typename value_idx, typename value_t>
+struct csr_batcher_t {
+  csr_batcher_t(value_idx batch_size, value_idx n_rows,
+                const value_idx *csr_indptr, const value_idx *csr_indices,
+                const value_t *csr_data)
+    : batch_size_(batch_size),  // initializers follow declaration order
+      batch_start_(0),
+      batch_stop_(0),
+      batch_rows_(0),
+      total_rows_(n_rows),
+      csr_indptr_(csr_indptr),
+      csr_indices_(csr_indices),
+      csr_data_(csr_data),
+      batch_csr_start_offset_(0),
+      batch_csr_stop_offset_(0) {}
+
+  // Selects batch `batch_num` (0-based). batch_stop_ is inclusive and clamped
+  // to the last row, so the final batch may hold fewer than batch_size_ rows.
+  void set_batch(int batch_num) {
+    batch_start_ = batch_num * batch_size_;
+    batch_stop_ = batch_start_ + batch_size_ - 1;  // zero-based indexing
+    if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1;
+    batch_rows_ = (batch_stop_ - batch_start_) + 1;
+  }
+
+  // Slices indptr of the current batch into `batch_indptr`; the slice helper
+  // also receives the start/stop offset members. Returns stop - start offset.
+  value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr,
+                                     cudaStream_t stream) {
+    raft::sparse::op::csr_row_slice_indptr(
+      batch_start_, batch_stop_, csr_indptr_, batch_indptr,
+      &batch_csr_start_offset_, &batch_csr_stop_offset_, stream);
+    return batch_csr_stop_offset_ - batch_csr_start_offset_;
+  }
+
+  // Fills `csr_indices` / `csr_data` for the current batch from the offsets
+  // recorded during get_batch_csr_indptr_nnz(), so call that one first.
+  void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data,
+                                  cudaStream_t stream) {
+    raft::sparse::op::csr_row_slice_populate(
+      batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_,
+      csr_indices, csr_data, stream);
+  }
+
+  value_idx batch_rows() const { return batch_rows_; }    // rows in the batch
+  value_idx batch_start() const { return batch_start_; }  // first row
+  value_idx batch_stop() const { return batch_stop_; }    // last row, inclusive
+
+ private:
+  value_idx batch_size_;
+  value_idx batch_start_;
+  value_idx batch_stop_;
+  value_idx batch_rows_;
+  value_idx total_rows_;
+  const value_idx *csr_indptr_;
+  const value_idx *csr_indices_;
+  const value_t *csr_data_;
+
+  value_idx batch_csr_start_offset_;
+  value_idx batch_csr_stop_offset_;
+};
+
+template <typename value_idx, typename value_t>
+class sparse_knn_t {
+ public:
+  sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_,
+               const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_,
+               int n_idx_cols_, const value_idx *queryIndptr_,
+               const value_idx *queryIndices_, const value_t *queryData_,
+               size_t queryNNZ_, int n_query_rows_, int n_query_cols_,
+               value_idx *output_indices_, value_t *output_dists_, int k_,
+               cusparseHandle_t cusparseHandle_,
+               std::shared_ptr<raft::mr::device::allocator> allocator_,
+               cudaStream_t stream_,
+               size_t batch_size_index_ = 2 << 14,  // 32768 rows per batch
+               size_t batch_size_query_ = 2 << 14,  // 32768 rows per batch
+               raft::distance::DistanceType metric_ =
+                 raft::distance::DistanceType::L2Expanded,
+               float metricArg_ = 0, bool expanded_form_ = false)
+    : idxIndptr(idxIndptr_),  // arguments are only stored; run() does the work
+      idxIndices(idxIndices_),
+      idxData(idxData_),
+      idxNNZ(idxNNZ_),
+      n_idx_rows(n_idx_rows_),
+      n_idx_cols(n_idx_cols_),
+      queryIndptr(queryIndptr_),
+      queryIndices(queryIndices_),
+      queryData(queryData_),
+      queryNNZ(queryNNZ_),
+      n_query_rows(n_query_rows_),
+      n_query_cols(n_query_cols_),
+      output_indices(output_indices_),
+      output_dists(output_dists_),
+      k(k_),
+      cusparseHandle(cusparseHandle_),
+      allocator(allocator_),
+      stream(stream_),
+      batch_size_index(batch_size_index_),
+      batch_size_query(batch_size_query_),
+      metric(metric_),
+      metricArg(metricArg_),
+      expanded_form(expanded_form_) {}
+
+  // Runs the batched search: for every batch of query rows, every batch of
+  // index rows is visited, a running top-k is kept in a merge buffer, and the
+  // final top-k of the query batch is copied to output_indices / output_dists.
+  void run() {
+    using namespace raft::sparse;
+
+    int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query);
+    csr_batcher_t<value_idx, value_t> query_batcher(
+      batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData);
+
+    size_t rows_processed = 0;
+
+    for (int i = 0; i < n_batches_query; i++) {
+      // Select the i-th batch of query rows
+      query_batcher.set_batch(i);
+
+      // Slice the query CSR down to the rows of this batch
+      raft::mr::device::buffer<value_idx> query_batch_indptr(
+        allocator, stream, query_batcher.batch_rows() + 1);
+
+      value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz(
+        query_batch_indptr.data(), stream);
+
+      raft::mr::device::buffer<value_idx> query_batch_indices(
+        allocator, stream, n_query_batch_nnz);
+      raft::mr::device::buffer<value_t> query_batch_data(allocator, stream,
+                                                         n_query_batch_nnz);
+
+      query_batcher.get_batch_csr_indices_data(query_batch_indices.data(),
+                                               query_batch_data.data(), stream);
+
+      // Merge space made of 3 partitions of batch_rows * k entries each:
+      //   [0] running top-k, [1] top-k of the current index batch,
+      //   [2] result of merging [0] and [1], which is copied back into [0].
+      raft::mr::device::buffer<value_idx> merge_buffer_indices(allocator,
+                                                               stream, 0);
+      raft::mr::device::buffer<value_t> merge_buffer_dists(allocator, stream,
+                                                           0);
+
+      value_t *dists_merge_buffer_ptr;
+      value_idx *indices_merge_buffer_ptr;
+
+      int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index);
+      csr_batcher_t<value_idx, value_t> idx_batcher(
+        batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData);
+
+      for (int j = 0; j < n_batches_idx; j++) {
+        idx_batcher.set_batch(j);
+
+        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, stream);
+        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, stream);
+
+        // Slice the index CSR down to the rows of this batch
+        raft::mr::device::buffer<value_idx> idx_batch_indptr(
+          allocator, stream, idx_batcher.batch_rows() + 1);
+        raft::mr::device::buffer<value_idx> idx_batch_indices(allocator, stream,
+                                                              0);
+        raft::mr::device::buffer<value_t> idx_batch_data(allocator, stream, 0);
+
+        value_idx idx_batch_nnz =
+          idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), stream);
+
+        idx_batch_indices.resize(idx_batch_nnz, stream);
+        idx_batch_data.resize(idx_batch_nnz, stream);
+
+        idx_batcher.get_batch_csr_indices_data(idx_batch_indices.data(),
+                                               idx_batch_data.data(), stream);
+
+        // Dense (query batch rows x index batch rows) tile of distances
+        size_t dense_size =
+          idx_batcher.batch_rows() * query_batcher.batch_rows();
+        raft::mr::device::buffer<value_t> batch_dists(allocator, stream,
+                                                      dense_size);
+
+        // Zero the tile asynchronously on `stream`: the buffer is allocated and
+        // consumed on that stream, so the memset must be ordered on it as well
+        // rather than issued on the default stream.
+        CUDA_CHECK(cudaMemsetAsync(
+          batch_dists.data(), 0, batch_dists.size() * sizeof(value_t), stream));
+
+        compute_distances(idx_batcher, query_batcher, idx_batch_nnz,
+                          n_query_batch_nnz, idx_batch_indptr.data(),
+                          idx_batch_indices.data(), idx_batch_data.data(),
+                          query_batch_indptr.data(), query_batch_indices.data(),
+                          query_batch_data.data(), batch_dists.data());
+
+        // The index CSR slices are not used again in this iteration
+        idx_batch_indptr.release(stream);
+        idx_batch_indices.release(stream);
+        idx_batch_data.release(stream);
+
+        // Build batch indices array
+        raft::mr::device::buffer<value_idx> batch_indices(allocator, stream,
+                                                          batch_dists.size());
+
+        // populate batch indices array: each row gets 0..batch_cols-1, i.e. ids
+        // local to the current index batch
+        value_idx batch_rows = query_batcher.batch_rows(),
+                  batch_cols = idx_batcher.batch_rows();
+
+        iota_fill(batch_indices.data(), batch_rows, batch_cols, stream);
+
+        /**
+         * Perform k-selection on batch & merge with other k-selections
+         */
+        size_t merge_buffer_offset = batch_rows * k;
+        dists_merge_buffer_ptr =
+          merge_buffer_dists.data() + merge_buffer_offset;
+        indices_merge_buffer_ptr =
+          merge_buffer_indices.data() + merge_buffer_offset;
+
+        perform_k_selection(idx_batcher, query_batcher, batch_dists.data(),
+                            batch_indices.data(), dists_merge_buffer_ptr,
+                            indices_merge_buffer_ptr);
+
+        perform_postprocessing(dists_merge_buffer_ptr, batch_rows);
+
+        value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr;
+        value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
+
+        // Merge results of different batches if necessary
+        if (idx_batcher.batch_start() > 0) {
+          size_t merge_buffer_tmp_out = batch_rows * k * 2;
+          dists_merge_buffer_tmp_ptr =
+            merge_buffer_dists.data() + merge_buffer_tmp_out;
+          indices_merge_buffer_tmp_ptr =
+            merge_buffer_indices.data() + merge_buffer_tmp_out;
+
+          merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(),
+                        merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr,
+                        indices_merge_buffer_tmp_ptr);
+        }
+
+        // copy merged output back into merge buffer partition for next iteration
+        raft::copy_async<value_idx>(merge_buffer_indices.data(),
+                                    indices_merge_buffer_tmp_ptr,
+                                    batch_rows * k, stream);
+        raft::copy_async<value_t>(merge_buffer_dists.data(),
+                                  dists_merge_buffer_tmp_ptr, batch_rows * k,
+                                  stream);
+      }
+
+      // Copy final merged batch to output array
+      raft::copy_async<value_idx>(output_indices + (rows_processed * k),
+                                  merge_buffer_indices.data(),
+                                  query_batcher.batch_rows() * k, stream);
+      raft::copy_async<value_t>(output_dists + (rows_processed * k),
+                                merge_buffer_dists.data(),
+                                query_batcher.batch_rows() * k, stream);
+
+      rows_processed += query_batcher.batch_rows();
+    }
+  }
+
+  void perform_postprocessing(value_t *dists, size_t batch_rows) {
+    // Only L2Expanded results requested in non-expanded form are modified
+    if (metric == raft::distance::DistanceType::L2Expanded && !expanded_form) {
+      // Applied in place to batch_rows * k values: |d|^p with the sign of d
+      // restored afterwards. NOTE(review): powf computes in float, so a double
+      // value_t would be narrowed here - confirm this is intended.
+      value_t p = 0.5;  // exponent: square root
+      raft::linalg::unaryOp<value_t>(
+        dists, dists, batch_rows * k,
+        [p] __device__(value_t input) {
+          int neg = input < 0 ? -1 : 1;
+          return powf(fabs(input), p) * neg;
+        },
+        stream);
+    }
+  }
+
+ private:
+  // Combines the 2 partial result sets passed as merge_buffer_* into out_*
+  // with knn_merge_parts, for query_batcher.batch_rows() rows of k entries.
+  void merge_batches(csr_batcher_t<value_idx, value_t> &idx_batcher,
+                     csr_batcher_t<value_idx, value_t> &query_batcher,
+                     value_t *merge_buffer_dists,
+                     value_idx *merge_buffer_indices, value_t *out_dists,
+                     value_idx *out_indices) {
+    // id translation per part: 0 for the first, index batch start for the second
+    std::vector<value_idx> part_offsets{0, idx_batcher.batch_start()};
+
+    raft::mr::device::buffer<value_idx> d_offsets(allocator, stream,
+                                                  part_offsets.size());
+    raft::update_device(d_offsets.data(), part_offsets.data(),
+                        part_offsets.size(), stream);
+
+    constexpr int n_parts = 2;
+    raft::spatial::knn::detail::knn_merge_parts(
+      merge_buffer_dists, merge_buffer_indices, out_dists, out_indices,
+      query_batcher.batch_rows(), n_parts, k, stream, d_offsets.data());
+  }
+
+  // Selects, per query row, the best min(k, batch_cols) entries of the dense
+  // distance tile and writes them (with their ids) to out_dists / out_indices.
+  // InnerProduct selects the largest values, every other metric the smallest.
+  // NOTE(review): when batch_cols < k fewer than k values are written per row
+  // while run() lays the merge buffer out with k entries per row - confirm.
+  void perform_k_selection(const csr_batcher_t<value_idx, value_t> &idx_batcher,
+                           const csr_batcher_t<value_idx, value_t> &query_batcher,
+                           value_t *batch_dists, value_idx *batch_indices,
+                           value_t *out_dists, value_idx *out_indices) {
+    // dense tile shape: one row per query row, one column per index row
+    value_idx batch_rows = query_batcher.batch_rows(),
+              batch_cols = idx_batcher.batch_rows();
+
+    // in the case where the number of idx rows in the batch is < k, we
+    // want to adjust k.
+    value_idx n_neighbors = min(k, batch_cols);
+
+    bool ascending = true;
+    if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
+
+    // kernel to slice first (min) k cols and copy into batched merge buffer
+    select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists,
+             out_indices, ascending, n_neighbors, stream);
+  }
+
+  // Packs both CSR batch slices into a distances_config_t (query batch as A,
+  // index batch as B) and calls pairwiseDistance to fill `batch_dists`.
+  void compute_distances(csr_batcher_t<value_idx, value_t> &idx_batcher,
+                         csr_batcher_t<value_idx, value_t> &query_batcher,
+                         size_t idx_batch_nnz, size_t query_batch_nnz,
+                         value_idx *idx_batch_indptr,
+                         value_idx *idx_batch_indices, value_t *idx_batch_data,
+                         value_idx *query_batch_indptr,
+                         value_idx *query_batch_indices,
+                         value_t *query_batch_data, value_t *batch_dists) {
+    raft::sparse::distance::distances_config_t<value_idx, value_t> cfg;
+    // A side: the current batch of query rows
+    cfg.a_nrows = query_batcher.batch_rows();
+    cfg.a_ncols = n_query_cols;
+    cfg.a_nnz = query_batch_nnz;
+    cfg.a_indptr = query_batch_indptr;
+    cfg.a_indices = query_batch_indices;
+    cfg.a_data = query_batch_data;
+
+    // B side: the current batch of index rows
+    cfg.b_nrows = idx_batcher.batch_rows();
+    cfg.b_ncols = n_idx_cols;
+    cfg.b_nnz = idx_batch_nnz;
+    cfg.b_indptr = idx_batch_indptr;
+    cfg.b_indices = idx_batch_indices;
+    cfg.b_data = idx_batch_data;
+
+    // execution resources shared by both sides
+    cfg.handle = cusparseHandle;
+    cfg.allocator = allocator;
+    cfg.stream = stream;
+
+    raft::sparse::distance::pairwiseDistance(batch_dists, cfg, metric,
+                                             metricArg);
+  }
+
+  const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices;
+  value_idx *output_indices;
+  const value_t *idxData, *queryData;
+  value_t *output_dists;
+
+  size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query;
+
+  raft::distance::DistanceType metric;
+
+  float metricArg;
+
+  bool expanded_form;
+
+  int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
+
+  cusparseHandle_t cusparseHandle;
+
+  std::shared_ptr<raft::mr::device::allocator> allocator;
+
+  cudaStream_t stream;
+};
+
+/**
+   * Brute-force k-nearest-neighbors search of sparse (CSR) query rows in a sparse (CSR) index
+   * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
+   * @param[in] idxIndices csr column indices array of the index matrix (size idxNNZ)
+   * @param[in] idxData csr data array of the index matrix (size idxNNZ)
+   * @param[in] idxNNZ number of non-zeros for sparse index matrix
+   * @param[in] n_idx_rows number of data samples in index matrix
+   * @param[in] n_idx_cols number of features in index matrix
+   * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
+   * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
+   * @param[in] queryData csr data array of the query matrix (size queryNNZ)
+   * @param[in] queryNNZ number of non-zeros for sparse query matrix
+   * @param[in] n_query_rows number of data samples in query matrix
+   * @param[in] n_query_cols number of features in query matrix
+   * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
+   * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
+   * @param[in] k the number of neighbors to query
+   * @param[in] cusparseHandle the initialized cusparseHandle instance to use
+   * @param[in] allocator device allocator instance to use
+   * @param[in] stream CUDA stream to order operations with respect to
+   * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
+   * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
+   * @param[in] metric distance metric/measure to use
+   * @param[in] metricArg argument for metric (forwarded to the distance computation)
+   * @param[in] expanded_form if false, L2Expanded distances are square-rooted before output
+   */
+template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
+void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices,
+                     const value_t *idxData, size_t idxNNZ, int n_idx_rows,
+                     int n_idx_cols, const value_idx *queryIndptr,
+                     const value_idx *queryIndices, const value_t *queryData,
+                     size_t queryNNZ, int n_query_rows, int n_query_cols,
+                     value_idx *output_indices, value_t *output_dists, int k,
+                     cusparseHandle_t cusparseHandle,
+                     std::shared_ptr<raft::mr::device::allocator> allocator,
+                     cudaStream_t stream,
+                     size_t batch_size_index = 2 << 14,  // 32768 rows per batch
+                     size_t batch_size_query = 2 << 14,
+                     raft::distance::DistanceType metric =
+                       raft::distance::DistanceType::L2Expanded,
+                     float metricArg = 0, bool expanded_form = false) {
+  sparse_knn_t<value_idx, value_t>(
+    idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr,
+    queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols,
+    output_indices, output_dists, k, cusparseHandle, allocator, stream,
+    batch_size_index, batch_size_query, metric, metricArg, expanded_form)
+    .run();
+}
+
+};  // namespace selection
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh
new file mode 100644
index 0000000000..6066a36289
--- /dev/null
+++ b/cpp/include/raft/sparse/selection/selection.cuh
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/matrix/matrix.cuh>
+
+#include <raft/sparse/coo.cuh>
+#include <raft/sparse/csr.cuh>
+#include <raft/sparse/distance/distance.cuh>
+
+#include <faiss/gpu/GpuDistance.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/StandardGpuResources.h>
+#include <faiss/utils/Heap.h>
+#include <faiss/gpu/utils/Limits.cuh>
+#include <faiss/gpu/utils/Select.cuh>
+
+#include <cusparse_v2.h>
+
+namespace raft {
+namespace sparse {
+namespace selection {
+
+// One block per row: keeps k entries of row blockIdx.x of inK/inV in a faiss
+// BlockSelect heap and writes them to outK/outV. NOTE(review): select_min is
+// BlockSelect's direction flag; select_k_impl passes false for minima - confirm.
+template <typename K, typename IndexType, bool select_min, int warp_q,
+          int thread_q, int tpb>
+__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows,
+                                size_t n_cols, K *outK, IndexType *outV,
+                                K initK, IndexType initV, int k) {
+  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  __shared__ K smemK[kNumWarps * warp_q];
+  __shared__ IndexType smemV[kNumWarps * warp_q];
+
+  faiss::gpu::BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>,
+                          warp_q, thread_q, tpb>
+    heap(initK, initV, smemK, smemV, k);
+
+  // Grid is exactly sized to rows available
+  int row = blockIdx.x;
+  int i = threadIdx.x;
+  // 64-bit row offsets: row * n_cols and row * k can exceed the range of int
+  size_t idx = static_cast<size_t>(row) * n_cols;
+  size_t out_idx = static_cast<size_t>(row) * k;
+  K *inKStart = inK + idx + i;
+  IndexType *inVStart = inV + idx + i;
+
+  // Whole warps must participate in the selection
+  int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
+  for (; i < limit; i += tpb) {
+    inKStart = inK + idx + i;
+    inVStart = inV + idx + i;
+    heap.add(*inKStart, *inVStart);
+  }
+
+  // Handle last remainder fraction of a warp of elements
+  if (i < n_cols) {
+    inKStart = inK + idx + i;
+    inVStart = inV + idx + i;
+    heap.addThreadQ(*inKStart, *inVStart);
+  }
+  heap.reduce();
+
+  for (int i = threadIdx.x; i < k; i += tpb) {
+    outK[out_idx + i] = smemK[i];
+    outV[out_idx + i] = smemV[i];
+  }
+}
+
+template <typename value_idx = int, typename value_t = float, int warp_q,
+          int thread_q>
+inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows,
+                          size_t n_cols, value_t *outK, value_idx *outV,
+                          bool select_min, int k, cudaStream_t stream) {
+  auto grid = dim3(n_rows);
+  // one thread block per row; 128 threads unless warp_q exceeds 1024
+  constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
+  auto block = dim3(n_threads);
+  // initial heap key: Limits::getMax() when selecting minima, else getMin()
+  auto kInit = select_min ? faiss::gpu::Limits<value_t>::getMax()
+                          : faiss::gpu::Limits<value_t>::getMin();
+  auto vInit = -1;
+  if (select_min) {  // NOTE(review): kernel flag is !select_min - confirm
+    select_k_kernel<value_t, value_idx, false, warp_q, thread_q, n_threads>
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
+                                   vInit, k);
+  } else {
+    select_k_kernel<value_t, value_idx, true, warp_q, thread_q, n_threads>
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
+                                   vInit, k);
+  }
+  CUDA_CHECK(cudaGetLastError());
+}
+
+/**
+ * @brief Select k entries per row (smallest or largest distances) from dense
+ * distance and index matrices. Nothing is launched when k > 1024.
+ *
+ * @param[in] inK dense distance matrix (n_rows x n_cols)
+ * @param[in] inV dense index matrix (n_rows x n_cols)
+ * @param[in] n_rows number of rows in distance and index matrices
+ * @param[in] n_cols number of columns in distance and index matrices
+ * @param[out] outK selected distances (k per row)
+ * @param[out] outV selected indices (k per row)
+ * @param[in] select_min whether to select the min or the max distances
+ * @param[in] k number of entries to select per row (k <= 1024)
+ * @param[in] stream CUDA stream to use
+ */
+template <typename value_idx = int, typename value_t = float>
+inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols,
+                     value_t *outK, value_idx *outV, bool select_min, int k,
+                     cudaStream_t stream) {
+  if (k == 1)
+    select_k_impl<value_idx, value_t, 1, 1>(inK, inV, n_rows, n_cols, outK,
+                                            outV, select_min, k, stream);
+  else if (k <= 32)
+    select_k_impl<value_idx, value_t, 32, 2>(inK, inV, n_rows, n_cols, outK,
+                                             outV, select_min, k, stream);
+  else if (k <= 64)
+    select_k_impl<value_idx, value_t, 64, 3>(inK, inV, n_rows, n_cols, outK,
+                                             outV, select_min, k, stream);
+  else if (k <= 128)
+    select_k_impl<value_idx, value_t, 128, 3>(inK, inV, n_rows, n_cols, outK,
+                                              outV, select_min, k, stream);
+  else if (k <= 256)
+    select_k_impl<value_idx, value_t, 256, 4>(inK, inV, n_rows, n_cols, outK,
+                                              outV, select_min, k, stream);
+  else if (k <= 512)
+    select_k_impl<value_idx, value_t, 512, 8>(inK, inV, n_rows, n_cols, outK,
+                                              outV, select_min, k, stream);
+  else if (k <= 1024)
+    select_k_impl<value_idx, value_t, 1024, 8>(inK, inV, n_rows, n_cols, outK,
+                                               outV, select_min, k, stream);
+}
+
+};  // namespace selection
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h
new file mode 100644
index 0000000000..63578bf1f3
--- /dev/null
+++ b/cpp/include/raft/sparse/utils.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace raft {
+namespace sparse {
+
+/**
+ * Quantizes ncols to a valid blockdim: 32 doubled
+ * until it is >= ncols, capped at 1024. The result
+ * is always a multiple of 32
+ * (e.g. 1 -> 32, 33 -> 64, 5000 -> 1024).
+ *
+ * @param[in] ncols number of columns to quantize
+ * @return one of 32, 64, 128, 256, 512 or 1024
+ */
+template <typename value_idx>
+inline int block_dim(value_idx ncols) {
+  constexpr int kMinBlockDim = 32;
+  constexpr int kMaxBlockDim = 1024;
+
+  // start from the smallest candidate ...
+  int blockdim = kMinBlockDim;
+
+  // ... and double it until it covers ncols or reaches the cap
+  while (blockdim < kMaxBlockDim && ncols > blockdim) {
+    blockdim *= 2;
+  }
+
+  return blockdim;
+}
+
+// pre-volta (< SM_70) stand-in for __match_any_sync; undefined arch counts as 0
+#if __CUDA_ARCH__ < 700
+/**
+ * Returns a warp-level mask with 1's for the lanes that hold the same key.
+ * @tparam G type of the key (compared with ==, exchanged with __shfl_sync)
+ * @param init_mask mask of lanes taking part (passed to __ballot_sync)
+ * @param key value this lane contributes
+ * @return bit mask of the lanes whose key equals this lane's key
+ */
+template <typename G>
+__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask,
+                                                    G key) {
+  unsigned int mask = __ballot_sync(init_mask, true);
+  unsigned int peer_group = 0;
+  bool is_peer;
+
+  do {
+    // fetch key of first unclaimed lane (lowest set bit of mask) and compare
+    is_peer = (key == __shfl_sync(mask, key, __ffs(mask) - 1));
+
+    // determine which lanes had a match
+    peer_group = __ballot_sync(mask, is_peer);
+
+    // remove lanes with matching keys from the pool
+    mask = mask ^ peer_group;
+
+    // quit once this lane was part of the matched group
+  } while (!is_peer);
+
+  return peer_group;
+}
+#endif
+
+__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) {
+  return __ffs(peer_group) - 1;  // 0-based lowest set bit (__ffs counts from 1)
+}
+
+// Writes 0..ncols-1 into row blockIdx.x of `indices` (rows are ncols wide)
+template <typename value_idx>
+__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) {
+  // 64-bit row offset: row * ncols can exceed the range of a 32-bit value_idx
+  size_t row_offset = static_cast<size_t>(blockIdx.x) * ncols;
+  for (int i = threadIdx.x; i < ncols; i += blockDim.x) {
+    indices[row_offset + i] = i;
+  }
+}
+
+template <typename value_idx>
+void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols,
+               cudaStream_t stream) {
+  // one block per row; threads per block quantized from ncols
+  const int tpb = block_dim(ncols);
+  iota_fill_block_kernel<<<nrows, tpb, 0, stream>>>(indices, ncols);
+}
+
+// Returns the stop offset of `row`: ind[row + 1] when row < m - 1,
+// otherwise nnz.
+template <typename T>
+__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) {
+  const bool has_next_row = row < (m - 1);
+  // ind is only read up to index m - 1; the last row falls back to nnz
+  const int stop_idx = has_next_row ? ind[row + 1] : nnz;
+
+  return stop_idx;
+}
+
+};  // namespace sparse
+};  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp
index ccee635701..5b77239dac 100644
--- a/cpp/include/raft/spatial/knn/knn.hpp
+++ b/cpp/include/raft/spatial/knn/knn.hpp
@@ -50,7 +50,7 @@ using deviceAllocator = raft::mr::device::allocator;
  * @param[in] expanded should lp-based distances be returned in their expanded
  * 					 form (e.g., without raising to the 1/p power).
  */
-void brute_force_knn(
+inline void brute_force_knn(
   raft::handle_t &handle, std::vector<float *> &input, std::vector<int> &sizes,
   int D, float *search_items, int n, int64_t *res_I, float *res_D, int k,
   bool rowMajorIndex = false, bool rowMajorQuery = false,
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
new file mode 100644
index 0000000000..713708d4cd
--- /dev/null
+++ b/cpp/test/sparse/add.cu
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <raft/sparse/csr.cuh>
+#include <raft/sparse/linalg/add.cuh>
+
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <iostream>
+#include <limits>
+
+namespace raft {
+namespace sparse {
+
+template <typename Type_f, typename Index_>
+struct CSRMatrixVal {  // host-side arrays describing one matrix of the add test
+  std::vector<Index_> row_ind;      // its size is taken as n_rows by CSRAddTest
+  std::vector<Index_> row_ind_ptr;  // its size is taken as nnz by CSRAddTest
+  std::vector<Type_f> values;       // uploaded with that same nnz length
+};
+
+template <typename Type_f, typename Index_>
+struct CSRAddInputs {  // one parameterized case for CSRAddTest
+  CSRMatrixVal<Type_f, Index_> matrix_a;       // first operand
+  CSRMatrixVal<Type_f, Index_> matrix_b;       // second operand
+  CSRMatrixVal<Type_f, Index_> matrix_verify;  // arrays the result must match
+};
+
+// Fixture that runs csr_add_calc_inds and csr_add_finalize on one input case.
+template <typename Type_f, typename Index_>
+class CSRAddTest
+  : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam();
+    n_rows = params.matrix_a.row_ind.size();
+    nnz_a = params.matrix_a.row_ind_ptr.size();
+    nnz_b = params.matrix_b.row_ind_ptr.size();
+    nnz_result = params.matrix_verify.row_ind_ptr.size();
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    raft::allocate(ind_a, n_rows);
+    raft::allocate(ind_ptr_a, nnz_a);
+    raft::allocate(values_a, nnz_a);
+    raft::allocate(ind_b, n_rows);
+    raft::allocate(ind_ptr_b, nnz_b);
+    raft::allocate(values_b, nnz_b);
+
+    raft::allocate(ind_verify, n_rows);
+    raft::allocate(ind_ptr_verify, nnz_result);
+    raft::allocate(values_verify, nnz_result);
+
+    raft::allocate(ind_result, n_rows);
+    raft::allocate(ind_ptr_result, nnz_result);
+    raft::allocate(values_result, nnz_result);
+  }
+
+  void Run() {
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
+
+    raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream);
+    raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a,
+                        stream);
+    raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream);
+
+    raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream);
+    raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b,
+                        stream);
+    raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream);
+
+    raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows,
+                        stream);
+    raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(),
+                        nnz_result, stream);
+    raft::update_device(values_verify, params.matrix_verify.values.data(),
+                        nnz_result, stream);
+    // Phase 1: fills ind_result and returns the nonzero count of the sum.
+    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(
+      ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b,
+      n_rows, ind_result, alloc, stream);
+
+    ASSERT_TRUE(nnz == nnz_result);
+    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_verify, ind_result, n_rows,
+                                          raft::Compare<Index_>()));
+    // Phase 2: fills ind_ptr_result and values_result, checked just below.
+    linalg::csr_add_finalize<Type_f, 32>(
+      ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b,
+      n_rows, ind_result, ind_ptr_result, values_result, stream);
+
+    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_ptr_verify, ind_ptr_result, nnz,
+                                          raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Type_f>(values_verify, values_result, nnz,
+                                          raft::Compare<Type_f>()));
+  }
+
+  void TearDown() override {  // releases every buffer SetUp allocated
+    CUDA_CHECK(cudaFree(ind_a));
+    CUDA_CHECK(cudaFree(ind_b));
+    CUDA_CHECK(cudaFree(ind_verify));
+    CUDA_CHECK(cudaFree(ind_result));
+    CUDA_CHECK(cudaFree(ind_ptr_a));
+    CUDA_CHECK(cudaFree(ind_ptr_b));
+    CUDA_CHECK(cudaFree(ind_ptr_verify));
+    CUDA_CHECK(cudaFree(ind_ptr_result));
+    CUDA_CHECK(cudaFree(values_a));
+    CUDA_CHECK(cudaFree(values_b));
+    CUDA_CHECK(cudaFree(values_verify));
+    CUDA_CHECK(cudaFree(values_result));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  CSRAddInputs<Type_f, Index_> params;
+  cudaStream_t stream;
+  Index_ n_rows, nnz_a, nnz_b, nnz_result;
+  Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b,
+    *ind_ptr_verify, *ind_ptr_result;
+  Type_f *values_a, *values_b, *values_verify, *values_result;
+};
+
+using CSRAddTestF = CSRAddTest<float, int>;
+TEST_P(CSRAddTestF, Result) { Run(); }
+
+using CSRAddTestD = CSRAddTest<double, int>;
+TEST_P(CSRAddTestD, Result) { Run(); }
+
+const std::vector<CSRAddInputs<float, int>> csradd_inputs_f = {
+  {{{0, 4, 8, 9},
+    {1, 2, 3, 4, 1, 2, 3, 5, 0, 1},
+    {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}},
+   {{0, 4, 8, 9},
+    {1, 2, 5, 4, 0, 2, 3, 5, 1, 0},
+    {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}},
+   {{0, 5, 10, 12},
+    {1, 2, 3, 4, 5, 1, 2, 3, 5, 0, 0, 1, 1, 0},
+    {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}},
+};
+const std::vector<CSRAddInputs<double, int>> csradd_inputs_d = {
+  {{{0, 4, 8, 9},
+    {1, 2, 3, 4, 1, 2, 3, 5, 0, 1},
+    {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}},
+   {{0, 4, 8, 9},
+    {1, 2, 5, 4, 0, 2, 3, 5, 1, 0},
+    {1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0}},
+   {{0, 5, 10, 12},
+    {1, 2, 3, 4, 5, 1, 2, 3, 5, 0, 0, 1, 1, 0},
+    {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}},
+};
+
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF,
+                        ::testing::ValuesIn(csradd_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD,
+                        ::testing::ValuesIn(csradd_inputs_d));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
new file mode 100644
index 0000000000..ea69ecfc53
--- /dev/null
+++ b/cpp/test/sparse/convert_coo.cu
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/csr.cuh>
+
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+
+#include "../test_utils.h"
+
+#include <iostream>
+#include <limits>
+
+namespace raft {
+namespace sparse {
+
+template <typename Index_>
+struct CSRtoCOOInputs {  // one parameterized case for CSRtoCOOTest
+  std::vector<Index_> ex_scan;  // input handed to csr_to_coo; size is n_rows
+  std::vector<Index_> verify;   // expected output; its size is used as nnz
+};
+
+// Fixture that checks convert::csr_to_coo against the expected output array.
+template <typename Index_>
+class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<CSRtoCOOInputs<Index_>>::GetParam();
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(ex_scan, params.ex_scan.size());
+    raft::allocate(verify, params.verify.size());
+    raft::allocate(result, params.verify.size(), true);
+  }
+
+  void Run() {
+    Index_ n_rows = params.ex_scan.size();
+    Index_ nnz = params.verify.size();
+
+    raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
+    raft::update_device(verify, params.verify.data(), nnz, stream);
+
+    convert::csr_to_coo<Index_, 32>(ex_scan, n_rows, result, nnz, stream);
+    // Compare as Index_ so 64-bit indices are not rounded through float.
+    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz,
+                                          raft::Compare<Index_>(), stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(ex_scan));
+    CUDA_CHECK(cudaFree(verify));
+    CUDA_CHECK(cudaFree(result));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  CSRtoCOOInputs<Index_> params;
+  cudaStream_t stream;
+  Index_ *ex_scan, *verify, *result;
+};
+
+using CSRtoCOOTestI = CSRtoCOOTest<int>;
+TEST_P(CSRtoCOOTestI, Result) { Run(); }
+
+using CSRtoCOOTestL = CSRtoCOOTest<int64_t>;
+TEST_P(CSRtoCOOTestL, Result) { Run(); }
+
+const std::vector<CSRtoCOOInputs<int>> csrtocoo_inputs_32 = {
+  {{0, 0, 2, 2}, {1, 1, 3}},
+  {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}},
+};
+const std::vector<CSRtoCOOInputs<int64_t>> csrtocoo_inputs_64 = {
+  {{0, 0, 2, 2}, {1, 1, 3}},
+  {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}},
+};
+
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI,
+                        ::testing::ValuesIn(csrtocoo_inputs_32));
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL,
+                        ::testing::ValuesIn(csrtocoo_inputs_64));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
new file mode 100644
index 0000000000..553ef2ddee
--- /dev/null
+++ b/cpp/test/sparse/convert_csr.cu
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <raft/mr/device/allocator.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/coo.cuh>
+
+#include <iostream>
+
+namespace raft {
+namespace sparse {
+
+/**************************** sorted COO to CSR ****************************/
+
+template <typename T>
+struct SparseConvertCSRInputs {  // parameter type for SparseConvertCSRTest
+  int m, n, nnz;                 // not read by the SortedCOOToCSR test body
+  unsigned long long int seed;   // not read by the SortedCOOToCSR test body
+};
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &os,
+                           const SparseConvertCSRInputs<T> &dims) {
+  return os;  // writes nothing; `dims` is left unprinted
+}
+
+template <typename T>
+class SparseConvertCSRTest
+  : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
+ protected:
+  void SetUp() override {}  // no shared setup in this fixture
+
+  void TearDown() override {}  // nothing to release in this fixture
+
+ protected:
+  SparseConvertCSRInputs<T> params;  // never assigned by this fixture
+};
+
+const std::vector<SparseConvertCSRInputs<float>> inputsf = {
+  {5, 10, 5, 1234ULL}};
+
+typedef SparseConvertCSRTest<float> SortedCOOToCSR;
+TEST_P(SortedCOOToCSR, Result) {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+  std::shared_ptr<raft::mr::device::allocator> alloc(
+    new raft::mr::device::default_allocator);
+
+  // Sorted COO row ids (two entries per row) and the expected row offsets.
+  std::vector<int> in_h = {0, 0, 1, 1, 2, 2, 3, 3};
+  std::vector<int> exp_h = {0, 2, 4, 6};
+  const int nnz = in_h.size();
+  const int n_rows = exp_h.size();
+
+  int *in, *out, *exp;
+  raft::allocate(in, nnz, true);
+  raft::allocate(exp, n_rows, true);
+  raft::allocate(out, n_rows, true);
+
+  raft::update_device(in, in_h.data(), nnz, stream);
+  raft::update_device(exp, exp_h.data(), n_rows, stream);
+
+  convert::sorted_coo_to_csr<int>(in, nnz, out, n_rows, alloc, stream);
+
+  // Capture the verdict first so a mismatch cannot skip the cleanup below.
+  const auto matches =
+    raft::devArrMatch<int>(out, exp, n_rows, raft::Compare<int>());
+
+  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaFree(in));
+  CUDA_CHECK(cudaFree(exp));
+  CUDA_CHECK(cudaFree(out));
+
+  ASSERT_TRUE(matches);
+}
+
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR,
+                        ::testing::ValuesIn(inputsf));
+
+/******************************** adj graph ********************************/
+
+template <typename Index_>
+struct CSRAdjGraphInputs {  // one parameterized case for CSRAdjGraphTest
+  Index_ n_rows;
+  Index_ n_cols;
+  std::vector<Index_> row_ind;  // n_rows entries are uploaded from here
+  std::vector<uint8_t> adj;  // To avoid vector<bool> optimization
+  std::vector<Index_> verify;  // expected output; its size is used as nnz
+};
+
+// Fixture that checks convert::csr_adj_graph_batched on one input case.
+template <typename Index_>
+class CSRAdjGraphTest
+  : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
+ protected:
+  void SetUp() override {
+    params = ::testing::TestWithParam<CSRAdjGraphInputs<Index_>>::GetParam();
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    nnz = params.verify.size();
+    raft::allocate(row_ind, params.n_rows);
+    raft::allocate(adj, params.n_rows * params.n_cols);
+    raft::allocate(result, nnz, true);
+    raft::allocate(verify, nnz);
+  }
+
+  void Run() {
+    raft::update_device(row_ind, params.row_ind.data(), params.n_rows, stream);
+    raft::update_device(adj, reinterpret_cast<bool *>(params.adj.data()),
+                        params.n_rows * params.n_cols, stream);
+    raft::update_device(verify, params.verify.data(), nnz, stream);
+
+    convert::csr_adj_graph_batched<Index_, 32>(
+      row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream);
+
+    ASSERT_TRUE(
+      raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<Index_>()));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(row_ind));
+    CUDA_CHECK(cudaFree(adj));
+    CUDA_CHECK(cudaFree(verify));
+    CUDA_CHECK(cudaFree(result));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  CSRAdjGraphInputs<Index_> params;
+  cudaStream_t stream;
+  Index_ nnz;
+  Index_ *row_ind, *result, *verify;
+  bool *adj;
+};
+
+using CSRAdjGraphTestI = CSRAdjGraphTest<int>;
+TEST_P(CSRAdjGraphTestI, Result) { Run(); }
+
+using CSRAdjGraphTestL = CSRAdjGraphTest<int64_t>;
+TEST_P(CSRAdjGraphTestL, Result) { Run(); }
+
+const std::vector<CSRAdjGraphInputs<int>> csradjgraph_inputs_i = {
+  {3,
+   6,
+   {0, 3, 6},
+   {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+   {0, 1, 2, 0, 1, 2, 0, 1, 2}},
+};
+const std::vector<CSRAdjGraphInputs<int64_t>> csradjgraph_inputs_l = {
+  {3,
+   6,
+   {0, 3, 6},
+   {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+   {0, 1, 2, 0, 1, 2, 0, 1, 2}},
+};
+
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI,
+                        ::testing::ValuesIn(csradjgraph_inputs_i));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL,
+                        ::testing::ValuesIn(csradjgraph_inputs_l));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
new file mode 100644
index 0000000000..625772a842
--- /dev/null
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+
+#include <gtest/gtest.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/op/slice.h>
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct CSRRowSliceInputs {  // one parameterized case for CSRRowSliceTest
+  value_idx start_row;  // first argument to csr_row_slice_indptr
+  value_idx stop_row;   // second argument to csr_row_slice_indptr
+
+  std::vector<value_idx> indptr_h;   // host copy of the input indptr
+  std::vector<value_idx> indices_h;  // host copy of the input indices
+  std::vector<value_t> data_h;       // host copy of the input values
+
+  std::vector<value_idx> out_indptr_ref_h;   // expected sliced indptr
+  std::vector<value_idx> out_indices_ref_h;  // expected sliced indices
+  std::vector<value_t> out_data_ref_h;       // expected sliced values
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRRowSliceInputs<value_idx, value_t> &dims) {
+  return os;  // writes nothing; `dims` is left unprinted
+}
+
+// Fixture that runs the CSR row-slice ops in SetUp; compare() checks them.
+template <typename value_idx, typename value_t>
+class CSRRowSliceTest
+  : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
+ protected:
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
+    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
+
+    allocate(out_indptr_ref, out_indptr_ref_h.size());
+    allocate(out_indices_ref, out_indices_ref_h.size());
+    allocate(out_data_ref, out_data_ref_h.size());
+
+    update_device(out_indptr_ref, out_indptr_ref_h.data(),
+                  out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
+                  stream);
+
+    allocate(out_indptr, out_indptr_ref_h.size());
+    allocate(out_indices, out_indices_ref_h.size());
+    allocate(out_data, out_data_ref_h.size());
+  }
+
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRRowSliceInputs<value_idx, value_t>>::GetParam();
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    make_data();
+
+    int csr_start_offset;
+    int csr_stop_offset;
+
+    raft::sparse::op::csr_row_slice_indptr(
+      params.start_row, params.stop_row, indptr, out_indptr, &csr_start_offset,
+      &csr_stop_offset, stream);
+
+    raft::sparse::op::csr_row_slice_populate(csr_start_offset, csr_stop_offset,
+                                             indices, data, out_indices,
+                                             out_data, stream);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_indptr));
+    CUDA_CHECK(cudaFree(out_indices));
+    CUDA_CHECK(cudaFree(out_data));
+    CUDA_CHECK(cudaFree(out_indptr_ref));
+    CUDA_CHECK(cudaFree(out_indices_ref));
+    CUDA_CHECK(cudaFree(out_data_ref));
+    CUDA_CHECK(cudaStreamDestroy(stream));  // stream is created in SetUp
+  }
+
+  void compare() {  // index arrays compare as value_idx, values as value_t
+    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
+                            params.out_indptr_ref_h.size(),
+                            Compare<value_idx>()));
+    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
+                            params.out_indices_ref_h.size(),
+                            Compare<value_idx>()));
+    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
+                            params.out_data_ref_h.size(), Compare<value_t>()));
+  }
+
+ protected:
+  cudaStream_t stream;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_idx *out_indptr, *out_indices;
+  value_t *out_data;
+
+  // expected output data
+  value_idx *out_indptr_ref, *out_indices_ref;
+  value_t *out_data_ref;
+
+  CSRRowSliceInputs<value_idx, value_t> params;
+};
+
+const std::vector<CSRRowSliceInputs<int, float>> inputs_i32_f = {
+  {1,
+   3,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {0, 2, 4, 6},
+   {0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}},
+  {
+    2,
+    3,
+    {0, 2, 4, 6, 8},
+    {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+    {0, 2, 4},
+    {0, 1, 0, 1},  // indices
+    {50.0f, 28.0f, 16.0f, 2.0f},
+  }
+
+};
+typedef CSRRowSliceTest<int, float> CSRRowSliceTestF;
+TEST_P(CSRRowSliceTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
new file mode 100644
index 0000000000..5535df4fe3
--- /dev/null
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cusparse_v2.h>
+#include <raft/cudart_utils.h>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <gtest/gtest.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/sparse/convert/dense.cuh>
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct CSRToDenseInputs {  // one parameterized case for CSRToDenseTest
+  value_idx nrows;  // passed to csr_to_dense twice (2nd and 7th arguments)
+  value_idx ncols;  // passed to csr_to_dense as the 3rd argument
+
+  std::vector<value_idx> indptr_h;   // host copy of the input indptr
+  std::vector<value_idx> indices_h;  // host copy of the input indices
+  std::vector<value_t> data_h;       // host copy of the input values
+
+  std::vector<value_t> out_ref_h;  // expected dense output as a flat array
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRToDenseInputs<value_idx, value_t> &dims) {
+  return os;  // writes nothing; `dims` is left unprinted
+}
+
+// Fixture that runs convert::csr_to_dense in SetUp; compare() checks it.
+template <typename value_idx, typename value_t>
+class CSRToDenseTest
+  : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
+ protected:
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector<value_t> out_ref_h = params.out_ref_h;
+
+    allocate(out_ref, out_ref_h.size());
+
+    update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream);
+
+    allocate(out, out_ref_h.size());
+  }
+
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRToDenseInputs<value_idx, value_t>>::GetParam();
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&handle));
+
+    make_data();
+
+    convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices,
+                          data, params.nrows, out, stream, true);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUSPARSE_CHECK(cusparseDestroy(handle));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out));
+    CUDA_CHECK(cudaFree(out_ref));
+    CUDA_CHECK(cudaStreamDestroy(stream));  // stream is created in SetUp
+  }
+
+  void compare() {
+    ASSERT_TRUE(
+      devArrMatch(out, out_ref, params.out_ref_h.size(), Compare<value_t>()));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t handle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_t *out;
+
+  // expected output data
+  value_t *out_ref;
+
+  CSRToDenseInputs<value_idx, value_t> params;
+};
+
+const std::vector<CSRToDenseInputs<int, float>> inputs_i32_f = {
+  {4,
+   4,
+   {0, 2, 4, 6, 8},
+   {0, 1, 2, 3, 0, 1, 2, 3},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f,
+    0.0f, 0.0f, 16.0f, 2.0f}},
+};
+typedef CSRToDenseTest<int, float> CSRToDenseTestF;
+TEST_P(CSRToDenseTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
new file mode 100644
index 0000000000..c257d6eb3c
--- /dev/null
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cusparse_v2.h>
+
+#include <gtest/gtest.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+#include <raft/sparse/linalg/transpose.h>
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct CSRTransposeInputs {  // one parameterized case for CSRTransposeTest
+  value_idx nrows;  // forwarded to csr_transpose
+  value_idx ncols;  // forwarded to csr_transpose
+  value_idx nnz;    // forwarded to csr_transpose
+
+  std::vector<value_idx> indptr_h;   // host copy of the input indptr
+  std::vector<value_idx> indices_h;  // host copy of the input indices
+  std::vector<value_t> data_h;       // host copy of the input values
+
+  std::vector<value_idx> out_indptr_ref_h;   // expected transposed indptr
+  std::vector<value_idx> out_indices_ref_h;  // expected transposed indices
+  std::vector<value_t> out_data_ref_h;       // expected transposed values
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRTransposeInputs<value_idx, value_t> &dims) {
+  return os;  // writes nothing; `dims` is left unprinted
+}
+
+// Fixture that runs linalg::csr_transpose in SetUp; compare() checks it.
+template <typename value_idx, typename value_t>
+class CSRTransposeTest
+  : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
+ protected:
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
+    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
+
+    allocate(out_indptr_ref, out_indptr_ref_h.size());
+    allocate(out_indices_ref, out_indices_ref_h.size());
+    allocate(out_data_ref, out_data_ref_h.size());
+
+    update_device(out_indptr_ref, out_indptr_ref_h.data(),
+                  out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
+                  stream);
+
+    allocate(out_indptr, out_indptr_ref_h.size());
+    allocate(out_indices, out_indices_ref_h.size());
+    allocate(out_data, out_data_ref_h.size());
+  }
+
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRTransposeInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&handle));
+    make_data();
+
+    raft::sparse::linalg::csr_transpose(
+      handle, indptr, indices, data, out_indptr, out_indices, out_data,
+      params.nrows, params.ncols, params.nnz, alloc, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUSPARSE_CHECK(cusparseDestroy(handle));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_indptr));
+    CUDA_CHECK(cudaFree(out_indices));
+    CUDA_CHECK(cudaFree(out_data));
+    CUDA_CHECK(cudaFree(out_indptr_ref));
+    CUDA_CHECK(cudaFree(out_indices_ref));
+    CUDA_CHECK(cudaFree(out_data_ref));
+    CUDA_CHECK(cudaStreamDestroy(stream));  // stream is created in SetUp
+  }
+
+  void compare() {  // index arrays compare as value_idx, values as value_t
+    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
+                            params.out_indptr_ref_h.size(),
+                            Compare<value_idx>()));
+    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
+                            params.out_indices_ref_h.size(),
+                            Compare<value_idx>()));
+    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
+                            params.out_data_ref_h.size(), Compare<value_t>()));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t handle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_idx *out_indptr, *out_indices;
+  value_t *out_data;
+
+  // expected output data
+  value_idx *out_indptr_ref, *out_indices_ref;
+  value_t *out_data_ref;
+
+  CSRTransposeInputs<value_idx, value_t> params;
+};
+
+const std::vector<CSRTransposeInputs<int, float>> inputs_i32_f = {
+  {
+    4,
+    2,
+    8,
+    {0, 2, 4, 6, 8},
+    {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+    {0, 4, 8},
+    {0, 1, 2, 3, 0, 1, 2, 3},  // indices
+    {1.0f, 1.0f, 50.0f, 16.0f, 3.0f, 5.0f, 28.0f, 2.0f},
+  },
+};
+typedef CSRTransposeTest<int, float> CSRTransposeTestF;
+TEST_P(CSRTransposeTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
new file mode 100644
index 0000000000..5d687ad92b
--- /dev/null
+++ b/cpp/test/sparse/degree.cu
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <raft/sparse/linalg/degree.cuh>
+
+#include <iostream>
+
+namespace raft {
+namespace sparse {
+
+template <typename T>
+struct SparseDegreeInputs {
+  int m, n, nnz;  // not read by any test in this file
+  unsigned long long int seed;  // likewise not read by any test in this file
+};
+
+template <typename T>
+class SparseDegreeTests
+  : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
+ protected:
+  // Nothing is prepared or released per test: the TEST_P bodies using this
+  // fixture allocate their own device buffers.
+  void SetUp() override {}
+  void TearDown() override {}
+
+  SparseDegreeInputs<T> params;  // never assigned from GetParam() here
+};
+// One parameter set, so each TEST_P runs once; its values are never read.
+const std::vector<SparseDegreeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
+
+typedef SparseDegreeTests<float> COODegree;
+// Runs coo_degree on 5 row ids; the device result must equal verify_h.
+TEST_P(COODegree, Result) {
+  int *in_rows, *verify, *results;
+  int in_rows_h[5] = {0, 0, 1, 2, 2};
+  int verify_h[5] = {2, 1, 2, 0, 0};
+
+  raft::allocate(in_rows, 5);
+  raft::allocate(verify, 5, true);
+  raft::allocate(results, 5, true);
+
+  raft::update_device(in_rows, *&in_rows_h, 5, 0);
+  raft::update_device(verify, *&verify_h, 5, 0);
+
+  linalg::coo_degree<32>(in_rows, 5, results, 0);
+  CUDA_CHECK(cudaDeviceSynchronize());  // surface asynchronous kernel errors
+  ASSERT_TRUE(raft::devArrMatch<int>(verify, results, 5, raft::Compare<int>()));
+
+  CUDA_CHECK(cudaFree(in_rows));
+  CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaFree(results));  // every buffer allocated above is released
+}
+
+typedef SparseDegreeTests<float> COODegreeNonzero;
+// Runs coo_degree_nz on 5 (row, value) pairs; result must equal verify_h.
+TEST_P(COODegreeNonzero, Result) {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  int *in_rows, *verify, *results;
+  float *in_vals;
+  int in_rows_h[5] = {0, 0, 1, 2, 2};
+  float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
+  int verify_h[5] = {1, 0, 2, 0, 0};
+
+  raft::allocate(in_rows, 5);
+  raft::allocate(verify, 5, true);
+  raft::allocate(results, 5, true);
+  raft::allocate(in_vals, 5, true);
+
+  raft::update_device(in_rows, *&in_rows_h, 5, 0);
+  raft::update_device(verify, *&verify_h, 5, 0);
+  raft::update_device(in_vals, *&in_vals_h, 5, 0);
+
+  linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream);
+  CUDA_CHECK(cudaDeviceSynchronize());  // surface asynchronous kernel errors
+  ASSERT_TRUE(raft::devArrMatch<int>(verify, results, 5, raft::Compare<int>()));
+
+  CUDA_CHECK(cudaFree(in_rows));
+  CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaFree(results));
+  CUDA_CHECK(cudaFree(in_vals));
+  CUDA_CHECK(cudaStreamDestroy(stream));
+}
+// Instantiate both degree tests once per entry of inputsf.
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree,
+                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero,
+                        ::testing::ValuesIn(inputsf));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
new file mode 100644
index 0000000000..a841da661d
--- /dev/null
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/mr/device/allocator.hpp>
+
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/distance/coo_spmv.cuh>
+#include <raft/sparse/distance/operators.cuh>
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseDistanceCOOSPMVInputs {
+  value_idx n_cols;  // copied to dist_config.a_ncols / b_ncols in SetUp
+
+  std::vector<value_idx> indptr_h;  // row count is taken as size() - 1
+  std::vector<value_idx> indices_h;  // size() is used as nnz in SetUp
+  std::vector<value_t> data_h;  // uploaded to device by make_data()
+
+  std::vector<value_t> out_dists_ref_h;  // expected values for compare()
+
+  raft::distance::DistanceType metric;  // selects the case in run_spmv()
+
+  float metric_arg = 0.0;  // read only by the LpUnexpanded case (as p)
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(
+  ::std::ostream &stream,
+  const SparseDistanceCOOSPMVInputs<value_idx, value_t> & /*unused*/) {
+  return stream;  // nothing is written; the stream is handed back as-is
+}
+
+// Fixture: runs the COO-based SpMV distance in SetUp; compare() checks it.
+template <typename value_idx, typename value_t>
+class SparseDistanceCOOSPMVTest
+  : public ::testing::TestWithParam<
+      SparseDistanceCOOSPMVInputs<value_idx, value_t>> {
+ public:
+  // Row-expands B, runs the generalized SpMV, then (if rev) repeats with A.
+  template <typename reduce_f, typename accum_f, typename write_f>
+  void compute_dist(reduce_f reduce_func, accum_f accum_func,
+                    write_f write_func, bool rev = true) {
+    raft::mr::device::buffer<value_idx> coo_rows(
+      dist_config.allocator, dist_config.stream,
+      max(dist_config.b_nnz, dist_config.a_nnz));
+    // sized for the larger nnz so the rev pass can reuse the same buffer
+    raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows,
+                                      coo_rows.data(), dist_config.b_nnz,
+                                      dist_config.stream);
+
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+      out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
+      write_func);
+
+    if (rev) {
+      raft::sparse::convert::csr_to_coo(dist_config.a_indptr,
+                                        dist_config.a_nrows, coo_rows.data(),
+                                        dist_config.a_nnz, dist_config.stream);
+
+      balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
+        out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
+        write_func);
+    }
+  }
+  // Maps params.metric to its functors; throws on an unsupported metric.
+  void run_spmv() {
+    switch (params.metric) {
+      case raft::distance::DistanceType::InnerProduct:
+        compute_dist(Product(), Sum(), AtomicAdd(), true);
+        break;
+      case raft::distance::DistanceType::L2Unexpanded:
+        compute_dist(SqDiff(), Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::Canberra:
+        compute_dist(
+          [] __device__(value_t a, value_t b) {
+            return fabsf(a - b) / (fabsf(a) + fabsf(b));
+          },
+          Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::L1:
+        compute_dist(AbsDiff(), Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::Linf:
+        compute_dist(AbsDiff(), Max(), AtomicMax());
+        break;
+      case raft::distance::DistanceType::LpUnexpanded: {
+        compute_dist(PDiff(params.metric_arg), Sum(), AtomicAdd());
+        float p = 1.0f / params.metric_arg;
+        raft::linalg::unaryOp<value_t>(
+          out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows,
+          [=] __device__(value_t input) { return powf(input, p); },
+          dist_config.stream);
+      } break;
+      default:
+        throw raft::exception("Unknown distance");
+    }
+  }
+ protected:
+  // Uploads the CSR inputs and the expected distances to device memory.
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+
+    allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
+  }
+  // Creates the stream and cuSPARSE handle, fills dist_config, runs the SpMV.
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseDistanceCOOSPMVInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&cusparseHandle));
+
+    make_data();
+
+    dist_config.b_nrows = params.indptr_h.size() - 1;
+    dist_config.b_ncols = params.n_cols;
+    dist_config.b_nnz = params.indices_h.size();
+    dist_config.b_indptr = indptr;
+    dist_config.b_indices = indices;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.indptr_h.size() - 1;
+    dist_config.a_ncols = params.n_cols;
+    dist_config.a_nnz = params.indices_h.size();
+    dist_config.a_indptr = indptr;
+    dist_config.a_indices = indices;
+    dist_config.a_data = data;
+    dist_config.handle = cusparseHandle;
+    dist_config.allocator = alloc;
+    dist_config.stream = stream;
+
+    int out_size = dist_config.a_nrows * dist_config.b_nrows;
+    allocate(out_dists, out_size);
+
+    run_spmv();
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+  // Frees the device buffers, then destroys the cuSPARSE handle and stream.
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+    CUSPARSE_CHECK(cusparseDestroy(cusparseHandle));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+  // Prints both device arrays, then asserts a CompareApprox(1e-3) match.
+  void compare() {
+    raft::print_device_vector("expected: ", out_dists_ref,
+                              params.out_dists_ref_h.size(), std::cout);
+    raft::print_device_vector("out_dists: ", out_dists,
+                              params.out_dists_ref_h.size(), std::cout);
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+                            params.out_dists_ref_h.size(),
+                            CompareApprox<value_t>(1e-3)));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t cusparseHandle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_t *out_dists, *out_dists_ref;
+
+  raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config;
+
+  SparseDistanceCOOSPMVInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseDistanceCOOSPMVInputs<int, float>> inputs_i32_f = {
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},
+   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+    5.0},
+   raft::distance::DistanceType::InnerProduct},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   raft::distance::DistanceType::L2Unexpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    3.3954660629919076,
+    5.6469232737388815,
+    6.373112846266441,
+    4.0212880272531715,
+    6.916281504639404,
+    5.741508386786526,
+    5.411470999663036,
+    9.0,
+    4.977014354725805,
+    3.3954660629919076,
+    0.0,
+    7.56256082439209,
+    5.540261147481582,
+    4.832322929216881,
+    4.62003193872216,
+    6.498056792320361,
+    4.309846252268695,
+    6.317531174829905,
+    6.016362684141827,
+    5.6469232737388815,
+    7.56256082439209,
+    0.0,
+    5.974878731322299,
+    4.898357301336036,
+    6.442097410320605,
+    5.227077347287883,
+    7.134101195584642,
+    5.457753923371659,
+    7.0,
+    6.373112846266441,
+    5.540261147481582,
+    5.974878731322299,
+    0.0,
+    5.5507273748583,
+    4.897749658726415,
+    9.0,
+    8.398776718824767,
+    3.908281400328807,
+    4.83431066343688,
+    4.0212880272531715,
+    4.832322929216881,
+    4.898357301336036,
+    5.5507273748583,
+    0.0,
+    6.632989819428174,
+    7.438852294822894,
+    5.6631570310967465,
+    7.579428202635459,
+    6.760811985364303,
+    6.916281504639404,
+    4.62003193872216,
+    6.442097410320605,
+    4.897749658726415,
+    6.632989819428174,
+    0.0,
+    5.249404187382862,
+    6.072559523278559,
+    4.07661278488929,
+    6.19678948003145,
+    5.741508386786526,
+    6.498056792320361,
+    5.227077347287883,
+    9.0,
+    7.438852294822894,
+    5.249404187382862,
+    0.0,
+    3.854811639654704,
+    6.652724827169063,
+    5.298236851430971,
+    5.411470999663036,
+    4.309846252268695,
+    7.134101195584642,
+    8.398776718824767,
+    5.6631570310967465,
+    6.072559523278559,
+    3.854811639654704,
+    0.0,
+    7.529184598969917,
+    6.903282911791188,
+    9.0,
+    6.317531174829905,
+    5.457753923371659,
+    3.908281400328807,
+    7.579428202635459,
+    4.07661278488929,
+    6.652724827169063,
+    7.529184598969917,
+    0.0,
+    7.0,
+    4.977014354725805,
+    6.016362684141827,
+    7.0,
+    4.83431066343688,
+    6.760811985364303,
+    6.19678948003145,
+    5.298236851430971,
+    6.903282911791188,
+    7.0,
+    0.0},
+   raft::distance::DistanceType::Canberra},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    1.31462855332296,
+    1.3690307816129905,
+    1.698603990921237,
+    1.3460470789553531,
+    1.6636670712582544,
+    1.2651744044972217,
+    1.1938329352055201,
+    1.8811409082590185,
+    1.3653115050624267,
+    1.31462855332296,
+    0.0,
+    1.9447722703291133,
+    1.42818777206562,
+    1.4685491458946494,
+    1.3071999866010466,
+    1.4988622861692171,
+    0.9698559287406783,
+    1.4972023224597841,
+    1.5243383567266802,
+    1.3690307816129905,
+    1.9447722703291133,
+    0.0,
+    1.2748400840107568,
+    1.0599569946448246,
+    1.546591282841402,
+    1.147526531928459,
+    1.447002179128145,
+    1.5982242387673176,
+    1.3112533607072414,
+    1.698603990921237,
+    1.42818777206562,
+    1.2748400840107568,
+    0.0,
+    1.038121552545461,
+    1.011788365364402,
+    1.3907391109256988,
+    1.3128200942311496,
+    1.19595706584447,
+    1.3233328139624725,
+    1.3460470789553531,
+    1.4685491458946494,
+    1.0599569946448246,
+    1.038121552545461,
+    0.0,
+    1.3642741698145529,
+    1.3493868683808095,
+    1.394942694628328,
+    1.572881849642552,
+    1.380122665319464,
+    1.6636670712582544,
+    1.3071999866010466,
+    1.546591282841402,
+    1.011788365364402,
+    1.3642741698145529,
+    0.0,
+    1.018961640373018,
+    1.0114394258945634,
+    0.8338711034820684,
+    1.1247823842299223,
+    1.2651744044972217,
+    1.4988622861692171,
+    1.147526531928459,
+    1.3907391109256988,
+    1.3493868683808095,
+    1.018961640373018,
+    0.0,
+    0.7701238110357329,
+    1.245486437864406,
+    0.5551259549534626,
+    1.1938329352055201,
+    0.9698559287406783,
+    1.447002179128145,
+    1.3128200942311496,
+    1.394942694628328,
+    1.0114394258945634,
+    0.7701238110357329,
+    0.0,
+    1.1886800117391216,
+    1.0083692448135637,
+    1.8811409082590185,
+    1.4972023224597841,
+    1.5982242387673176,
+    1.19595706584447,
+    1.572881849642552,
+    0.8338711034820684,
+    1.245486437864406,
+    1.1886800117391216,
+    0.0,
+    1.3661374102525012,
+    1.3653115050624267,
+    1.5243383567266802,
+    1.3112533607072414,
+    1.3233328139624725,
+    1.380122665319464,
+    1.1247823842299223,
+    0.5551259549534626,
+    1.0083692448135637,
+    1.3661374102525012,
+    0.0},
+   raft::distance::DistanceType::LpUnexpanded,
+   2.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    0.9251771844789913,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.717493881903289,
+    0.6920214832303888,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.0,
+    0.9036452083899731,
+    0.8655339692155823,
+    0.8706483735804971,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.6329837991017668,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.0,
+    0.7988276152181608,
+    0.7028075145996631,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.8429599432532096,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.7988276152181608,
+    0.0,
+    0.48376552205293305,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8429599432532096,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.7028075145996631,
+    0.48376552205293305,
+    0.0,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8429599432532096,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.0,
+    0.8853924473642432,
+    0.535821510936138,
+    0.6497196601457607,
+    0.8853924473642432,
+    0.717493881903289,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.0,
+    0.5279604218147174,
+    0.6658348373853169,
+    0.33799874888632914,
+    0.6920214832303888,
+    0.6329837991017668,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.535821510936138,
+    0.5279604218147174,
+    0.0,
+    0.662579808115858,
+    0.5079750812968089,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.6497196601457607,
+    0.6658348373853169,
+    0.662579808115858,
+    0.0,
+    0.8429599432532096,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.33799874888632914,
+    0.5079750812968089,
+    0.8429599432532096,
+    0.0},
+   raft::distance::DistanceType::Linf},
+
+  {4,
+   {0, 1, 1, 2, 4},
+   {3, 2, 0, 1},  // indices
+   {0.99296, 0.42180, 0.11687, 0.305869},
+   {
+     // dense output
+     0.0,
+     0.99296,
+     1.41476,
+     1.415707,
+     0.99296,
+     0.0,
+     0.42180,
+     0.42274,
+     1.41476,
+     0.42180,
+     0.0,
+     0.84454,
+     1.41570,
+     0.42274,
+     0.84454,
+     0.0,
+   },
+   raft::distance::DistanceType::L1}
+
+};
+
+using SparseDistanceCOOSPMVTestF = SparseDistanceCOOSPMVTest<int, float>;
+TEST_P(SparseDistanceCOOSPMVTestF, Result) { this->compare(); }
+INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, SparseDistanceCOOSPMVTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // namespace distance
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/dist_csr_spmv.cu b/cpp/test/sparse/dist_csr_spmv.cu
new file mode 100644
index 0000000000..2405909c40
--- /dev/null
+++ b/cpp/test/sparse/dist_csr_spmv.cu
@@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/mr/device/allocator.hpp>
+
+#include <raft/sparse/distance/csr_spmv.cuh>
+#include <raft/sparse/distance/operators.cuh>
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseDistanceCSRSPMVInputs {
+  value_idx n_cols;  // copied to dist_config.a_ncols / b_ncols in SetUp
+
+  std::vector<value_idx> indptr_h;  // row count is taken as size() - 1
+  std::vector<value_idx> indices_h;  // size() is used as nnz in SetUp
+  std::vector<value_t> data_h;  // uploaded to device by make_data()
+
+  std::vector<value_t> out_dists_ref_h;  // expected values for compare()
+
+  raft::distance::DistanceType metric;  // selects the case in run_spmv()
+
+  float metric_arg = 0.0;  // read only by the LpUnexpanded case (as p)
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(
+  ::std::ostream &stream,
+  const SparseDistanceCSRSPMVInputs<value_idx, value_t> & /*unused*/) {
+  return stream;  // nothing is written; the stream is handed back as-is
+}
+
+// Fixture: runs the CSR-based SpMV distance in SetUp; compare() checks it.
+template <typename value_idx, typename value_t>
+class SparseDistanceCSRSPMVTest
+  : public ::testing::TestWithParam<
+      SparseDistanceCSRSPMVInputs<value_idx, value_t>> {
+ public:
+  // Forwards both functors to generalized_csr_pairwise_semiring.
+  template <typename reduce_f, typename accum_f>
+  void compute_dist(reduce_f reduce_func, accum_f accum_func) {
+    generalized_csr_pairwise_semiring<value_idx, value_t>(
+      out_dists, dist_config, reduce_func, accum_func);
+  }
+  // Maps params.metric to its functors; throws on an unsupported metric.
+  void run_spmv() {
+    switch (params.metric) {
+      case raft::distance::DistanceType::InnerProduct:
+        compute_dist(Product(), Sum());
+        break;
+      case raft::distance::DistanceType::L2Unexpanded:
+        compute_dist(SqDiff(), Sum());
+        break;
+      case raft::distance::DistanceType::Canberra:
+        compute_dist(
+          [] __device__(value_t a, value_t b) {
+            value_t d = fabsf(a) + fabsf(b);
+            return ((d != 0) * fabsf(a - b)) / (d + (d == 0));  // 0 if d == 0
+          },
+          Sum());
+        break;
+      case raft::distance::DistanceType::L1:
+        compute_dist(AbsDiff(), Sum());
+        break;
+      case raft::distance::DistanceType::Linf:
+        compute_dist(AbsDiff(), Max());
+        break;
+      case raft::distance::DistanceType::LpUnexpanded: {
+        compute_dist(PDiff(params.metric_arg), Sum());
+        float p = 1.0f / params.metric_arg;
+        raft::linalg::unaryOp<value_t>(
+          out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows,
+          [=] __device__(value_t input) { return powf(input, p); },
+          dist_config.stream);
+      } break;
+      default:
+        throw raft::exception("Unknown distance");
+    }
+  }
+ protected:
+  // Uploads the CSR inputs and the expected distances to device memory.
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h = params.data_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+
+    allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
+  }
+  // Creates the stream and cuSPARSE handle, fills dist_config, runs the SpMV.
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseDistanceCSRSPMVInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&cusparseHandle));
+
+    make_data();
+
+    dist_config.b_nrows = params.indptr_h.size() - 1;
+    dist_config.b_ncols = params.n_cols;
+    dist_config.b_nnz = params.indices_h.size();
+    dist_config.b_indptr = indptr;
+    dist_config.b_indices = indices;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.indptr_h.size() - 1;
+    dist_config.a_ncols = params.n_cols;
+    dist_config.a_nnz = params.indices_h.size();
+    dist_config.a_indptr = indptr;
+    dist_config.a_indices = indices;
+    dist_config.a_data = data;
+    dist_config.handle = cusparseHandle;
+    dist_config.allocator = alloc;
+    dist_config.stream = stream;
+
+    int out_size = dist_config.a_nrows * dist_config.b_nrows;
+    allocate(out_dists, out_size);
+
+    run_spmv();
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+  // Frees the device buffers, then destroys the cuSPARSE handle and stream.
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+    CUSPARSE_CHECK(cusparseDestroy(cusparseHandle));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+  // Prints both device arrays, then asserts a CompareApprox(1e-3) match.
+  void compare() {
+    raft::print_device_vector("expected: ", out_dists_ref,
+                              params.out_dists_ref_h.size(), std::cout);
+    raft::print_device_vector("out_dists: ", out_dists,
+                              params.out_dists_ref_h.size(), std::cout);
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+                            params.out_dists_ref_h.size(),
+                            CompareApprox<value_t>(1e-3)));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t cusparseHandle;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_t *out_dists, *out_dists_ref;
+
+  raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config;
+
+  SparseDistanceCSRSPMVInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseDistanceCSRSPMVInputs<int, float>> inputs_i32_f = {
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},
+   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+    5.0},
+   raft::distance::DistanceType::InnerProduct},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   raft::distance::DistanceType::L2Unexpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    3.3954660629919076,
+    5.6469232737388815,
+    6.373112846266441,
+    4.0212880272531715,
+    6.916281504639404,
+    5.741508386786526,
+    5.411470999663036,
+    9.0,
+    4.977014354725805,
+    3.3954660629919076,
+    0.0,
+    7.56256082439209,
+    5.540261147481582,
+    4.832322929216881,
+    4.62003193872216,
+    6.498056792320361,
+    4.309846252268695,
+    6.317531174829905,
+    6.016362684141827,
+    5.6469232737388815,
+    7.56256082439209,
+    0.0,
+    5.974878731322299,
+    4.898357301336036,
+    6.442097410320605,
+    5.227077347287883,
+    7.134101195584642,
+    5.457753923371659,
+    7.0,
+    6.373112846266441,
+    5.540261147481582,
+    5.974878731322299,
+    0.0,
+    5.5507273748583,
+    4.897749658726415,
+    9.0,
+    8.398776718824767,
+    3.908281400328807,
+    4.83431066343688,
+    4.0212880272531715,
+    4.832322929216881,
+    4.898357301336036,
+    5.5507273748583,
+    0.0,
+    6.632989819428174,
+    7.438852294822894,
+    5.6631570310967465,
+    7.579428202635459,
+    6.760811985364303,
+    6.916281504639404,
+    4.62003193872216,
+    6.442097410320605,
+    4.897749658726415,
+    6.632989819428174,
+    0.0,
+    5.249404187382862,
+    6.072559523278559,
+    4.07661278488929,
+    6.19678948003145,
+    5.741508386786526,
+    6.498056792320361,
+    5.227077347287883,
+    9.0,
+    7.438852294822894,
+    5.249404187382862,
+    0.0,
+    3.854811639654704,
+    6.652724827169063,
+    5.298236851430971,
+    5.411470999663036,
+    4.309846252268695,
+    7.134101195584642,
+    8.398776718824767,
+    5.6631570310967465,
+    6.072559523278559,
+    3.854811639654704,
+    0.0,
+    7.529184598969917,
+    6.903282911791188,
+    9.0,
+    6.317531174829905,
+    5.457753923371659,
+    3.908281400328807,
+    7.579428202635459,
+    4.07661278488929,
+    6.652724827169063,
+    7.529184598969917,
+    0.0,
+    7.0,
+    4.977014354725805,
+    6.016362684141827,
+    7.0,
+    4.83431066343688,
+    6.760811985364303,
+    6.19678948003145,
+    5.298236851430971,
+    6.903282911791188,
+    7.0,
+    0.0},
+   raft::distance::DistanceType::Canberra},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    1.31462855332296,
+    1.3690307816129905,
+    1.698603990921237,
+    1.3460470789553531,
+    1.6636670712582544,
+    1.2651744044972217,
+    1.1938329352055201,
+    1.8811409082590185,
+    1.3653115050624267,
+    1.31462855332296,
+    0.0,
+    1.9447722703291133,
+    1.42818777206562,
+    1.4685491458946494,
+    1.3071999866010466,
+    1.4988622861692171,
+    0.9698559287406783,
+    1.4972023224597841,
+    1.5243383567266802,
+    1.3690307816129905,
+    1.9447722703291133,
+    0.0,
+    1.2748400840107568,
+    1.0599569946448246,
+    1.546591282841402,
+    1.147526531928459,
+    1.447002179128145,
+    1.5982242387673176,
+    1.3112533607072414,
+    1.698603990921237,
+    1.42818777206562,
+    1.2748400840107568,
+    0.0,
+    1.038121552545461,
+    1.011788365364402,
+    1.3907391109256988,
+    1.3128200942311496,
+    1.19595706584447,
+    1.3233328139624725,
+    1.3460470789553531,
+    1.4685491458946494,
+    1.0599569946448246,
+    1.038121552545461,
+    0.0,
+    1.3642741698145529,
+    1.3493868683808095,
+    1.394942694628328,
+    1.572881849642552,
+    1.380122665319464,
+    1.6636670712582544,
+    1.3071999866010466,
+    1.546591282841402,
+    1.011788365364402,
+    1.3642741698145529,
+    0.0,
+    1.018961640373018,
+    1.0114394258945634,
+    0.8338711034820684,
+    1.1247823842299223,
+    1.2651744044972217,
+    1.4988622861692171,
+    1.147526531928459,
+    1.3907391109256988,
+    1.3493868683808095,
+    1.018961640373018,
+    0.0,
+    0.7701238110357329,
+    1.245486437864406,
+    0.5551259549534626,
+    1.1938329352055201,
+    0.9698559287406783,
+    1.447002179128145,
+    1.3128200942311496,
+    1.394942694628328,
+    1.0114394258945634,
+    0.7701238110357329,
+    0.0,
+    1.1886800117391216,
+    1.0083692448135637,
+    1.8811409082590185,
+    1.4972023224597841,
+    1.5982242387673176,
+    1.19595706584447,
+    1.572881849642552,
+    0.8338711034820684,
+    1.245486437864406,
+    1.1886800117391216,
+    0.0,
+    1.3661374102525012,
+    1.3653115050624267,
+    1.5243383567266802,
+    1.3112533607072414,
+    1.3233328139624725,
+    1.380122665319464,
+    1.1247823842299223,
+    0.5551259549534626,
+    1.0083692448135637,
+    1.3661374102525012,
+    0.0},
+   raft::distance::DistanceType::LpUnexpanded,
+   2.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    0.9251771844789913,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.717493881903289,
+    0.6920214832303888,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.0,
+    0.9036452083899731,
+    0.8655339692155823,
+    0.8706483735804971,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.6329837991017668,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.0,
+    0.7988276152181608,
+    0.7028075145996631,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.8429599432532096,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.7988276152181608,
+    0.0,
+    0.48376552205293305,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8429599432532096,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.7028075145996631,
+    0.48376552205293305,
+    0.0,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8429599432532096,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.0,
+    0.8853924473642432,
+    0.535821510936138,
+    0.6497196601457607,
+    0.8853924473642432,
+    0.717493881903289,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.0,
+    0.5279604218147174,
+    0.6658348373853169,
+    0.33799874888632914,
+    0.6920214832303888,
+    0.6329837991017668,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.535821510936138,
+    0.5279604218147174,
+    0.0,
+    0.662579808115858,
+    0.5079750812968089,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.6497196601457607,
+    0.6658348373853169,
+    0.662579808115858,
+    0.0,
+    0.8429599432532096,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.33799874888632914,
+    0.5079750812968089,
+    0.8429599432532096,
+    0.0},
+   raft::distance::DistanceType::Linf},
+
+  {4,
+   {0, 1, 1, 2, 4},
+   {3, 2, 0, 1},  // indices
+   {0.99296, 0.42180, 0.11687, 0.305869},
+   {
+     // dense output
+     0.0,
+     0.99296,
+     1.41476,
+     1.415707,
+     0.99296,
+     0.0,
+     0.42180,
+     0.42274,
+     1.41476,
+     0.42180,
+     0.0,
+     0.84454,
+     1.41570,
+     0.42274,
+     0.84454,
+     0.0,
+   },
+   raft::distance::DistanceType::L1}
+
+};
+
+using SparseDistanceCSRSPMVTestF = SparseDistanceCSRSPMVTest<int, float>;
+TEST_P(SparseDistanceCSRSPMVTestF, Result) { compare(); }  // <int, float> run
+INSTANTIATE_TEST_CASE_P(SparseDistanceCSRSPMVTests, SparseDistanceCSRSPMVTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // namespace distance
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
new file mode 100644
index 0000000000..53e8838b65
--- /dev/null
+++ b/cpp/test/sparse/distance.cu
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cusparse_v2.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/distance_type.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/mr/device/allocator.hpp>
+
+#include <raft/sparse/distance/distance.cuh>
+
+#include "../test_utils.h"
+
+namespace raft {
+namespace sparse {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseDistanceInputs {
+  value_idx n_cols;  // column count given to the config (a_ncols / b_ncols)
+
+  std::vector<value_idx> indptr_h;   // row offsets; row count is size() - 1
+  std::vector<value_idx> indices_h;  // per-value indices; size() is used as nnz
+  std::vector<value_t> data_h;       // stored values
+
+  std::vector<value_t> out_dists_ref_h;  // expected dense distance output
+
+  raft::distance::DistanceType metric;  // metric passed to pairwiseDistance
+
+  float metric_arg = 0.0;  // forwarded as the last pairwiseDistance argument
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(
+  ::std::ostream &out, const SparseDistanceInputs<value_idx, value_t> &) {
+  return out;  // deliberately writes nothing for a parameter set
+}
+
+template <typename value_idx, typename value_t>
+class SparseDistanceTest
+  : public ::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>> {
+ protected:
+  // Uploads the sparse inputs and the reference distances to the device.
+  void make_data() {
+    // Alias the vectors owned by `params` instead of copying them, so the host
+    // buffers handed to update_device outlive the copies issued on `stream`.
+    const std::vector<value_idx> &indptr_h = params.indptr_h;
+    const std::vector<value_idx> &indices_h = params.indices_h;
+    const std::vector<value_t> &data_h = params.data_h;
+    const std::vector<value_t> &out_dists_ref_h = params.out_dists_ref_h;
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+    allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
+  }
+
+  // Creates the stream and cuSPARSE handle, uploads the inputs and computes the
+  // distances of the input matrix against itself into `out_dists`.
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseDistanceInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    CUSPARSE_CHECK(cusparseCreate(&cusparseHandle));
+
+    make_data();
+
+    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config;
+    dist_config.b_nrows = params.indptr_h.size() - 1;
+    dist_config.b_ncols = params.n_cols;
+    dist_config.b_nnz = params.indices_h.size();
+    dist_config.b_indptr = indptr;
+    dist_config.b_indices = indices;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.indptr_h.size() - 1;
+    dist_config.a_ncols = params.n_cols;
+    dist_config.a_nnz = params.indices_h.size();
+    dist_config.a_indptr = indptr;
+    dist_config.a_indices = indices;
+    dist_config.a_data = data;
+    dist_config.handle = cusparseHandle;
+    dist_config.allocator = alloc;
+    dist_config.stream = stream;
+
+    int out_size = dist_config.a_nrows * dist_config.b_nrows;
+    allocate(out_dists, out_size);
+
+    pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+
+  // Frees the device buffers, then destroys the cuSPARSE handle and the stream.
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+    CUSPARSE_CHECK(cusparseDestroy(cusparseHandle));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  // Matches `out_dists` against `out_dists_ref` using CompareApprox(1e-3).
+  void compare() {
+    // skip Hellinger test due to sporadic CI issue
+    // https://github.com/rapidsai/cuml/issues/3477
+    if (params.metric == raft::distance::DistanceType::HellingerExpanded) {
+      GTEST_SKIP();
+    } else {
+      ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+                              params.out_dists_ref_h.size(),
+                              CompareApprox<value_t>(1e-3)));
+    }
+  }
+
+  cudaStream_t stream;              // created in SetUp, destroyed in TearDown
+  cusparseHandle_t cusparseHandle;  // created in SetUp, destroyed in TearDown
+
+  value_idx *indptr, *indices;         // device copies of the sparse input
+  value_t *data;                       // device copy of the input values
+  value_t *out_dists, *out_dists_ref;  // device: computed / expected distances
+
+  SparseDistanceInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   raft::distance::DistanceType::L2Expanded},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},
+   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+    5.0},
+   raft::distance::DistanceType::InnerProduct},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   raft::distance::DistanceType::L2Unexpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219,
+    0.58146987, 0.44940102, 1.,         0.76978799, 0.39419924, 0.,
+    0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481,
+    0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
+    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,
+    0.79593037, 0.48904013, 0.51413997, 0.,         0.28605559, 0.35772784,
+    1.,         0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801,
+    0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
+    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784,
+    0.58623212, 0.,         0.77917274, 0.48390993, 0.24558392, 0.99166225,
+    0.58146987, 0.73323749, 0.67534399, 1.,         0.6745457,  0.77917274,
+    0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
+    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,
+    0.51360432, 0.68185144, 1.,         0.54847744, 0.8321819,  0.43324829,
+    0.67676228, 0.24558392, 0.76064776, 0.51360432, 0.,         1.,
+    0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
+    0.61547536, 0.68185144, 1.,         0.},
+   raft::distance::DistanceType::CosineExpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+    1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+    1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+   {0.0,
+    0.42857142857142855,
+    0.7142857142857143,
+    0.75,
+    0.2857142857142857,
+    0.75,
+    0.7142857142857143,
+    0.5,
+    1.0,
+    0.6666666666666666,
+    0.42857142857142855,
+    0.0,
+    0.75,
+    0.625,
+    0.375,
+    0.42857142857142855,
+    0.75,
+    0.375,
+    0.75,
+    0.7142857142857143,
+    0.7142857142857143,
+    0.75,
+    0.0,
+    0.7142857142857143,
+    0.42857142857142855,
+    0.7142857142857143,
+    0.6666666666666666,
+    0.625,
+    0.6666666666666666,
+    1.0,
+    0.75,
+    0.625,
+    0.7142857142857143,
+    0.0,
+    0.5,
+    0.5714285714285714,
+    1.0,
+    0.8,
+    0.5,
+    0.6666666666666666,
+    0.2857142857142857,
+    0.375,
+    0.42857142857142855,
+    0.5,
+    0.0,
+    0.6666666666666666,
+    0.7777777777777778,
+    0.4444444444444444,
+    0.7777777777777778,
+    0.75,
+    0.75,
+    0.42857142857142855,
+    0.7142857142857143,
+    0.5714285714285714,
+    0.6666666666666666,
+    0.0,
+    0.7142857142857143,
+    0.5,
+    0.5,
+    0.8571428571428571,
+    0.7142857142857143,
+    0.75,
+    0.6666666666666666,
+    1.0,
+    0.7777777777777778,
+    0.7142857142857143,
+    0.0,
+    0.42857142857142855,
+    0.8571428571428571,
+    0.8333333333333334,
+    0.5,
+    0.375,
+    0.625,
+    0.8,
+    0.4444444444444444,
+    0.5,
+    0.42857142857142855,
+    0.0,
+    0.7777777777777778,
+    0.75,
+    1.0,
+    0.75,
+    0.6666666666666666,
+    0.5,
+    0.7777777777777778,
+    0.5,
+    0.8571428571428571,
+    0.7777777777777778,
+    0.0,
+    1.0,
+    0.6666666666666666,
+    0.7142857142857143,
+    1.0,
+    0.6666666666666666,
+    0.75,
+    0.8571428571428571,
+    0.8333333333333334,
+    0.75,
+    1.0,
+    0.0},
+   raft::distance::DistanceType::JaccardExpanded},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    3.3954660629919076,
+    5.6469232737388815,
+    6.373112846266441,
+    4.0212880272531715,
+    6.916281504639404,
+    5.741508386786526,
+    5.411470999663036,
+    9.0,
+    4.977014354725805,
+    3.3954660629919076,
+    0.0,
+    7.56256082439209,
+    5.540261147481582,
+    4.832322929216881,
+    4.62003193872216,
+    6.498056792320361,
+    4.309846252268695,
+    6.317531174829905,
+    6.016362684141827,
+    5.6469232737388815,
+    7.56256082439209,
+    0.0,
+    5.974878731322299,
+    4.898357301336036,
+    6.442097410320605,
+    5.227077347287883,
+    7.134101195584642,
+    5.457753923371659,
+    7.0,
+    6.373112846266441,
+    5.540261147481582,
+    5.974878731322299,
+    0.0,
+    5.5507273748583,
+    4.897749658726415,
+    9.0,
+    8.398776718824767,
+    3.908281400328807,
+    4.83431066343688,
+    4.0212880272531715,
+    4.832322929216881,
+    4.898357301336036,
+    5.5507273748583,
+    0.0,
+    6.632989819428174,
+    7.438852294822894,
+    5.6631570310967465,
+    7.579428202635459,
+    6.760811985364303,
+    6.916281504639404,
+    4.62003193872216,
+    6.442097410320605,
+    4.897749658726415,
+    6.632989819428174,
+    0.0,
+    5.249404187382862,
+    6.072559523278559,
+    4.07661278488929,
+    6.19678948003145,
+    5.741508386786526,
+    6.498056792320361,
+    5.227077347287883,
+    9.0,
+    7.438852294822894,
+    5.249404187382862,
+    0.0,
+    3.854811639654704,
+    6.652724827169063,
+    5.298236851430971,
+    5.411470999663036,
+    4.309846252268695,
+    7.134101195584642,
+    8.398776718824767,
+    5.6631570310967465,
+    6.072559523278559,
+    3.854811639654704,
+    0.0,
+    7.529184598969917,
+    6.903282911791188,
+    9.0,
+    6.317531174829905,
+    5.457753923371659,
+    3.908281400328807,
+    7.579428202635459,
+    4.07661278488929,
+    6.652724827169063,
+    7.529184598969917,
+    0.0,
+    7.0,
+    4.977014354725805,
+    6.016362684141827,
+    7.0,
+    4.83431066343688,
+    6.760811985364303,
+    6.19678948003145,
+    5.298236851430971,
+    6.903282911791188,
+    7.0,
+    0.0},
+   raft::distance::DistanceType::Canberra},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    1.31462855332296,
+    1.3690307816129905,
+    1.698603990921237,
+    1.3460470789553531,
+    1.6636670712582544,
+    1.2651744044972217,
+    1.1938329352055201,
+    1.8811409082590185,
+    1.3653115050624267,
+    1.31462855332296,
+    0.0,
+    1.9447722703291133,
+    1.42818777206562,
+    1.4685491458946494,
+    1.3071999866010466,
+    1.4988622861692171,
+    0.9698559287406783,
+    1.4972023224597841,
+    1.5243383567266802,
+    1.3690307816129905,
+    1.9447722703291133,
+    0.0,
+    1.2748400840107568,
+    1.0599569946448246,
+    1.546591282841402,
+    1.147526531928459,
+    1.447002179128145,
+    1.5982242387673176,
+    1.3112533607072414,
+    1.698603990921237,
+    1.42818777206562,
+    1.2748400840107568,
+    0.0,
+    1.038121552545461,
+    1.011788365364402,
+    1.3907391109256988,
+    1.3128200942311496,
+    1.19595706584447,
+    1.3233328139624725,
+    1.3460470789553531,
+    1.4685491458946494,
+    1.0599569946448246,
+    1.038121552545461,
+    0.0,
+    1.3642741698145529,
+    1.3493868683808095,
+    1.394942694628328,
+    1.572881849642552,
+    1.380122665319464,
+    1.6636670712582544,
+    1.3071999866010466,
+    1.546591282841402,
+    1.011788365364402,
+    1.3642741698145529,
+    0.0,
+    1.018961640373018,
+    1.0114394258945634,
+    0.8338711034820684,
+    1.1247823842299223,
+    1.2651744044972217,
+    1.4988622861692171,
+    1.147526531928459,
+    1.3907391109256988,
+    1.3493868683808095,
+    1.018961640373018,
+    0.0,
+    0.7701238110357329,
+    1.245486437864406,
+    0.5551259549534626,
+    1.1938329352055201,
+    0.9698559287406783,
+    1.447002179128145,
+    1.3128200942311496,
+    1.394942694628328,
+    1.0114394258945634,
+    0.7701238110357329,
+    0.0,
+    1.1886800117391216,
+    1.0083692448135637,
+    1.8811409082590185,
+    1.4972023224597841,
+    1.5982242387673176,
+    1.19595706584447,
+    1.572881849642552,
+    0.8338711034820684,
+    1.245486437864406,
+    1.1886800117391216,
+    0.0,
+    1.3661374102525012,
+    1.3653115050624267,
+    1.5243383567266802,
+    1.3112533607072414,
+    1.3233328139624725,
+    1.380122665319464,
+    1.1247823842299223,
+    0.5551259549534626,
+    1.0083692448135637,
+    1.3661374102525012,
+    0.0},
+   raft::distance::DistanceType::LpUnexpanded,
+   2.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    0.9251771844789913,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.717493881903289,
+    0.6920214832303888,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.0,
+    0.9036452083899731,
+    0.8655339692155823,
+    0.8706483735804971,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.6329837991017668,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.0,
+    0.7988276152181608,
+    0.7028075145996631,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.8429599432532096,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.7988276152181608,
+    0.0,
+    0.48376552205293305,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8429599432532096,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.7028075145996631,
+    0.48376552205293305,
+    0.0,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8429599432532096,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.0,
+    0.8853924473642432,
+    0.535821510936138,
+    0.6497196601457607,
+    0.8853924473642432,
+    0.717493881903289,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.0,
+    0.5279604218147174,
+    0.6658348373853169,
+    0.33799874888632914,
+    0.6920214832303888,
+    0.6329837991017668,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.535821510936138,
+    0.5279604218147174,
+    0.0,
+    0.662579808115858,
+    0.5079750812968089,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.6497196601457607,
+    0.6658348373853169,
+    0.662579808115858,
+    0.0,
+    0.8429599432532096,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.33799874888632914,
+    0.5079750812968089,
+    0.8429599432532096,
+    0.0},
+   raft::distance::DistanceType::Linf},
+
+  {4,
+   {0, 1, 1, 2, 4},
+   {3, 2, 0, 1},  // indices
+   {0.99296, 0.42180, 0.11687, 0.305869},
+   {
+     // dense output
+     0.0,
+     0.99296,
+     1.41476,
+     1.415707,
+     0.99296,
+     0.0,
+     0.42180,
+     0.42274,
+     1.41476,
+     0.42180,
+     0.0,
+     0.84454,
+     1.41570,
+     0.42274,
+     0.84454,
+     0.0,
+   },
+   raft::distance::DistanceType::L1},
+  {10,
+   {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45},
+   {0,  1, 5,  6,  9,  1, 4, 14, 7, 3, 4,  7, 9, 11, 14,
+    0,  3, 7,  8,  12, 0, 2, 5,  7, 8, 14, 4, 9, 10, 11,
+    13, 4, 10, 14, 5,  6, 8, 9,  0, 2, 3,  4, 6, 10, 11},
+   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507,
+    0.73789274, 0.08450219, 1.,         0.20184723, 0.18036963, 0.12581403,
+    0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555,
+    0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
+    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881,
+    0.15605804, 0.3867739,  0.24908977, 0.36413632, 0.37643732, 0.28910679,
+    0.0198409,  0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969,
+    0.26190054, 0.2077349,  0.10803964},
+   {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01,
+    9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00,
+    6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08,
+    1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01,
+    9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 8.98332369e-01,
+    1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01,
+    6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00,
+    1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01,
+    8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01,
+    7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01,
+    9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01,
+    0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00,
+    9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01,
+    8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0,
+    1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01,
+    8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01,
+    1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 7.63596268e-01,
+    8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01,
+    1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01,
+    7.63596268e-01, 0.00000000e+00, 1.00000000e+00, 7.95485011e-01,
+    6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01,
+    9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00,
+    0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01,
+    1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01,
+    7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08},
+   // Each input row is L1-normalized, i.e. it sums to 1 like a probability dist.
+   raft::distance::DistanceType::HellingerExpanded}};
+
+using SparseDistanceTestF = SparseDistanceTest<int, float>;
+TEST_P(SparseDistanceTestF, Result) { compare(); }  // check done by the fixture
+INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // namespace distance
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu
new file mode 100644
index 0000000000..f7954f899f
--- /dev/null
+++ b/cpp/test/sparse/filter.cu
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <raft/sparse/op/sort.h>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/sparse/coo.cuh>
+#include <raft/sparse/op/filter.cuh>
+
+#include <iostream>
+
+namespace raft {
+namespace sparse {
+
+template <typename T>
+struct SparseFilterInputs {
+  int m, n, nnz;  // only nnz is read by the test in this file; m, n are unused
+  unsigned long long int seed;  // seed handed to raft::random::Rng
+};
+
+template <typename T>
+class SparseFilterTests
+  : public ::testing::TestWithParam<SparseFilterInputs<T>> {
+ protected:
+  // The fixture sets nothing up and tears nothing down.
+  void SetUp() override {}
+  void TearDown() override {}
+
+  // Parameter set; assigned from GetParam() inside the test body.
+  SparseFilterInputs<T> params;
+};
+
+const std::vector<SparseFilterInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};  // {m, n, nnz, seed}
+
+typedef SparseFilterTests<float> COORemoveZeros;
+// Zeroes three of five random COO values and checks that coo_remove_zeros
+// keeps exactly the two remaining entries (the expectations assume nnz == 5).
+TEST_P(COORemoveZeros, Result) {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+  std::shared_ptr<raft::mr::device::allocator> alloc(
+    new raft::mr::device::default_allocator);
+  params = ::testing::TestWithParam<SparseFilterInputs<float>>::GetParam();
+
+  // Host staging buffers are vectors so an early ASSERT return cannot leak.
+  std::vector<float> in_h_vals(params.nnz);
+  std::vector<int> in_h_rows(params.nnz);
+  std::vector<int> in_h_cols(params.nnz);
+
+  // NOTE(review): 5, 5 are presumably n_rows, n_cols; params.m/n are not used.
+  COO<float> in(alloc, stream, params.nnz, 5, 5);
+
+  raft::random::Rng r(params.seed);
+  r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream);
+
+  raft::update_host(in_h_vals.data(), in.vals(), params.nnz, stream);
+  // The copy was issued on `stream`; wait for it before editing the values.
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  in_h_vals[0] = 0;
+  in_h_vals[2] = 0;
+  in_h_vals[3] = 0;
+
+  // Entry i is placed at (row nnz - 1 - i, col i).
+  for (int i = 0; i < params.nnz; i++) {
+    in_h_rows[i] = params.nnz - i - 1;
+    in_h_cols[i] = i;
+  }
+
+  raft::update_device(in.rows(), in_h_rows.data(), params.nnz, stream);
+  raft::update_device(in.cols(), in_h_cols.data(), params.nnz, stream);
+  raft::update_device(in.vals(), in_h_vals.data(), params.nnz, stream);
+
+  op::coo_sort<float>(&in, alloc, stream);
+
+  // Non-zero entries in row order: entry 4 at (0, 4), then entry 1 at (3, 1).
+  const int out_rows_ref_h[2] = {0, 3};
+  const int out_cols_ref_h[2] = {4, 1};
+  const float out_vals_ref_h[2] = {in_h_vals[4], in_h_vals[1]};
+
+  COO<float> out_ref(alloc, stream, 2, 5, 5);
+  COO<float> out(alloc, stream);
+
+  raft::update_device(out_ref.rows(), out_rows_ref_h, 2, stream);
+  raft::update_device(out_ref.cols(), out_cols_ref_h, 2, stream);
+  raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream);
+
+  op::coo_remove_zeros<32, float>(&in, &out, alloc, stream);
+
+  // Only the two expected entries of each output array are compared.
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2,
+                                       raft::Compare<float>()));
+
+  CUDA_CHECK(cudaStreamDestroy(stream));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros,
+                        ::testing::ValuesIn(inputsf));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
new file mode 100644
index 0000000000..0f773b9fee
--- /dev/null
+++ b/cpp/test/sparse/knn.cu
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cusparse_v2.h>
+#include <gtest/gtest.h>
+
+#include <raft/sparse/selection/knn.cuh>
+#include "../test_utils.h"
+
+#include <raft/cudart_utils.h>
+#include <raft/sparse/cusparse_wrappers.h>
+#include <raft/mr/device/allocator.hpp>
+#include <raft/mr/device/buffer.hpp>
+
+namespace raft {
+namespace sparse {
+namespace selection {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseKNNInputs {  // one parameterized case for SparseKNNTest
+  value_idx n_cols;  // column count handed to brute_force_knn
+  // Host copies of the sparse input (indptr / indices / data arrays).
+  std::vector<value_idx> indptr_h;
+  std::vector<value_idx> indices_h;
+  std::vector<value_t> data_h;
+  // Reference distances / indices the computed output is compared against.
+  std::vector<value_t> out_dists_ref_h;
+  std::vector<value_idx> out_indices_ref_h;
+
+  int k;  // forwarded to brute_force_knn; outputs hold n_rows * k entries
+  // Batch sizes forwarded to brute_force_knn (default 2 each).
+  int batch_size_index = 2;
+  int batch_size_query = 2;
+  // Distance metric forwarded to brute_force_knn.
+  raft::distance::DistanceType metric =
+    raft::distance::DistanceType::L2Expanded;
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(::std::ostream &os,
+                           const SparseKNNInputs<value_idx, value_t> &dims) {
+  return os;  // writes nothing: `dims` is ignored and `os` is returned as-is
+}
+
+template <typename value_idx, typename value_t>
+class SparseKNNTest  // runs brute_force_knn in SetUp(); compare() checks it
+  : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
+ protected:
+  void make_data() {  // uploads inputs/references and allocates the outputs
+    const auto &indptr_h = params.indptr_h;
+    const auto &indices_h = params.indices_h;
+    const auto &data_h = params.data_h;
+    // Bound by reference: no copy, and the host data outlives make_data().
+    printf("Allocating input\n");
+
+    allocate(indptr, indptr_h.size());
+    allocate(indices, indices_h.size());
+    allocate(data, data_h.size());
+
+    printf("Updating device\n");
+
+    update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices, indices_h.data(), indices_h.size(), stream);
+    update_device(data, data_h.data(), data_h.size(), stream);
+
+    const auto &out_dists_ref_h = params.out_dists_ref_h;
+    const auto &out_indices_ref_h = params.out_indices_ref_h;
+
+    printf("Allocating ref output\n");
+    allocate(out_indices_ref, out_indices_ref_h.size());
+    allocate(out_dists_ref, out_dists_ref_h.size());
+
+    printf("Updating device\n");
+
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
+
+    printf("Allocating final output\n");
+
+    allocate(out_dists, n_rows * k);
+    allocate(out_indices, n_rows * k);
+
+    printf("Done.\n");
+  }
+
+  void SetUp() override {
+    params =
+      ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
+
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    CUSPARSE_CHECK(cusparseCreate(&cusparseHandle));
+    // indptr is expected to have n_rows + 1 entries.
+    n_rows = params.indptr_h.size() - 1;
+    nnz = params.indices_h.size();
+    k = params.k;
+
+    printf("Making data\n");
+
+    make_data();
+
+    printf("About to run kselect\n");
+    // The same matrix is passed as both input operands of the search.
+    raft::sparse::selection::brute_force_knn<value_idx, value_t>(
+      indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data,
+      nnz, n_rows, params.n_cols, out_indices, out_dists, k, cusparseHandle,
+      alloc, stream, params.batch_size_index, params.batch_size_query,
+      params.metric);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    printf("Executed k-select\n");
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(indptr));
+    CUDA_CHECK(cudaFree(indices));
+    CUDA_CHECK(cudaFree(data));
+    CUDA_CHECK(cudaFree(out_indices));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_indices_ref));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+    CUSPARSE_CHECK(cusparseDestroy(cusparseHandle));  // created in SetUp
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k,
+                            CompareApprox<value_t>(1e-4)));
+    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k,
+                            Compare<value_idx>()));
+  }
+
+ protected:
+  cudaStream_t stream;
+  cusparseHandle_t cusparseHandle;
+
+  int n_rows, nnz, k;
+
+  // input data
+  value_idx *indptr, *indices;
+  value_t *data;
+
+  // output data
+  value_idx *out_indices;
+  value_t *out_dists;
+
+  value_idx *out_indices_ref;
+  value_t *out_dists_ref;
+
+  SparseKNNInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseKNNInputs<int, float>> inputs_i32_f = {
+  {9,                                                 // ncols
+   {0, 2, 4, 6, 8},                                   // indptr
+   {0, 4, 0, 3, 0, 2, 0, 8},                          // indices
+   {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f},  // data
+   {0, 1.41421, 0, 7.87401, 0, 7.87401, 0, 1.41421},  // dists
+   {0, 3, 1, 0, 2, 0, 3, 0},                          // inds
+   2,
+   2,
+   2,
+   raft::distance::DistanceType::L2Expanded}};
+typedef SparseKNNTest<int, float> SparseKNNTestF;
+TEST_P(SparseKNNTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace selection
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
new file mode 100644
index 0000000000..7adbbf8b9a
--- /dev/null
+++ b/cpp/test/sparse/norm.cu
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include <raft/sparse/csr.cuh>
+#include <raft/sparse/linalg/norm.cuh>
+#include "../test_utils.h"
+
+#include <iostream>
+#include <limits>
+
+namespace raft {
+namespace sparse {
+
+enum NormalizeMethod { MAX, L1 };
+
+template <typename Type_f, typename Index_>
+struct CSRRowNormalizeInputs {  // one case for CSRRowNormalizeTest
+  NormalizeMethod method;       // selects the MAX or L1 normalize call
+  std::vector<Index_> ex_scan;  // its size is used as n_rows in Run()
+  std::vector<Type_f> in_vals;  // values to normalize; its size is nnz
+  std::vector<Type_f> verify;   // expected contents of `result`
+};
+
+template <typename Type_f, typename Index_>
+class CSRRowNormalizeTest
+  : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
+ protected:
+  void SetUp() override {  // creates the stream and the device buffers
+    params = ::testing::TestWithParam<
+      CSRRowNormalizeInputs<Type_f, Index_>>::GetParam();
+    CUDA_CHECK(cudaStreamCreate(&stream));  // checked, like the frees below
+
+    raft::allocate(in_vals, params.in_vals.size());
+    raft::allocate(verify, params.verify.size());
+    raft::allocate(ex_scan, params.ex_scan.size());
+    raft::allocate(result, params.verify.size(), true);
+  }
+
+  void Run() {  // uploads the case, runs the selected call, checks `result`
+    Index_ n_rows = params.ex_scan.size();
+    Index_ nnz = params.in_vals.size();
+
+    raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
+    raft::update_device(in_vals, params.in_vals.data(), nnz, stream);
+    raft::update_device(verify, params.verify.data(), nnz, stream);
+
+    switch (params.method) {
+      case MAX:
+        linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows,
+                                                  result, stream);
+        break;
+      case L1:
+        linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows,
+                                                 result, stream);
+        break;
+    }
+
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(ex_scan));
+    CUDA_CHECK(cudaFree(in_vals));
+    CUDA_CHECK(cudaFree(verify));
+    CUDA_CHECK(cudaFree(result));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  CSRRowNormalizeInputs<Type_f, Index_> params;
+  cudaStream_t stream;
+  Index_ *ex_scan;
+  Type_f *in_vals, *result, *verify;
+};
+
+using CSRRowNormalizeTestF = CSRRowNormalizeTest<float, int>;
+TEST_P(CSRRowNormalizeTestF, Result) { Run(); }
+
+using CSRRowNormalizeTestD = CSRRowNormalizeTest<double, int>;
+TEST_P(CSRRowNormalizeTestD, Result) { Run(); }
+
+const std::vector<CSRRowNormalizeInputs<float, int>> csrnormalize_inputs_f = {
+  {MAX,
+   {0, 4, 8, 9},
+   {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0},
+   {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}},
+  {L1,
+   {0, 4, 8, 9},
+   {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0},
+   {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}},
+};
+const std::vector<CSRRowNormalizeInputs<double, int>> csrnormalize_inputs_d = {
+  {MAX,
+   {0, 4, 8, 9},
+   {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0},
+   {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}},
+  {L1,
+   {0, 4, 8, 9},
+   {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0},
+   {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}},
+};
+
+INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF,
+                        ::testing::ValuesIn(csrnormalize_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD,
+                        ::testing::ValuesIn(csrnormalize_inputs_d));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
new file mode 100644
index 0000000000..b64fa25883
--- /dev/null
+++ b/cpp/test/sparse/row_op.cu
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <raft/sparse/csr.cuh>
+#include <raft/sparse/op/row_op.cuh>
+
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <iostream>
+#include <limits>
+
+namespace raft {
+namespace sparse {
+
+template <typename Type_f, typename Index_>
+struct CSRRowOpInputs {  // one parameterized case for CSRRowOpTest
+  std::vector<Index_> ex_scan;  // uploaded as row_ind; its size is n_rows
+  std::vector<Type_f> verify;   // expected `result`; its size is nnz
+};
+
+/** Wrapper to call csr_row_op because the enclosing function of a __device__
+ *  lambda cannot have private or protected access within the class. */
+template <typename Type_f, typename Index_>
+void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz,
+                        Type_f *result, cudaStream_t stream) {
+  op::csr_row_op<Index_, 32>(
+    row_ind, n_rows, nnz,
+    [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {
+      for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row;
+    },
+    stream);
+}
+
+template <typename Type_f, typename Index_>
+class CSRRowOpTest
+  : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
+ protected:
+  void SetUp() override {  // creates the stream and the device buffers
+    params =
+      ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam();
+    CUDA_CHECK(cudaStreamCreate(&stream));  // checked, like the frees below
+    n_rows = params.ex_scan.size();
+    nnz = params.verify.size();
+
+    raft::allocate(verify, nnz);
+    raft::allocate(ex_scan, n_rows);
+    raft::allocate(result, nnz, true);
+  }
+
+  void Run() {  // uploads the case, runs the row op, checks `result`
+    raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
+    raft::update_device(verify, params.verify.data(), nnz, stream);
+
+    csr_row_op_wrapper<Type_f, Index_>(ex_scan, n_rows, nnz, result, stream);
+
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaFree(ex_scan));
+    CUDA_CHECK(cudaFree(verify));
+    CUDA_CHECK(cudaFree(result));
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  CSRRowOpInputs<Type_f, Index_> params;
+  cudaStream_t stream;
+  Index_ n_rows, nnz;
+  Index_ *ex_scan;
+  Type_f *result, *verify;
+};
+
+using CSRRowOpTestF = CSRRowOpTest<float, int>;
+TEST_P(CSRRowOpTestF, Result) { Run(); }
+
+using CSRRowOpTestD = CSRRowOpTest<double, int>;
+TEST_P(CSRRowOpTestD, Result) { Run(); }
+
+const std::vector<CSRRowOpInputs<float, int>> csrrowop_inputs_f = {
+  {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}},
+};
+const std::vector<CSRRowOpInputs<double, int>> csrrowop_inputs_d = {
+  {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}},
+};
+
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF,
+                        ::testing::ValuesIn(csrrowop_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD,
+                        ::testing::ValuesIn(csrrowop_inputs_d));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu
new file mode 100644
index 0000000000..46f2f6a844
--- /dev/null
+++ b/cpp/test/sparse/selection.cu
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+
+#include "../test_utils.h"
+
+#include <raft/sparse/utils.h>
+#include <raft/sparse/selection/selection.cuh>
+
+namespace raft {
+namespace sparse {
+namespace selection {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseSelectionInputs {  // one case for SparseSelectionTest
+  value_idx n_rows;  // the device buffers hold n_rows * n_cols entries
+  value_idx n_cols;
+  // Host-side input uploaded to the device `dists` buffer.
+  std::vector<value_t> dists_h;
+  // Reference output the select_k results are compared against.
+  std::vector<value_t> out_dists_ref_h;
+  std::vector<value_idx> out_indices_ref_h;
+
+  int k;  // forwarded to select_k; outputs hold n_rows * k entries
+
+  bool select_min;  // forwarded to select_k
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream &operator<<(
+  ::std::ostream &os, const SparseSelectionInputs<value_idx, value_t> &dims) {
+  return os;  // writes nothing: `dims` is ignored and `os` is returned as-is
+}
+
+template <typename value_idx, typename value_t>
+class SparseSelectionTest
+  : public ::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>> {
+ protected:
+  void make_data() {
+    // Input distances: n_rows * n_cols entries copied from the test case.
+    auto host_dists = params.dists_h;
+    allocate(dists, n_rows * n_cols);
+    update_device(dists, host_dists.data(), host_dists.size(), stream);
+
+    // Index buffer of the same size, populated on the device by iota_fill.
+    allocate(inds, n_rows * n_cols);
+    iota_fill(inds, n_rows, n_cols, stream);
+
+    // Reference answers that compare() checks the outputs against.
+    auto ref_dists = params.out_dists_ref_h;
+    auto ref_inds = params.out_indices_ref_h;
+
+    allocate(out_indices_ref, ref_inds.size());
+    allocate(out_dists_ref, ref_dists.size());
+    update_device(out_indices_ref, ref_inds.data(), ref_inds.size(), stream);
+    update_device(out_dists_ref, ref_dists.data(), ref_dists.size(), stream);
+
+    // Output buffers: k entries per row.
+    allocate(out_dists, n_rows * k);
+    allocate(out_indices, n_rows * k);
+  }
+
+  void SetUp() override {
+    // Pull the current case, then build the device state and run select_k.
+    params = this->GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc =
+      std::make_shared<raft::mr::device::default_allocator>();
+    CUDA_CHECK(cudaStreamCreate(&stream));
+
+    n_rows = params.n_rows;
+    n_cols = params.n_cols;
+    k = params.k;
+
+    make_data();
+
+    raft::sparse::selection::select_k(
+      dists, inds, n_rows, n_cols, out_dists, out_indices, params.select_min,
+      k, stream);
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+  }
+
+  void TearDown() override {
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    // Release every device buffer before the stream itself goes away.
+    CUDA_CHECK(cudaFree(dists));
+    CUDA_CHECK(cudaFree(inds));
+    CUDA_CHECK(cudaFree(out_indices));
+    CUDA_CHECK(cudaFree(out_dists));
+    CUDA_CHECK(cudaFree(out_indices_ref));
+    CUDA_CHECK(cudaFree(out_dists_ref));
+
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+
+  void compare() {
+    ASSERT_TRUE(
+      devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k,
+                            Compare<value_idx>()));
+  }
+
+ protected:
+  cudaStream_t stream;
+
+  int n_rows, n_cols, k;
+
+  // device inputs
+  value_t *dists;
+  value_idx *inds;
+
+  // device outputs written by select_k
+  value_idx *out_indices;
+  value_t *out_dists;
+
+  value_idx *out_indices_ref;
+  value_t *out_dists_ref;
+
+  SparseSelectionInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseSelectionInputs<int, float>> inputs_i32_f = {
+  {5,
+   5,
+   {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
+    1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+   {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0,
+    4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0},
+   {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3},
+   5,
+   true}};
+typedef SparseSelectionTest<int, float> SparseSelectionTestF;
+TEST_P(SparseSelectionTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace selection
+};  // end namespace sparse
+};  // end namespace raft
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
new file mode 100644
index 0000000000..b9a8b849eb
--- /dev/null
+++ b/cpp/test/sparse/sort.cu
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <raft/sparse/op/sort.h>
+#include <raft/mr/device/allocator.hpp>
+
+#include <iostream>
+
+namespace raft {
+namespace sparse {
+
+template <typename T>
+struct SparseSortInput {  // one parameterized case for the COOSort test
+  int m, n, nnz;  // passed to coo_sort as its first three arguments
+  unsigned long long int seed;  // seeds the Rng that fills in_vals
+};
+
+template <typename T>
+class SparseSortTest : public ::testing::TestWithParam<SparseSortInput<T>> {
+ protected:
+  void SetUp() override {}  // empty: the test body sets everything up
+
+  void TearDown() override {}  // empty: the test body cleans up after itself
+
+ protected:
+  SparseSortInput<T> params;  // assigned from GetParam() in the test body
+};
+
+const std::vector<SparseSortInput<float>> inputsf = {{5, 10, 5, 1234ULL}};
+
+typedef SparseSortTest<float> COOSort;
+TEST_P(COOSort, Result) {
+  int *in_rows, *in_cols, *verify;
+  float *in_vals;
+
+  params = ::testing::TestWithParam<SparseSortInput<float>>::GetParam();
+  raft::random::Rng r(params.seed);
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+  std::shared_ptr<raft::mr::device::allocator> alloc(
+    new raft::mr::device::default_allocator);
+
+  raft::allocate(in_vals, params.nnz);
+  r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream);
+  // Host staging buffers: allocated with new[] to pair with delete[] below.
+  int *in_rows_h = new int[params.nnz];
+  int *in_cols_h = new int[params.nnz];
+  int *verify_h = new int[params.nnz];
+  // Rows are uploaded in descending order; `verify` holds the sorted order.
+  for (int i = 0; i < params.nnz; i++) {
+    in_rows_h[i] = params.nnz - i - 1;
+    verify_h[i] = i;
+    in_cols_h[i] = i;
+  }
+
+  raft::allocate(in_rows, params.nnz);
+  raft::allocate(in_cols, params.nnz);
+  raft::allocate(verify, params.nnz);
+
+  raft::update_device(in_rows, in_rows_h, params.nnz, stream);
+
+  raft::update_device(in_cols, in_cols_h, params.nnz, stream);
+  raft::update_device(verify, verify_h, params.nnz, stream);
+
+  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc,
+               stream);
+  // After sorting, the row indices must equal 0..nnz-1.
+  ASSERT_TRUE(
+    raft::devArrMatch<int>(verify, in_rows, params.nnz, raft::Compare<int>()));
+
+  delete[] in_rows_h;
+  delete[] in_cols_h;
+  delete[] verify_h;
+
+  CUDA_CHECK(cudaFree(in_rows));
+  CUDA_CHECK(cudaFree(in_cols));
+  CUDA_CHECK(cudaFree(in_vals));
+  CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaStreamDestroy(stream));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseSortTest, COOSort, ::testing::ValuesIn(inputsf));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
new file mode 100644
index 0000000000..07dd9d11a2
--- /dev/null
+++ b/cpp/test/sparse/symmetrize.cu
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.cuh>
+#include "../test_utils.h"
+
+#include <raft/sparse/coo.cuh>
+#include <raft/sparse/linalg/symmetrize.cuh>
+
+#include <iostream>
+
+namespace raft {
+namespace sparse {
+
+template <typename T>
+struct SparseSymmetrizeInput {  // parameter type for the COOSymmetrize test
+  int m, n, nnz;  // NOTE(review): the COOSymmetrize body never reads these
+  unsigned long long int seed;  // NOTE(review): likewise never read there
+};
+
+template <typename T>
+class SparseSymmetrizeTest
+  : public ::testing::TestWithParam<SparseSymmetrizeInput<T>> {
+ protected:
+  void SetUp() override {}  // empty: the test body sets everything up
+
+  void TearDown() override {}  // empty: the test body cleans up after itself
+
+ protected:
+  SparseSymmetrizeInput<T> params;  // not assigned by the COOSymmetrize body
+};
+
+const std::vector<SparseSymmetrizeInput<float>> inputsf = {{5, 10, 5, 1234ULL}};
+
+typedef SparseSymmetrizeTest<float> COOSymmetrize;
+TEST_P(COOSymmetrize, Result) {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  std::shared_ptr<raft::mr::device::default_allocator> alloc(
+    new raft::mr::device::default_allocator);
+
+  int nnz = 8;
+  // Input triplets (row, col, value) for a 4 x 4 COO matrix.
+  int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
+  float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
+  // Expected output: the test asserts exactly 2 * nnz entries.
+  int *exp_rows_h =
+    new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
+  int *exp_cols_h =
+    new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
+  float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0,
+                                         0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
+
+  COO<float> in(alloc, stream, nnz, 4, 4);
+  raft::update_device(in.rows(), in_rows_h, nnz, stream);
+  raft::update_device(in.cols(), in_cols_h, nnz, stream);
+  raft::update_device(in.vals(), in_vals_h, nnz, stream);
+
+  COO<float> out(alloc, stream);
+  // The reduction lambda adds each value to its transposed counterpart.
+  linalg::coo_symmetrize<32, float>(
+    &in, &out,
+    [] __device__(int row, int col, float val, float trans) {
+      return val + trans;
+    },
+    alloc, stream);
+
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  std::cout << out << std::endl;
+
+  ASSERT_TRUE(out.nnz == nnz * 2);
+  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz,
+                                       raft::Compare<float>()));
+
+  CUDA_CHECK(cudaStreamDestroy(stream));
+
+  delete[] in_rows_h;
+  delete[] in_cols_h;
+  delete[] in_vals_h;
+
+  delete[] exp_rows_h;
+  delete[] exp_cols_h;
+  delete[] exp_vals_h;
+}
+
+INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, COOSymmetrize,
+                        ::testing::ValuesIn(inputsf));
+
+}  // namespace sparse
+}  // namespace raft

From 9798885207aded2cceb37e05053da5a8f59ab206 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Thu, 11 Feb 2021 15:55:38 -0600
Subject: [PATCH 06/11] perf check

---
 cpp/include/raft/handle.hpp |  2 ++
 cpp/test/handle.cpp         | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index a42fdd67b2..92fd6c2663 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -71,6 +71,8 @@ class handle_t {
   handle_t(const handle_t& h) : dev_id_(h.get_device()) {}
   handle_t(const handle_t&& h) : dev_id_(h.get_device()) {}
 
+  // light copy operator
+  // skip streams, comms, and libs handles
   handle_t& operator=(const handle_t& h) {
     prop_ = h.get_device_properties();
     device_prop_initialized_ = true;
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index 8fef4ead61..ead7382b1b 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -64,6 +64,18 @@ TEST(Raft, GetHandleFromPool) {
   ASSERT_EQ(parent.get_device(), child.get_device());
 }
 
+TEST(Raft, GetHandleFromPoolPerf) {
+  handle_t parent(100);
+  auto start = curTimeMillis();
+  for (int i = 0; i < parent.get_num_internal_streams(); i++) {
+    auto child = parent.get_handle_from_internal_pool(i);
+    ASSERT_EQ(parent.get_internal_stream(i), child.get_stream());
+    child.wait_on_user_stream();
+  }
+  // upperbound on 0.1ms per child handle
+  ASSERT_LE(curTimeMillis() - start, 10);
+}
+
 TEST(Raft, GetHandleStreamViews) {
   handle_t parent(4);
 

From def166f7c26d73e91db6a1d554dea6a32381f603 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Fri, 12 Feb 2021 12:26:36 -0600
Subject: [PATCH 07/11] reviews

---
 cpp/include/raft/handle.hpp | 34 +++++++++++++++++-----------------
 cpp/test/handle.cpp         |  6 +++---
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index 92fd6c2663..42a1e4ebb8 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -68,17 +68,25 @@ class handle_t {
       host_allocator_(std::make_shared<mr::host::default_allocator>()) {
     create_resources();
   }
-  handle_t(const handle_t& h) : dev_id_(h.get_device()) {}
-  handle_t(const handle_t&& h) : dev_id_(h.get_device()) {}
 
-  // light copy operator
-  // skip streams, comms, and libs handles
-  handle_t& operator=(const handle_t& h) {
-    prop_ = h.get_device_properties();
+  /**
+   * @brief Construct a light copy of another handle.
+   * The user stream, cuda handles, comms and worker pool are not copied.
+   * The user stream of the constructed handle is set to the specified stream
+   * of the other handle's worker pool.
+   * @param[in] stream_id id of the stream in `other`'s worker streams
+   * to be set as user stream in the constructed handle
+   * @param[in] n_streams number of worker streams to be created
+   */
+  handle_t(const handle_t& other, int stream_id,
+           int n_streams = kNumDefaultWorkerStreams)
+    : dev_id_(other.get_device()), streams_(n_streams) {
+    prop_ = other.get_device_properties();
     device_prop_initialized_ = true;
-    device_allocator_ = get_device_allocator();
-    host_allocator_ = get_host_allocator();
-    return *this;
+    device_allocator_ = other.get_device_allocator();
+    host_allocator_ = other.get_host_allocator();
+    create_resources();
+    set_stream(other.get_internal_stream(stream_id));
   }
 
   /** Destroys all held-up resources */
@@ -160,14 +168,6 @@ class handle_t {
     return int_streams_vec;
   }
 
-  handle_t get_handle_from_internal_pool(
-    int stream_id, int n_streams = kNumDefaultWorkerStreams) const {
-    handle_t handle(n_streams);
-    handle = *this;
-    handle.set_stream(this->get_internal_stream(stream_id));
-    return handle;
-  }
-
   void wait_on_user_stream() const {
     CUDA_CHECK(cudaEventRecord(event_, user_stream_));
     for (int i = 0; i < get_num_internal_streams(); i++) {
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index ead7382b1b..4cb9809844 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -53,7 +53,7 @@ TEST(Raft, GetInternalStreams) {
 TEST(Raft, GetHandleFromPool) {
   handle_t parent(4);
 
-  auto child = parent.get_handle_from_internal_pool(2);
+  handle_t child(parent, 2);
   ASSERT_EQ(parent.get_internal_stream(2), child.get_stream());
   ASSERT_EQ(0, child.get_num_internal_streams());
 
@@ -68,7 +68,7 @@ TEST(Raft, GetHandleFromPoolPerf) {
   handle_t parent(100);
   auto start = curTimeMillis();
   for (int i = 0; i < parent.get_num_internal_streams(); i++) {
-    auto child = parent.get_handle_from_internal_pool(i);
+    handle_t child(parent, i);
     ASSERT_EQ(parent.get_internal_stream(i), child.get_stream());
     child.wait_on_user_stream();
   }
@@ -79,7 +79,7 @@ TEST(Raft, GetHandleFromPoolPerf) {
 TEST(Raft, GetHandleStreamViews) {
   handle_t parent(4);
 
-  auto child = parent.get_handle_from_internal_pool(2);
+  handle_t child(parent, 2);
   ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view());
   ASSERT_EQ(parent.get_internal_stream_view(2).value(),
             child.get_stream_view().value());

From 3b495a631c47650640d0cb5f1fb2f47e3283c9c8 Mon Sep 17 00:00:00 2001
From: Jordan Jacobelli <jjacobelli@nvidia.com>
Date: Tue, 16 Feb 2021 20:22:30 +0100
Subject: [PATCH 08/11] Add GHA to mark issues/prs as stale/rotten (#150)

Issues and PRs without activity for 30d will be marked as stale.
If there is no activity for 90d, they will be marked as rotten.

Authors:
  - Jordan Jacobelli (@Ethyling)

Approvers:
  - Dillon Cullinan (@dillon-cullinan)

URL: https://github.com/rapidsai/raft/pull/150
---
 .github/workflows/stale.yaml | 65 ++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 .github/workflows/stale.yaml

diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
new file mode 100644
index 0000000000..3b7de7ec69
--- /dev/null
+++ b/.github/workflows/stale.yaml
@@ -0,0 +1,65 @@
+name: Mark stale and rotten issues and pull requests
+
+on:
+  schedule:
+    - cron: "0 * * * *"
+
+jobs:
+  mark-stale-issues:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark Issues as Stale
+        uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-issue-message: >
+            This issue has been marked stale due to no recent activity in the past 30d.
+            Please close this issue if no further response or action is needed.
+            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
+            This issue will be marked rotten if there is no activity in the next 60d.
+          stale-issue-label: "stale"
+          days-before-issue-stale: 30
+          days-before-issue-close: -1
+  mark-stale-prs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark PRs as Stale
+        uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-pr-message: >
+            This PR has been marked stale due to no recent activity in the past 30d.
+            Please close this PR if it is no longer required.
+            Otherwise, please respond with a comment indicating any updates.
+            This PR will be marked rotten if there is no activity in the next 60d.
+          stale-pr-label: "stale"
+          days-before-pr-stale: 30
+          days-before-pr-close: -1
+  mark-rotten-issues:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark Issues as Rotten
+        uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-issue-message: >
+            This issue has been marked rotten due to no recent activity in the past 90d.
+            Please close this issue if no further response or action is needed.
+            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
+          stale-issue-label: "rotten"
+          days-before-issue-stale: 90
+          days-before-issue-close: -1
+  mark-rotten-prs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark PRs as Rotten
+        uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-pr-message: >
+            This PR has been marked rotten due to no recent activity in the past 90d.
+            Please close this PR if it is no longer required.
+            Otherwise, please respond with a comment indicating any updates.
+          stale-pr-label: "rotten"
+          days-before-pr-stale: 90
+          days-before-pr-close: -1

From ac15d6932ab2f6ab23d9ae69d147a3708961dfc1 Mon Sep 17 00:00:00 2001
From: Mike Wendt <1915404+mike-wendt@users.noreply.github.com>
Date: Tue, 16 Feb 2021 22:11:52 -0500
Subject: [PATCH 09/11] Update stale GHA with exemptions & new labels (#152)

Follows #150

Updates the stale GHA with the following changes:

- [x] Uses `inactive-30d` and `inactive-90d` labels instead of `stale` and `rotten`
- [x] Updates comments to reflect changes in labels
- [x] Exempts the following labels from being marked `inactive-30d` or `inactive-90d`
  - `0 - Blocked`
  - `0 - Backlog`
  - `good first issue`

Authors:
  - Mike Wendt (@mike-wendt)

Approvers:
  - Ray Douglass (@raydouglass)

URL: https://github.com/rapidsai/raft/pull/152
---
 .github/workflows/stale.yaml | 50 +++++++++++++++---------------------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
index 3b7de7ec69..8b65da69aa 100644
--- a/.github/workflows/stale.yaml
+++ b/.github/workflows/stale.yaml
@@ -1,65 +1,57 @@
-name: Mark stale and rotten issues and pull requests
+name: Mark inactive issues and pull requests
 
 on:
   schedule:
     - cron: "0 * * * *"
 
 jobs:
-  mark-stale-issues:
+  mark-inactive-30d:
     runs-on: ubuntu-latest
     steps:
-      - name: Mark Issues as Stale
+      - name: Mark 30 day inactive issues and pull requests
         uses: actions/stale@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: >
-            This issue has been marked stale due to no recent activity in the past 30d.
+            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
             Please close this issue if no further response or action is needed.
             Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-            This issue will be marked rotten if there is no activity in the next 60d.
-          stale-issue-label: "stale"
+            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-issue-label: "inactive-30d"
+          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
           days-before-issue-stale: 30
           days-before-issue-close: -1
-  mark-stale-prs:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Mark PRs as Stale
-        uses: actions/stale@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-pr-message: >
-            This PR has been marked stale due to no recent activity in the past 30d.
+            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
             Please close this PR if it is no longer required.
             Otherwise, please respond with a comment indicating any updates.
-            This PR will be marked rotten if there is no activity in the next 60d.
-          stale-pr-label: "stale"
+            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-pr-label: "inactive-30d"
+          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
           days-before-pr-stale: 30
           days-before-pr-close: -1
-  mark-rotten-issues:
+          operations-per-run: 50
+  mark-inactive-90d:
     runs-on: ubuntu-latest
     steps:
-      - name: Mark Issues as Rotten
+      - name: Mark 90 day inactive issues and pull requests
         uses: actions/stale@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: >
-            This issue has been marked rotten due to no recent activity in the past 90d.
+            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
             Please close this issue if no further response or action is needed.
             Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-          stale-issue-label: "rotten"
+          stale-issue-label: "inactive-90d"
+          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
           days-before-issue-stale: 90
           days-before-issue-close: -1
-  mark-rotten-prs:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Mark PRs as Rotten
-        uses: actions/stale@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-pr-message: >
-            This PR has been marked rotten due to no recent activity in the past 90d.
+            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
             Please close this PR if it is no longer required.
             Otherwise, please respond with a comment indicating any updates.
-          stale-pr-label: "rotten"
+          stale-pr-label: "inactive-90d"
+          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
           days-before-pr-stale: 90
           days-before-pr-close: -1
+          operations-per-run: 50

From 30e341f483003a1094e326c49780c9382846d867 Mon Sep 17 00:00:00 2001
From: afender <afender@nvidia.com>
Date: Wed, 17 Feb 2021 17:56:56 -0600
Subject: [PATCH 10/11] Add error check: source handle must have at least one worker stream

---
 cpp/include/raft/handle.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index 42a1e4ebb8..dbe7e83189 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -81,6 +81,9 @@ class handle_t {
   handle_t(const handle_t& other, int stream_id,
            int n_streams = kNumDefaultWorkerStreams)
     : dev_id_(other.get_device()), streams_(n_streams) {
+    RAFT_EXPECTS(
+      other.get_num_internal_streams() > 0,
+      "ERROR: the main handle must have at least one worker stream\n");
     prop_ = other.get_device_properties();
     device_prop_initialized_ = true;
     device_allocator_ = other.get_device_allocator();

From 88fff801285b9192e0a89e29467b2c52e168e6b2 Mon Sep 17 00:00:00 2001
From: Joseph <68436579+jolorunyomi@users.noreply.github.com>
Date: Thu, 18 Feb 2021 10:07:34 -0600
Subject: [PATCH 11/11] Auto-label PRs based on their content (#117)

This PR adds the GitHub action [PR Labeler](https://github.com/actions/labeler) to auto-label PRs based on their content.

Labeling is managed with a configuration file `.github/labeler.yml` using the following [options](https://github.com/actions/labeler#usage).

Authors:
  - Joseph (@jolorunyomi)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - Mike Wendt (@mike-wendt)
  - Rick Ratzel (@rlratzel)

URL: https://github.com/rapidsai/raft/pull/117
---
 .github/labeler.yml           | 16 ++++++++++++++++
 .github/workflows/labeler.yml | 11 +++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 .github/labeler.yml
 create mode 100644 .github/workflows/labeler.yml

diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 0000000000..9809e2cc2e
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,16 @@
+# Label rules for the PR Labeler action; syntax: https://github.com/actions/labeler#common-examples
+# Path globs adapted from https://github.com/rapidsai/raft/blob/main/.github/CODEOWNERS
+# Label names taken from https://github.com/rapidsai/raft/labels
+
+python:
+  - "python/**"
+
+cpp:
+  - "cpp/**"
+
+CMake:
+  - "**/CMakeLists.txt"
+  - "**/cmake/**"
+
+gpuCI:
+  - "ci/**"
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 0000000000..55117f774a
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,11 @@
+name: "Pull Request Labeler"  # applies labels to PRs using the rules in .github/labeler.yml
+on:
+- pull_request_target
+
+jobs:
+ triage:
+   runs-on: ubuntu-latest
+   steps:
+   - uses: actions/labeler@v3  # pin a release tag; @main is a moving branch
+     with:
+       repo-token: "${{ secrets.GITHUB_TOKEN }}"