From 959bb29c0a825a2b3a1aac17f6171670f0eb2ffd Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 8 Dec 2022 20:41:25 +0000 Subject: [PATCH 01/20] gram matrix support for csr --- .../distance/detail/kernels/gram_matrix.cuh | 166 ++++++++++++++++++ .../detail/kernels/kernel_matrices.cuh | 146 +++++++++++++++ 2 files changed, 312 insertions(+) diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 344dda693e..6bc412e248 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -17,6 +17,8 @@ #pragma once #include +#include +#include #include #include @@ -77,6 +79,42 @@ class GramMatrixBase { evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } + virtual void operator()(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld_out = 0) + { + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluateSparse(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_indptr, + x2_indices, + x2_data, + x2_nnz, + n2, + out, + is_row_major, + stream, + ld_out); + } + /** Evaluate the Gram matrix for two vector sets using simple dot product. * * @param [in] x1 device array of vectors, size [n1*n_cols] @@ -107,6 +145,41 @@ class GramMatrixBase { linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } + virtual void evaluateSparse(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld_out) + { + linearSparse(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_indptr, + x2_indices, + x2_data, + x2_nnz, + n2, + out, + is_row_major, + stream, + ld_out); + } + // private: // The following methods should be private, they are kept public to avoid: // "error: The enclosing parent function ("distance") for an extended @@ -182,6 +255,99 @@ class GramMatrixBase { } } + void linearSparse(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld_out) + { + int minor_out = is_row_major ? 
n2 : n1; + ASSERT(ld_out == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); + distanceSparse(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_indptr, + x2_indices, + x2_data, + x2_nnz, + n2, + out, + is_row_major, + stream, + raft::distance::DistanceType::InnerProduct); + } + + void distanceSparse(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + raft::distance::DistanceType metric, + float metricArg = 0.0) + { + raft::sparse::distance::distances_config_t dist_config(handle); + + // switch a,b based on is_row_major + if (!is_row_major) { + dist_config.a_nrows = n2; + dist_config.a_ncols = n_cols; + dist_config.a_nnz = x2_nnz; + dist_config.a_indptr = const_cast(x2_indptr); + dist_config.a_indices = const_cast(x2_indices); + dist_config.a_data = const_cast(x2_data); + dist_config.b_nrows = n1; + dist_config.b_ncols = n_cols; + dist_config.b_nnz = x1_nnz; + dist_config.b_indptr = const_cast(x1_indptr); + dist_config.b_indices = const_cast(x1_indices); + dist_config.b_data = const_cast(x1_data); + } else { + dist_config.a_nrows = n1; + dist_config.a_ncols = n_cols; + dist_config.a_nnz = x1_nnz; + dist_config.a_indptr = const_cast(x1_indptr); + dist_config.a_indices = const_cast(x1_indices); + dist_config.a_data = const_cast(x1_data); + dist_config.b_nrows = n2; + dist_config.b_ncols = n_cols; + dist_config.b_nnz = x2_nnz; + dist_config.b_indptr = const_cast(x2_indptr); + dist_config.b_indices = const_cast(x2_indices); + dist_config.b_data = const_cast(x2_data); + } + + if (raft::sparse::distance::supportedDistance.find(metric) == + raft::sparse::distance::supportedDistance.end()) + THROW("DistanceType not supported: %d", metric); + + raft::sparse::distance::pairwiseDistance(out, dist_config, metric, metricArg); + } + /** Calculates the Gram matrix using Euclidean distance. * * Can be used as a building block for more complex kernel functions. diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index b74de84d80..b81ace83b3 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -100,6 +100,40 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga } } +/** Epiloge function for rbf kernel without padding. + * Calculates output = exp(-gain * input); + * @param inout device vector, size [len] + * @param len length of the input vector + * @param gain + */ +template +__global__ void rbf_kernel_nopad(math_t* inout, size_t len, math_t gain) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = exp(-1.0 * gain * inout[tid]); + } +} + +/** Epiloge function for rbf kernel without padding. 
+ * Calculates output = exp(-gain * input); + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param gain + */ +template +__global__ void rbf_kernel(math_t* inout, int ld, int rows, int cols, math_t gain) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = exp(-1.0 * gain * inout[tidx + tidy * ld]); + } +} + /** * Create a kernel matrix using polynomial kernel function. */ @@ -180,6 +214,42 @@ class PolynomialKernel : public GramMatrixBase { x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); applyKernel(out, ld_out, n1, n2, is_row_major, stream); } + + void evaluateSparse(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld_out) + { + GramMatrixBase::linearSparse(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_indptr, + x2_indices, + x2_data, + x2_nnz, + n2, + out, + is_row_major, + stream, + ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } }; /** @@ -260,6 +330,42 @@ class TanhKernel : public GramMatrixBase { x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); applyKernel(out, ld_out, n1, n2, is_row_major, stream); } + + void evaluateSparse(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld_out) + { + GramMatrixBase::linearSparse(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_indptr, + x2_indices, + x2_data, + x2_nnz, + n2, + out, + is_row_major, + stream, + ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } }; /** @@ -337,6 +443,46 @@ class RBFKernel : public GramMatrixBase { distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } + void evaluateSparse(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const int* x2_indptr, + const int* x2_indices, + const math_t* x2_data, + int x2_nnz, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld_out) + { + int minor_out = is_row_major ? 
n2 : n1; + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + + GramMatrixBase::distanceSparse(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_indptr, + x2_indices, + x2_data, + x2_nnz, + n2, + out, + is_row_major, + stream, + raft::distance::DistanceType::L2Unexpanded); + + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } + /** Customize distance function withe RBF epilogue */ void distance(const math_t* x1, int n1, From 36c56b1016cb3a27f4d6ffb9cd8764f58e129f75 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 2 Feb 2023 08:24:32 -0800 Subject: [PATCH 02/20] Add CSRxDense kernel compute, also add row norm for CSR --- .../distance/detail/kernels/gram_matrix.cuh | 153 +++++++++++++++++ .../detail/kernels/kernel_matrices.cuh | 156 ++++++++++++++++++ .../raft/sparse/linalg/detail/norm.cuh | 88 ++++++++++ cpp/include/raft/sparse/linalg/norm.cuh | 32 ++++ 4 files changed, 429 insertions(+) diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 6bc412e248..25c5992bc1 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -79,6 +80,46 @@ class GramMatrixBase { evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } + virtual void operator()(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const math_t* x2_data, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld2 = 0, + int ld_out = 0, + math_t* norm = nullptr, + int offset_x1 = 0, + int* idx_x2 = 0) + + { + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? 
n2 : n1; } + evaluateSparseX1(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_data, + n2, + out, + is_row_major, + stream, + ld2, + ld_out, + norm, + offset_x1, + idx_x2); + } + virtual void operator()(const raft::handle_t& handle, const int* x1_indptr, const int* x1_indices, @@ -145,6 +186,40 @@ class GramMatrixBase { linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } + virtual void evaluateSparseX1(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const math_t* x2_data, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld2, + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) + { + linearSparseX1(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_data, + n2, + out, + is_row_major, + stream, + ld2, + ld_out); + } + virtual void evaluateSparse(const raft::handle_t& handle, const int* x1_indptr, const int* x1_indices, @@ -255,6 +330,84 @@ class GramMatrixBase { } } + void linearSparseX1(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const math_t* x2_data, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld2, + int ld_out) + { + math_t alpha = 1.0; + math_t beta = 0.0; + + cusparseSpMatDescr_t descrX1; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr(&descrX1, + n1, + n_cols, + x1_nnz, + const_cast(x1_indptr), + const_cast(x1_indices), + const_cast(x1_data))); + + auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; + + cusparseDnMatDescr_t descrX2; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &descrX2, n2, n_cols, ld2, const_cast(x2_data), order)); + + cusparseDnMatDescr_t descrOut; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &descrOut, n1, n2, ld_out, const_cast(out), order)); + + auto alg = order == CUSPARSE_ORDER_COL ? CUSPARSE_SPMM_CSR_ALG1 : CUSPARSE_SPMM_CSR_ALG2; + + // compute X1*X2^T + auto opX1 = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto opX2 = CUSPARSE_OPERATION_TRANSPOSE; + + size_t bufferSize; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle.get_cusparse_handle(), + opX1, + opX2, + &alpha, + descrX1, + descrX2, + &beta, + descrOut, + alg, + &bufferSize, + stream)); + + raft::interruptible::synchronize(stream); + + rmm::device_uvector tmp(bufferSize, stream); + + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle.get_cusparse_handle(), + opX1, + opX2, + &alpha, + descrX1, + descrX2, + &beta, + descrOut, + alg, + tmp.data(), + stream)); + + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descrX1)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descrX2)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descrOut)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + void linearSparse(const raft::handle_t& handle, const int* x1_indptr, const int* x1_indices, diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index b81ace83b3..db9e16233b 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -134,6 +134,32 @@ __global__ void rbf_kernel(math_t* inout, int ld, int rows, int cols, math_t gai } } +/** Epiloge function for rbf kernel using expansion. 
+ * Calculates output_ij = exp(-gain * (norm_i + norm_j - 2*input_ij)); + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param norm norm for row indices + * @param offset_i offset into norm for rows (assumed to be coalesced) + * @param idx_j indirect column id to access norm + * @param gain + */ +template +__global__ void rbf_kernel_expanded( + math_t* inout, int ld, int rows, int cols, math_t* norm, int offset_i, int* idx_j, math_t gain) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) { + math_t norm_y = norm[idx_j[tidy]]; + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = + exp(-1.0 * gain * (norm[tidx + offset_i] + norm_y - inout[tidx + tidy * ld] * 2)); + } + } +} + /** * Create a kernel matrix using polynomial kernel function. */ @@ -215,6 +241,41 @@ class PolynomialKernel : public GramMatrixBase { applyKernel(out, ld_out, n1, n2, is_row_major, stream); } + void evaluateSparseX1(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const math_t* x2_data, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld2, + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) + { + GramMatrixBase::linearSparseX1(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_data, + n2, + out, + is_row_major, + stream, + ld2, + ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } + void evaluateSparse(const raft::handle_t& handle, const int* x1_indptr, const int* x1_indices, @@ -331,6 +392,41 @@ class TanhKernel : public GramMatrixBase { applyKernel(out, ld_out, n1, n2, is_row_major, stream); } + void evaluateSparseX1(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const math_t* x2_data, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld2, + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) + { + GramMatrixBase::linearSparseX1(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_data, + n2, + out, + is_row_major, + stream, + ld2, + ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } + void evaluateSparse(const raft::handle_t& handle, const int* x1_indptr, const int* x1_indices, @@ -392,6 +488,23 @@ class RBFKernel : public GramMatrixBase { } } + void applyExpandedRbfKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm, + int offset_i, + int* idx_j, + bool is_row_major, + cudaStream_t stream) + { + ASSERT(!is_row_major, "Expanded RBF kernel currently only supports col major format"); + rbf_kernel_expanded<<>>(inout, ld, rows, cols, norm, offset_i, idx_j, gain); + } + public: /** * Constructs a RBF kernel object. 
@@ -443,6 +556,49 @@ class RBFKernel : public GramMatrixBase { distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } + void evaluateSparseX1(const raft::handle_t& handle, + const int* x1_indptr, + const int* x1_indices, + const math_t* x1_data, + int x1_nnz, + int n1, + int n_cols, + const math_t* x2_data, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld2, + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) + { + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? n2 : n1; + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + + ASSERT(norm != nullptr, "RBF Kernel needs pre-computed norm for expanded distance compute"); + // compute L2 expanded + GramMatrixBase::linearSparseX1(handle, + x1_indptr, + x1_indices, + x1_data, + x1_nnz, + n1, + n_cols, + x2_data, + n2, + out, + is_row_major, + stream, + ld2, + ld_out); + + applyExpandedRbfKernel(out, ld_out, n1, n2, norm, offset_x1, idx_x2, is_row_major, stream); + } + void evaluateSparse(const raft::handle_t& handle, const int* x1_indptr, const int* x1_indices, diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index c2a8aa4246..7605ce8351 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -170,6 +171,93 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) RAFT_CUDA_TRY(cudaGetLastError()); } +template +struct CsrReductionPolicy { + static constexpr int LogicalWarpSize = warpSize; + static constexpr int RowsPerBlock = rpb; + static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; +}; + +template +__global__ void __launch_bounds__(Policy::ThreadsPerBlock) + csrReductionKernel(Type* dots, + const IdxType* ia, + const Type* data, + IdxType N, + Type init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op) +{ + IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i >= N) return; + + Type acc = init; + for (IdxType j = ia[i] + threadIdx.x; j < ia[i + 1]; j += Policy::LogicalWarpSize) { + acc = reduce_op(acc, main_op(data[j])); + } + acc = raft::logicalWarpReduce(acc, reduce_op); + if (threadIdx.x == 0) { dots[i] = final_op(acc); } +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void csrReduction(Type* dots, + const IdxType* ia, + const Type* data, + IdxType N, + Type init, + cudaStream_t stream, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + common::nvtx::range fun_scope( + "csrReduction<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); + dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); + dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); + csrReductionKernel + <<>>(dots, ia, data, N, init, main_op, reduce_op, final_op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void rowNormCsrCaller(Type* dots, + const IdxType* ia, + const Type* data, + IdxType nnz, + IdxType N, + raft::linalg::NormType type, + cudaStream_t stream, + Lambda fin_op) +{ + // TODO: dispatch nnz to Policy? 
+ switch (type) { + case raft::linalg::NormType::L1Norm: + csrReduction>( + dots, ia, data, N, (Type)0, stream, raft::L1Op(), raft::Sum(), fin_op); + break; + case raft::linalg::NormType::L2Norm: + csrReduction>( + dots, ia, data, N, (Type)0, stream, raft::L2Op(), raft::Sum(), fin_op); + break; + case raft::linalg::NormType::LinfNorm: + csrReduction>( + dots, ia, data, N, (Type)0, stream, raft::L1Op(), raft::Max(), fin_op); + break; + default: THROW("Unsupported norm type: %d", type); + }; +} + }; // end NAMESPACE detail }; // end NAMESPACE linalg }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index e13fd22843..07b11d51f7 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -18,6 +18,7 @@ #pragma once +#include #include namespace raft { @@ -66,6 +67,37 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) detail::csr_row_normalize_max(ia, vals, nnz, m, result, stream); } +/** + * @brief Compute row-wise norm of the input matrix and perform fin_op lambda + * + * Row-wise norm is useful while computing pairwise distance matrix, for + * example. + * This is used in many clustering algos like knn, kmeans, dbscan, etc... + * + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of row-wise dot products + * @param ia the input matrix row pointers + * @param data the input matrix nnz data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void rowNormCsr(Type* dots, + const IdxType* ia, + const Type* data, + IdxType nnz, + IdxType N, + raft::linalg::NormType type, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ + detail::rowNormCsrCaller(dots, ia, data, nnz, N, type, stream, fin_op); +} + }; // end NAMESPACE linalg }; // end NAMESPACE sparse }; // end NAMESPACE raft From a99c129420fee926096ce7bd19f253015640d241 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 3 Feb 2023 04:37:58 -0800 Subject: [PATCH 03/20] fix RBF for dense with offset --- .../distance/detail/kernels/gram_matrix.cuh | 25 ++++++--- .../detail/kernels/kernel_factory.cuh | 2 +- .../detail/kernels/kernel_matrices.cuh | 51 ++++++++++++++----- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 25c5992bc1..7f93232a18 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -61,6 +61,9 @@ class GramMatrixBase { * @param ld1 leading dimension of x1 * @param ld2 leading dimension of x2 * @param ld_out leading dimension of out + * @param norm optional L2 row norm of x1 for expanded computation within RBF. + * @param offset_x1 offset where x1 starts within norm + * @param idx_x2 indirect access to x2 row id within norm */ virtual void operator()(const math_t* x1, int n1, @@ -70,14 +73,18 @@ class GramMatrixBase { math_t* out, bool is_row_major, cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) + int ld1 = 0, + int ld2 = 0, + int ld_out = 0, + math_t* norm = nullptr, + int offset_x1 = 0, + int* idx_x2 = nullptr) { if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } if (ld2 <= 0) { ld2 = is_row_major ? 
n_cols : n2; } if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + evaluate( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out, norm, offset_x1, idx_x2); } virtual void operator()(const raft::handle_t& handle, @@ -96,7 +103,7 @@ class GramMatrixBase { int ld_out = 0, math_t* norm = nullptr, int offset_x1 = 0, - int* idx_x2 = 0) + int* idx_x2 = nullptr) { if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } @@ -170,6 +177,9 @@ class GramMatrixBase { * @param ld1 leading dimension of x1 (usually it is n1) * @param ld2 leading dimension of x2 (usually it is n2) * @param ld_out leading dimension of out (usually it is n1) + * @param norm optional L2 row norm of x1 for expanded computation within RBF. + * @param offset_x1 offset where x1 starts within norm + * @param idx_x2 indirect access to x2 row id within norm */ virtual void evaluate(const math_t* x1, int n1, @@ -181,7 +191,10 @@ class GramMatrixBase { cudaStream_t stream, int ld1, int ld2, - int ld_out) + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) { linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index 1aa6809bcd..68e9d72418 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -38,7 +38,7 @@ class KernelFactory { res = new PolynomialKernel(params.degree, gamma, coef0, cublas_handle); break; case TANH: res = new TanhKernel(gamma, coef0, cublas_handle); break; - case RBF: res = new RBFKernel(gamma); break; + case RBF: res = new RBFKernel(gamma, cublas_handle); break; default: throw raft::exception("Kernel not implemented"); } return res; diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index db9e16233b..5b2a524f46 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -223,6 +223,9 @@ class PolynomialKernel : public GramMatrixBase { * @param ld1 leading dimension of x1 * @param ld2 leading dimension of x2 * @param ld_out leading dimension of out + * @param norm optional L2 row norm of x1 for expanded computation within RBF. + * @param offset_x1 offset where x1 starts within norm + * @param idx_x2 indirect access to x2 row id within norm */ void evaluate(const math_t* x1, int n1, @@ -234,7 +237,10 @@ class PolynomialKernel : public GramMatrixBase { cudaStream_t stream, int ld1, int ld2, - int ld_out) + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) { GramMatrixBase::linear( x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); @@ -374,6 +380,9 @@ class TanhKernel : public GramMatrixBase { * @param ld1 leading dimension of x1 (usually it is n1) * @param ld2 leading dimension of x2 (usually it is n2) * @param ld_out leading dimension of out (usually it is n1) + * @param norm optional L2 row norm of x1 for expanded computation within RBF. 
+ * @param offset_x1 offset where x1 starts within norm + * @param idx_x2 indirect access to x2 row id within norm */ void evaluate(const math_t* x1, int n1, @@ -385,7 +394,10 @@ class TanhKernel : public GramMatrixBase { cudaStream_t stream, int ld1, int ld2, - int ld_out) + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) { GramMatrixBase::linear( x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); @@ -514,7 +526,10 @@ class RBFKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain */ - RBFKernel(math_t gain) : GramMatrixBase(NULL), gain(gain) {} + RBFKernel(math_t gain, cublasHandle_t cublas_handle) + : GramMatrixBase(cublas_handle), gain(gain) + { + } /** Evaluate kernel matrix using RBF kernel. * @@ -534,6 +549,9 @@ class RBFKernel : public GramMatrixBase { * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported * @param ld_out leading dimension of out, only ld_out == n1 is supported + * @param norm optional L2 row norm of x1 for expanded computation within RBF. + * @param offset_x1 offset where x1 starts within norm + * @param idx_x2 indirect access to x2 row id within norm */ void evaluate(const math_t* x1, int n1, @@ -545,15 +563,25 @@ class RBFKernel : public GramMatrixBase { cudaStream_t stream, int ld1, int ld2, - int ld_out) + int ld_out, + math_t* norm, + int offset_x1, + int* idx_x2) { - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; int minor_out = is_row_major ? n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + if (norm != nullptr) { + // compute L2expanded + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyExpandedRbfKernel(out, ld_out, n1, n2, norm, offset_x1, idx_x2, is_row_major, stream); + } else { + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } } void evaluateSparseX1(const raft::handle_t& handle, @@ -574,11 +602,6 @@ class RBFKernel : public GramMatrixBase { int offset_x1, int* idx_x2) { - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? 
n2 : n1; - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - ASSERT(norm != nullptr, "RBF Kernel needs pre-computed norm for expanded distance compute"); // compute L2 expanded GramMatrixBase::linearSparseX1(handle, From 60017db6b6ea8d7ac7deaa6427a30a9d4ebb8c30 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 21 Feb 2023 14:05:01 +0000 Subject: [PATCH 04/20] add matrix wrapper to unify kernel API --- .../distance/detail/kernels/gram_matrix.cuh | 515 +++++------------- .../detail/kernels/kernel_factory.cuh | 10 +- .../detail/kernels/kernel_matrices.cuh | 513 ++++------------- .../raft/distance/detail/matrix/matrix.hpp | 99 ++++ 4 files changed, 348 insertions(+), 789 deletions(-) create mode 100644 cpp/include/raft/distance/detail/matrix/matrix.hpp diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 7f93232a18..409e06b8e6 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -40,232 +41,55 @@ namespace raft::distance::kernels::detail { */ template class GramMatrixBase { - cublasHandle_t cublas_handle; + const raft::handle_t& handle; public: - GramMatrixBase(cublasHandle_t cublas_handle) : cublas_handle(cublas_handle){}; + GramMatrixBase(const raft::handle_t& handle) : handle(handle){}; virtual ~GramMatrixBase(){}; /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] + * @param [out] out (dense) device matrix to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - * @param norm optional L2 row norm of x1 for expanded computation within RBF. - * @param offset_x1 offset where x1 starts within norm - * @param idx_x2 indirect access to x2 row id within norm + * @param dot_x1 optional dot product of x1 for expanded computation within RBF. + * @param dot_x2 optional dot product of x2 for expanded computation within RBF. */ - virtual void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0, - math_t* norm = nullptr, - int offset_x1 = 0, - int* idx_x2 = nullptr) - { - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? 
n2 : n1; } - evaluate( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out, norm, offset_x1, idx_x2); - } - - virtual void operator()(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const math_t* x2_data, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld2 = 0, - int ld_out = 0, - math_t* norm = nullptr, - int offset_x1 = 0, - int* idx_x2 = nullptr) - - { - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluateSparseX1(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_data, - n2, - out, - is_row_major, - stream, - ld2, - ld_out, - norm, - offset_x1, - idx_x2); - } - - virtual void operator()(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, + virtual void operator()(const raft::distance::matrix::detail::Matrix& x1, + const raft::distance::matrix::detail::Matrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - int ld_out = 0) + math_t* dot_x1 = nullptr, + math_t* dot_x2 = nullptr) { - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluateSparse(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_indptr, - x2_indices, - x2_data, - x2_nnz, - n2, - out, - is_row_major, - stream, - ld_out); + ASSERT(x1.n_rows == out.n_rows, + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.n_rows == out.n_cols, + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x1.n_cols == x2.n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + evaluate(x1, x2, out, stream, dot_x1, dot_x2); } /** Evaluate the Gram matrix for two vector sets using simple dot product. * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - * @param norm optional L2 row norm of x1 for expanded computation within RBF. - * @param offset_x1 offset where x1 starts within norm - * @param idx_x2 indirect access to x2 row id within norm + * @param dot_x1 optional dot product of x1 for expanded computation within RBF. + * @param dot_x2 optional dot product of x2 for expanded computation within RBF. 
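   * (Note: this base-class implementation forwards to linear() and ignores dot_x1/dot_x2; they are
   *  only used by the RBFKernel override.)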
*/ - virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, + virtual void evaluate(const raft::distance::matrix::detail::Matrix& x1, + const raft::distance::matrix::detail::Matrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - int ld1, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - virtual void evaluateSparseX1(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const math_t* x2_data, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) - { - linearSparseX1(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_data, - n2, - out, - is_row_major, - stream, - ld2, - ld_out); - } - - virtual void evaluateSparse(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld_out) + math_t* dot_x1, + math_t* dot_x2) { - linearSparse(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_indptr, - x2_indices, - x2_data, - x2_nnz, - n2, - out, - is_row_major, - stream, - ld_out); + linear(x1, x2, out, stream); } // private: @@ -279,106 +103,89 @@ class GramMatrixBase { * * Can be used as a building block for more complex kernel functions. * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out */ - void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + void linear(const raft::distance::matrix::detail::DenseMatrix& x1, + const raft::distance::matrix::detail::DenseMatrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, + cudaStream_t stream) { + ASSERT(x1.is_row_major == x2.is_row_major, + "GramMatrix leading dimensions for x1 and x2 do not match"); + ASSERT(x2.is_row_major == out.is_row_major, + "GramMatrix leading dimensions for x2 and out do not match"); + math_t alpha = 1.0; math_t beta = 0.0; - if (is_row_major) { + if (out.is_row_major) { // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, - n2, - n1, - n_cols, + out.n_cols, + out.n_rows, + x1.n_cols, &alpha, - x2, - ld2, - x1, - ld1, + x2.data, + x2.ld, + x1.data, + x1.ld, &beta, - out, - ld_out, + out.data, + out.ld, stream)); } else { // #TODO: 
Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, - n1, - n2, - n_cols, + out.n_rows, + out.n_cols, + x1.n_cols, &alpha, - x1, - ld1, - x2, - ld2, + x1.data, + x1.ld, + x2.data, + x2.ld, &beta, - out, - ld_out, + out.data, + out.ld, stream)); } } - void linearSparseX1(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const math_t* x2_data, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld2, - int ld_out) + void linear(const raft::distance::matrix::detail::CsrMatrix& x1, + const raft::distance::matrix::detail::DenseMatrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, + cudaStream_t stream) { math_t alpha = 1.0; math_t beta = 0.0; + ASSERT(x2.is_row_major == out.is_row_major, + "GramMatrix leading dimensions for x2 and out do not match"); + cusparseSpMatDescr_t descrX1; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr(&descrX1, - n1, - n_cols, - x1_nnz, - const_cast(x1_indptr), - const_cast(x1_indices), - const_cast(x1_data))); + x1.n_rows, + x1.n_cols, + x1.nnz, + const_cast(x1.indptr), + const_cast(x1.indices), + const_cast(x1.data))); - auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; + auto order = out.is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; cusparseDnMatDescr_t descrX2; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( - &descrX2, n2, n_cols, ld2, const_cast(x2_data), order)); + &descrX2, x2.n_rows, x2.n_cols, x2.ld, const_cast(x2.data), order)); cusparseDnMatDescr_t descrOut; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( - &descrOut, n1, n2, ld_out, const_cast(out), order)); + &descrOut, out.n_rows, out.n_cols, out.ld, const_cast(out.data), order)); auto alg = order == CUSPARSE_ORDER_COL ? CUSPARSE_SPMM_CSR_ALG1 : CUSPARSE_SPMM_CSR_ALG2; @@ -421,130 +228,80 @@ class GramMatrixBase { RAFT_CUDA_TRY(cudaPeekAtLastError()); } - void linearSparse(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld_out) - { - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld_out == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); - distanceSparse(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_indptr, - x2_indices, - x2_data, - x2_nnz, - n2, - out, - is_row_major, - stream, - raft::distance::DistanceType::InnerProduct); - } - - void distanceSparse(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - raft::distance::DistanceType metric, - float metricArg = 0.0) + void linear(const raft::distance::matrix::detail::CsrMatrix& x1, + const raft::distance::matrix::detail::CsrMatrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, + cudaStream_t stream) { + int minor_out = out.is_row_major ? 
out.n_cols : out.n_rows; + ASSERT(out.ld == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); raft::sparse::distance::distances_config_t dist_config(handle); // switch a,b based on is_row_major - if (!is_row_major) { - dist_config.a_nrows = n2; - dist_config.a_ncols = n_cols; - dist_config.a_nnz = x2_nnz; - dist_config.a_indptr = const_cast(x2_indptr); - dist_config.a_indices = const_cast(x2_indices); - dist_config.a_data = const_cast(x2_data); - dist_config.b_nrows = n1; - dist_config.b_ncols = n_cols; - dist_config.b_nnz = x1_nnz; - dist_config.b_indptr = const_cast(x1_indptr); - dist_config.b_indices = const_cast(x1_indices); - dist_config.b_data = const_cast(x1_data); + if (!out.is_row_major) { + dist_config.a_nrows = x2.n_rows; + dist_config.a_ncols = x2.n_cols; + dist_config.a_nnz = x2.nnz; + dist_config.a_indptr = const_cast(x2.indptr); + dist_config.a_indices = const_cast(x2.indices); + dist_config.a_data = const_cast(x2.data); + dist_config.b_nrows = x1.n_rows; + dist_config.b_ncols = x1.n_cols; + dist_config.b_nnz = x1.nnz; + dist_config.b_indptr = const_cast(x1.indptr); + dist_config.b_indices = const_cast(x1.indices); + dist_config.b_data = const_cast(x1.data); } else { - dist_config.a_nrows = n1; - dist_config.a_ncols = n_cols; - dist_config.a_nnz = x1_nnz; - dist_config.a_indptr = const_cast(x1_indptr); - dist_config.a_indices = const_cast(x1_indices); - dist_config.a_data = const_cast(x1_data); - dist_config.b_nrows = n2; - dist_config.b_ncols = n_cols; - dist_config.b_nnz = x2_nnz; - dist_config.b_indptr = const_cast(x2_indptr); - dist_config.b_indices = const_cast(x2_indices); - dist_config.b_data = const_cast(x2_data); + dist_config.a_nrows = x1.n_rows; + dist_config.a_ncols = x1.n_cols; + dist_config.a_nnz = x1.nnz; + dist_config.a_indptr = const_cast(x1.indptr); + dist_config.a_indices = const_cast(x1.indices); + dist_config.a_data = const_cast(x1.data); + dist_config.b_nrows = x2.n_rows; + dist_config.b_ncols = x2.n_cols; + dist_config.b_nnz = x2.nnz; + dist_config.b_indptr = const_cast(x2.indptr); + dist_config.b_indices = const_cast(x2.indices); + dist_config.b_data = const_cast(x2.data); } - if (raft::sparse::distance::supportedDistance.find(metric) == - raft::sparse::distance::supportedDistance.end()) - THROW("DistanceType not supported: %d", metric); - - raft::sparse::distance::pairwiseDistance(out, dist_config, metric, metricArg); + raft::sparse::distance::pairwiseDistance( + out.data, dist_config, raft::distance::DistanceType::InnerProduct, 0.0); } - /** Calculates the Gram matrix using Euclidean distance. + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 * * Can be used as a building block for more complex kernel functions. 
* - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out */ - virtual void distance(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + void linear(const raft::distance::matrix::detail::Matrix& x1, + const raft::distance::matrix::detail::Matrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, + cudaStream_t stream) { - raft::distance::distance( - x1, x2, out, n1, n2, n_cols, stream, is_row_major); + // dispatch + if (x1.isDense()) { + ASSERT(x2.isDense(), "GramMatrix input matrix does not allow Dense*Csr"); + auto x1_dense = x1.asDense(); + auto x2_dense = x2.asDense(); + linear(*x1_dense, *x2_dense, out, stream); + } else { + auto x1_csr = x1.asCsr(); + if (x2.isDense()) { + auto x2_dense = x2.asDense(); + linear(*x1_csr, *x2_dense, out, stream); + } else { + auto x2_csr = x2.asCsr(); + linear(*x1_csr, *x2_csr, out, stream); + } + } } }; }; // end namespace raft::distance::kernels::detail \ No newline at end of file diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index 68e9d72418..460c039073 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -26,19 +26,19 @@ namespace raft::distance::kernels::detail { template class KernelFactory { public: - static GramMatrixBase* create(KernelParams params, cublasHandle_t cublas_handle) + static GramMatrixBase* create(KernelParams params, const raft::handle_t& handle) { GramMatrixBase* res; // KernelParams is not templated, we convert the parameters to math_t here: math_t coef0 = params.coef0; math_t gamma = params.gamma; switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(cublas_handle); break; + case LINEAR: res = new GramMatrixBase(handle); break; case POLYNOMIAL: - res = new PolynomialKernel(params.degree, gamma, coef0, cublas_handle); + res = new PolynomialKernel(params.degree, gamma, coef0, handle); break; - case TANH: res = new TanhKernel(gamma, coef0, cublas_handle); break; - case RBF: res = new RBFKernel(gamma, cublas_handle); break; + case TANH: res = new TanhKernel(gamma, coef0, handle); break; + case RBF: res = new RBFKernel(gamma, handle); break; default: throw raft::exception("Kernel not implemented"); } return res; diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 5b2a524f46..d65fc28cb7 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -21,6 +21,7 @@ #include #include +#include namespace raft::distance::kernels::detail { @@ -100,62 +101,27 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga } 
} -/** Epiloge function for rbf kernel without padding. - * Calculates output = exp(-gain * input); - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - */ -template -__global__ void rbf_kernel_nopad(math_t* inout, size_t len, math_t gain) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = exp(-1.0 * gain * inout[tid]); - } -} - -/** Epiloge function for rbf kernel without padding. - * Calculates output = exp(-gain * input); - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - */ -template -__global__ void rbf_kernel(math_t* inout, int ld, int rows, int cols, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = exp(-1.0 * gain * inout[tidx + tidy * ld]); - } -} - /** Epiloge function for rbf kernel using expansion. * Calculates output_ij = exp(-gain * (norm_i + norm_j - 2*input_ij)); * @param inout device vector in column major format, size [ld * cols] * @param ld leading dimension of the inout buffer * @param rows number of rows (rows <= ld) * @param cols number of columns - * @param norm norm for row indices - * @param offset_i offset into norm for rows (assumed to be coalesced) - * @param idx_j indirect column id to access norm + * @param dot_rows dot product for row indices + * @param dot_cols dot product for column indices * @param gain */ template __global__ void rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm, int offset_i, int* idx_j, math_t gain) + math_t* inout, int ld, int rows, int cols, math_t* dot_rows, math_t* dot_cols, math_t gain) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; tidy += blockDim.y * gridDim.y) { - math_t norm_y = norm[idx_j[tidy]]; + math_t norm_y = dot_cols[tidy]; for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; tidx += blockDim.x * gridDim.x) { inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm[tidx + offset_i] + norm_y - inout[tidx + tidy * ld] * 2)); + exp(-1.0 * gain * (dot_rows[tidx] + norm_y - inout[tidx + tidy * ld] * 2)); } } } @@ -198,10 +164,10 @@ class PolynomialKernel : public GramMatrixBase { * @param exponent * @param gain * @param offset - * @param cublas_handle + * @param handle */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t cublas_handle) - : GramMatrixBase(cublas_handle), exponent(exponent), gain(gain), offset(offset) + PolynomialKernel(exp_t exponent, math_t gain, math_t offset, const raft::handle_t& handle) + : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) { } @@ -211,111 +177,22 @@ class PolynomialKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. 
* - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of features in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*cols] - * @param [in] n2 number vectors in x2 + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - * @param norm optional L2 row norm of x1 for expanded computation within RBF. - * @param offset_x1 offset where x1 starts within norm - * @param idx_x2 indirect access to x2 row id within norm + * @param dot_x1 optional dot product of x1 for expanded computation within RBF. + * @param dot_x2 optional dot product of x2 for expanded computation within RBF. */ - void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, + void evaluate(const raft::distance::matrix::detail::Matrix& x1, + const raft::distance::matrix::detail::Matrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - int ld1, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) - { - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } - - void evaluateSparseX1(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const math_t* x2_data, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) + math_t* dot_x1, + math_t* dot_x2) { - GramMatrixBase::linearSparseX1(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_data, - n2, - out, - is_row_major, - stream, - ld2, - ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } - - void evaluateSparse(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld_out) - { - GramMatrixBase::linearSparse(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_indptr, - x2_indices, - x2_data, - x2_nnz, - n2, - out, - is_row_major, - stream, - ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); + GramMatrixBase::linear(x1, x2, out, stream); + applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, stream); } }; @@ -355,8 +232,8 @@ class TanhKernel : public GramMatrixBase { * @param offset * @param cublas_handle */ - TanhKernel(math_t gain, math_t offset, cublasHandle_t cublas_handle) - : GramMatrixBase(cublas_handle), gain(gain), offset(offset) + TanhKernel(math_t gain, math_t offset, const raft::handle_t& handle) + : GramMatrixBase(handle), gain(gain), offset(offset) { } @@ -366,113 +243,22 @@ class TanhKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. 
* - * @param [in] x1 device array of vectors, - * size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of features in x1 and x2 - * @param [in] x2 device array of vectors, - * size [n2*n_cols] - * @param [in] n2 number vectors in x2 + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - * @param norm optional L2 row norm of x1 for expanded computation within RBF. - * @param offset_x1 offset where x1 starts within norm - * @param idx_x2 indirect access to x2 row id within norm + * @param dot_x1 optional dot product of x1 for expanded computation within RBF. + * @param dot_x2 optional dot product of x2 for expanded computation within RBF. */ - void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, + void evaluate(const raft::distance::matrix::detail::Matrix& x1, + const raft::distance::matrix::detail::Matrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - int ld1, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) - { - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } - - void evaluateSparseX1(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const math_t* x2_data, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) - { - GramMatrixBase::linearSparseX1(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_data, - n2, - out, - is_row_major, - stream, - ld2, - ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } - - void evaluateSparse(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld_out) + math_t* dot_x1, + math_t* dot_x2) { - GramMatrixBase::linearSparse(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_indptr, - x2_indices, - x2_data, - x2_nnz, - n2, - out, - is_row_major, - stream, - ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); + GramMatrixBase::linear(x1, x2, out, stream); + applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, stream); } }; @@ -483,38 +269,23 @@ template class RBFKernel : public GramMatrixBase { math_t gain; - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - rbf_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? 
rows : cols; - rbf_kernel<<>>(inout, ld, n1, n2, gain); - } - } - void applyExpandedRbfKernel(math_t* inout, int ld, int rows, int cols, - math_t* norm, - int offset_i, - int* idx_j, + math_t* dot_x1, + math_t* dot_x2, bool is_row_major, cudaStream_t stream) { - ASSERT(!is_row_major, "Expanded RBF kernel currently only supports col major format"); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* dot_n1 = is_row_major ? dot_x2 : dot_x1; + math_t* dot_n2 = is_row_major ? dot_x1 : dot_x2; rbf_kernel_expanded<<>>(inout, ld, rows, cols, norm, offset_i, idx_j, gain); + stream>>>(inout, ld, n1, n2, dot_n1, dot_n2, gain); } public: @@ -526,9 +297,24 @@ class RBFKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain */ - RBFKernel(math_t gain, cublasHandle_t cublas_handle) - : GramMatrixBase(cublas_handle), gain(gain) + RBFKernel(math_t gain, const raft::handle_t& handle) : GramMatrixBase(handle), gain(gain) + { + } + + void matrixDot(const raft::distance::matrix::detail::Matrix& matrix, + math_t* target, + cudaStream_t stream) { + auto norm = raft::linalg::NormType::L2Norm; + if (matrix.isDense()) { + auto dense_matrix = matrix.asDense(); + raft::linalg::rowNorm( + target, dense_matrix->data, matrix.n_cols, matrix.n_rows, norm, false, stream); + } else { + auto csr_matrix = matrix.asCsr(); + raft::sparse::linalg::rowNormCsr( + target, csr_matrix->indptr, csr_matrix->data, csr_matrix->nnz, matrix.n_rows, norm, stream); + } } /** Evaluate kernel matrix using RBF kernel. @@ -537,144 +323,61 @@ class RBFKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and | | euclidean distance. * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of features in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 + * @param [in] x1 device matrix, size [n1*n_cols] + * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported - * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported - * @param ld_out leading dimension of out, only ld_out == n1 is supported - * @param norm optional L2 row norm of x1 for expanded computation within RBF. - * @param offset_x1 offset where x1 starts within norm - * @param idx_x2 indirect access to x2 row id within norm + * @param dot_x1 optional dot product of x1 for expanded computation within RBF. + * @param dot_x2 optional dot product of x2 for expanded computation within RBF. */ - void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, + void evaluate(const raft::distance::matrix::detail::Matrix& x1, + const raft::distance::matrix::detail::Matrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - int ld1, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) + math_t* dot_x1, + math_t* dot_x2) { - int minor_out = is_row_major ? 
n2 : n1; - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - if (norm != nullptr) { - // compute L2expanded - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyExpandedRbfKernel(out, ld_out, n1, n2, norm, offset_x1, idx_x2, is_row_major, stream); + if (x1.isDense() && x2.isDense() && (dot_x1 == nullptr || dot_x2 == nullptr)) { + auto x1_dense = x1.asDense(); + auto x2_dense = x2.asDense(); + distance_rbf(*x1_dense, *x2_dense, out, stream); } else { - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + rmm::device_uvector tmp_dot_x1(0, stream); + rmm::device_uvector tmp_dot_x2(0, stream); + if (dot_x1 == nullptr) { + tmp_dot_x1.reserve(x1.n_rows, stream); + dot_x1 = tmp_dot_x1.data(); + matrixDot(x1, dot_x1, stream); + } + if (dot_x2 == nullptr) { + tmp_dot_x2.reserve(x2.n_rows, stream); + dot_x2 = tmp_dot_x2.data(); + matrixDot(x2, dot_x2, stream); + } + // compute L2expanded + GramMatrixBase::linear(x1, x2, out, stream); + applyExpandedRbfKernel( + out.data, out.ld, out.n_rows, out.n_cols, dot_x1, dot_x2, out.is_row_major, stream); } } - void evaluateSparseX1(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const math_t* x2_data, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld2, - int ld_out, - math_t* norm, - int offset_x1, - int* idx_x2) - { - ASSERT(norm != nullptr, "RBF Kernel needs pre-computed norm for expanded distance compute"); - // compute L2 expanded - GramMatrixBase::linearSparseX1(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_data, - n2, - out, - is_row_major, - stream, - ld2, - ld_out); - - applyExpandedRbfKernel(out, ld_out, n1, n2, norm, offset_x1, idx_x2, is_row_major, stream); - } - - void evaluateSparse(const raft::handle_t& handle, - const int* x1_indptr, - const int* x1_indices, - const math_t* x1_data, - int x1_nnz, - int n1, - int n_cols, - const int* x2_indptr, - const int* x2_indices, - const math_t* x2_data, - int x2_nnz, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld_out) - { - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - GramMatrixBase::distanceSparse(handle, - x1_indptr, - x1_indices, - x1_data, - x1_nnz, - n1, - n_cols, - x2_indptr, - x2_indices, - x2_data, - x2_nnz, - n2, - out, - is_row_major, - stream, - raft::distance::DistanceType::L2Unexpanded); - - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } - /** Customize distance function withe RBF epilogue */ - void distance(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + void distance_rbf(const raft::distance::matrix::detail::DenseMatrix& x1, + const raft::distance::matrix::detail::DenseMatrix& x2, + raft::distance::matrix::detail::DenseMatrix& out, + cudaStream_t stream) { + int minor1 = x1.is_row_major ? x1.n_cols : x1.n_rows; + int minor2 = x2.is_row_major ? x2.n_cols : x2.n_rows; + int minor_out = out.is_row_major ? 
out.n_cols : out.n_rows; + ASSERT(x1.ld == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(x2.ld == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(out.ld == minor_out, "RBF Kernel distance does not support ld_out parameter"); + ASSERT(x1.is_row_major == x2.is_row_major, + "GramMatrix leading dimensions for x1 and x2 do not match"); + ASSERT(x2.is_row_major == out.is_row_major, + "GramMatrix leading dimensions for x2 and out do not match"); + math_t gain = this->gain; using index_t = int64_t; @@ -684,17 +387,17 @@ class RBFKernel : public GramMatrixBase { math_t, math_t, decltype(fin_op), - index_t>(const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, + index_t>(const_cast(x1.data), + const_cast(x2.data), + out.data, + out.n_rows, + out.n_cols, + x1.n_cols, NULL, 0, fin_op, stream, - is_row_major); + out.is_row_major); } }; diff --git a/cpp/include/raft/distance/detail/matrix/matrix.hpp b/cpp/include/raft/distance/detail/matrix/matrix.hpp new file mode 100644 index 0000000000..d4a0dda691 --- /dev/null +++ b/cpp/include/raft/distance/detail/matrix/matrix.hpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::matrix::detail { + +template +class DenseMatrix; +template +class CsrMatrix; + +/* + * Thin matrix wrapper to allow single API for different matrix representations + */ +template +class Matrix { + public: + Matrix(int rows, int cols) : n_rows(rows), n_cols(cols){}; + virtual bool isDense() const = 0; + virtual ~Matrix(){}; + + DenseMatrix* asDense() + { + DenseMatrix* cast = dynamic_cast*>(this); + ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); + return cast; + }; + + CsrMatrix* asCsr() + { + CsrMatrix* cast = dynamic_cast*>(this); + ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); + return cast; + }; + + const DenseMatrix* asDense() const + { + const DenseMatrix* cast = dynamic_cast*>(this); + ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); + return cast; + }; + + const CsrMatrix* asCsr() const + { + const CsrMatrix* cast = dynamic_cast*>(this); + ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); + return cast; + }; + + int n_rows; + int n_cols; +}; + +template +class DenseMatrix : public Matrix { + public: + DenseMatrix(math_t* data, int rows, int cols, bool row_major = false, int ld_in = 0) + : Matrix(rows, cols), data(data), is_row_major(row_major), ld(ld_in) + { + if (ld <= 0) ld = is_row_major ? 
cols : rows; + } + bool isDense() const { return true; } + math_t* data; + bool is_row_major; + int ld; +}; + +template +class CsrMatrix : public Matrix { + public: + CsrMatrix(int* indptr, int* indices, math_t* data, int nnz, int rows, int cols) + : Matrix(rows, cols), indptr(indptr), indices(indices), data(data), nnz(nnz) + { + } + bool isDense() const { return false; } + + int nnz; + int* indptr; + int* indices; + math_t* data; +}; + +} // namespace raft::distance::matrix::detail \ No newline at end of file From 9f46742e285925c7c4c629ef77924167230fe1f4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 21 Feb 2023 18:23:01 +0000 Subject: [PATCH 05/20] finalize merge, adjust/add tests --- .../distance/detail/kernels/gram_matrix.cuh | 5 +- .../detail/kernels/kernel_factory.cuh | 2 +- .../detail/kernels/kernel_matrices.cuh | 7 +- .../raft/sparse/linalg/detail/norm.cuh | 20 +-- cpp/include/raft/sparse/linalg/norm.cuh | 4 +- cpp/test/CMakeLists.txt | 1 + cpp/test/distance/gram.cu | 29 ++-- cpp/test/sparse/norm.cu | 123 ++++++++--------- cpp/test/sparse/normalize.cu | 127 ++++++++++++++++++ 9 files changed, 222 insertions(+), 96 deletions(-) create mode 100644 cpp/test/sparse/normalize.cu diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index a5d756d351..1a2b4d67f8 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -41,10 +42,10 @@ namespace raft::distance::kernels::detail { */ template class GramMatrixBase { - const raft::handle_t& handle; + const raft::device_resources& handle; public: - GramMatrixBase(const raft::handle_t& handle) : handle(handle){}; + GramMatrixBase(const raft::device_resources& handle) : handle(handle){}; virtual ~GramMatrixBase(){}; diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index 460c039073..ad4a81c55a 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -26,7 +26,7 @@ namespace raft::distance::kernels::detail { template class KernelFactory { public: - static GramMatrixBase* create(KernelParams params, const raft::handle_t& handle) + static GramMatrixBase* create(KernelParams params, const raft::device_resources& handle) { GramMatrixBase* res; // KernelParams is not templated, we convert the parameters to math_t here: diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index dc96f8ec01..baaa7f5bbe 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -166,7 +166,7 @@ class PolynomialKernel : public GramMatrixBase { * @param offset * @param handle */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset, const raft::handle_t& handle) + PolynomialKernel(exp_t exponent, math_t gain, math_t offset, const raft::device_resources& handle) : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) { } @@ -232,7 +232,7 @@ class TanhKernel : public GramMatrixBase { * @param offset * @param cublas_handle */ - TanhKernel(math_t gain, math_t offset, const raft::handle_t& handle) + TanhKernel(math_t gain, math_t offset, const raft::device_resources& handle) : 
GramMatrixBase(handle), gain(gain), offset(offset) { } @@ -297,7 +297,8 @@ class RBFKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain */ - RBFKernel(math_t gain, const raft::handle_t& handle) : GramMatrixBase(handle), gain(gain) + RBFKernel(math_t gain, const raft::device_resources& handle) + : GramMatrixBase(handle), gain(gain) { } diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index 7605ce8351..7dbea8c76c 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -17,6 +17,8 @@ #pragma once #include +#include +#include #include #include #include @@ -208,18 +210,18 @@ __global__ void __launch_bounds__(Policy::ThreadsPerBlock) template , - typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> + typename MainLambda = raft::identity_op, + typename ReduceLambda = raft::add_op, + typename FinalLambda = raft::identity_op> void csrReduction(Type* dots, const IdxType* ia, const Type* data, IdxType N, Type init, cudaStream_t stream, - MainLambda main_op = raft::Nop(), - ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) { common::nvtx::range fun_scope( "csrReduction<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); @@ -244,15 +246,15 @@ void rowNormCsrCaller(Type* dots, switch (type) { case raft::linalg::NormType::L1Norm: csrReduction>( - dots, ia, data, N, (Type)0, stream, raft::L1Op(), raft::Sum(), fin_op); + dots, ia, data, N, (Type)0, stream, raft::abs_op(), raft::add_op(), fin_op); break; case raft::linalg::NormType::L2Norm: csrReduction>( - dots, ia, data, N, (Type)0, stream, raft::L2Op(), raft::Sum(), fin_op); + dots, ia, data, N, (Type)0, stream, raft::sq_op(), raft::add_op(), fin_op); break; case raft::linalg::NormType::LinfNorm: csrReduction>( - dots, ia, data, N, (Type)0, stream, raft::L1Op(), raft::Max(), fin_op); + dots, ia, data, N, (Type)0, stream, raft::abs_op(), raft::max_op(), fin_op); break; default: THROW("Unsupported norm type: %d", type); }; diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index 07b11d51f7..d504e735fb 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -85,7 +85,7 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > +template void rowNormCsr(Type* dots, const IdxType* ia, const Type* data, @@ -93,7 +93,7 @@ void rowNormCsr(Type* dots, IdxType N, raft::linalg::NormType type, cudaStream_t stream, - Lambda fin_op = raft::Nop()) + Lambda fin_op = raft::identity_op()) { detail::rowNormCsrCaller(dots, ia, data, nnz, N, type, stream, fin_op); } diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 575e8cf84b..d08ef85e90 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -223,6 +223,7 @@ if(BUILD_TESTS) test/sparse/degree.cu test/sparse/filter.cu test/sparse/norm.cu + test/sparse/normalize.cu test/sparse/reduce.cu test/sparse/row_op.cu test/sparse/sort.cu diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index a2f0e2385c..c4c439e6da 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include 
#include #include @@ -31,6 +32,8 @@ namespace raft::distance::kernels { +using namespace raft::distance::matrix::detail; + // Get the offset of element [i,k]. HDI int get_offset(int i, int k, int ld, bool is_row_major) { @@ -151,20 +154,18 @@ class GramMatrixTest : public ::testing::TestWithParam { void runTest() { - std::unique_ptr> kernel = std::unique_ptr>( - KernelFactory::create(params.kernel, handle.get_cublas_handle())); - - kernel->evaluate(x1.data(), - params.n1, - params.n_cols, - x2.data(), - params.n2, - gram.data(), - params.is_row_major, - stream, - params.ld1, - params.ld2, - params.ld_out); + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel, handle)); + + DenseMatrix x1_dense( + x1.data(), params.n1, params.n_cols, params.is_row_major, params.ld1); + DenseMatrix x2_dense( + x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); + DenseMatrix gram_dense( + x1.data(), params.n1, params.n2, params.is_row_major, params.ld_out); + + (*kernel)(x1_dense, x2_dense, gram_dense, stream); + naiveKernel(); ASSERT_TRUE(raft::devArrMatchHost( gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f))); diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 91b7b09fcc..f1328fa52d 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -19,7 +19,7 @@ #include "../test_utils.cuh" #include -#include +#include #include #include @@ -29,26 +29,24 @@ namespace raft { namespace sparse { -enum NormalizeMethod { MAX, L1 }; - template -struct CSRRowNormalizeInputs { - NormalizeMethod method; - std::vector ex_scan; - std::vector in_vals; +struct CSRRowNormInputs { + raft::linalg::NormType norm; + std::vector indptr; + std::vector data; std::vector verify; }; template -class CSRRowNormalizeTest : public ::testing::TestWithParam> { +class CSRRowNormTest : public ::testing::TestWithParam> { public: - CSRRowNormalizeTest() - : params(::testing::TestWithParam>::GetParam()), + CSRRowNormTest() + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), - in_vals(params.in_vals.size(), stream), - verify(params.verify.size(), stream), - ex_scan(params.ex_scan.size(), stream), - result(params.verify.size(), stream) + data(params.data.size(), stream), + verify(params.indptr.size() - 1, stream), + indptr(params.indptr.size(), stream), + result(params.indptr.size() - 1, stream) { } @@ -57,71 +55,66 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam( - ex_scan.data(), in_vals.data(), nnz, n_rows, result.data(), stream); - break; - case L1: - linalg::csr_row_normalize_l1( - ex_scan.data(), in_vals.data(), nnz, n_rows, result.data(), stream); - break; - } + Index_ n_rows = params.indptr.size() - 1; + Index_ nnz = params.data.size(); + + raft::update_device(indptr.data(), params.indptr.data(), n_rows + 1, stream); + raft::update_device(data.data(), params.data.data(), nnz, stream); + raft::update_device(verify.data(), params.verify.data(), n_rows, stream); + + linalg::rowNormCsr(result.data(), indptr.data(), data.data(), nnz, n_rows, params.norm, stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); ASSERT_TRUE( - raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); + raft::devArrMatch(verify.data(), result.data(), n_rows, raft::Compare())); } protected: raft::device_resources handle; cudaStream_t stream; - CSRRowNormalizeInputs params; - rmm::device_uvector ex_scan; - rmm::device_uvector in_vals, result, verify; + CSRRowNormInputs params; + rmm::device_uvector indptr; + 
rmm::device_uvector data, result, verify; }; -using CSRRowNormalizeTestF = CSRRowNormalizeTest; -TEST_P(CSRRowNormalizeTestF, Result) { Run(); } - -using CSRRowNormalizeTestD = CSRRowNormalizeTest; -TEST_P(CSRRowNormalizeTestD, Result) { Run(); } - -const std::vector> csrnormalize_inputs_f = { - {MAX, - {0, 4, 8, 9}, - {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0}, - {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}}, - {L1, - {0, 4, 8, 9}, - {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0}, - {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, +using CSRRowNormTestF = CSRRowNormTest; +TEST_P(CSRRowNormTestF, Result) { Run(); } + +using CSRRowNormTestD = CSRRowNormTest; +TEST_P(CSRRowNormTestD, Result) { Run(); } + +const std::vector> csrnorm_inputs_f = { + {raft::linalg::NormType::LinfNorm, + {0, 3, 7, 10}, + {5.0, 1.0, 2.0, 0.0, 10.0, 1.0, 2.0, 1.0, 1.0, 2.0}, + {5.0, 10.0, 2.0}}, + {raft::linalg::NormType::L1Norm, + {0, 3, 7, 10}, + {5.0, 1.0, 2.0, 0.0, 10.0, 1.0, 2.0, 1.0, 1.0, 2.0}, + {8.0, 13.0, 4.0}}, + {raft::linalg::NormType::L2Norm, + {0, 3, 7, 10}, + {5.0, 1.0, 2.0, 0.0, 10.0, 1.0, 2.0, 1.0, 1.0, 2.0}, + {30.0, 105.0, 6.0}}, }; -const std::vector> csrnormalize_inputs_d = { - {MAX, - {0, 4, 8, 9}, - {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0}, - {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}}, - {L1, - {0, 4, 8, 9}, - {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0}, - {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, +const std::vector> csrnorm_inputs_d = { + {raft::linalg::NormType::LinfNorm, + {0, 3, 7, 10}, + {5.0, 1.0, 2.0, 0.0, 10.0, 1.0, 2.0, 1.0, 1.0, 2.0}, + {5.0, 10.0, 2.0}}, + {raft::linalg::NormType::L1Norm, + {0, 3, 7, 10}, + {5.0, 1.0, 2.0, 0.0, 10.0, 1.0, 2.0, 1.0, 1.0, 2.0}, + {8.0, 13.0, 4.0}}, + {raft::linalg::NormType::L2Norm, + {0, 3, 7, 10}, + {5.0, 1.0, 2.0, 0.0, 10.0, 1.0, 2.0, 1.0, 1.0, 2.0}, + {30.0, 105.0, 6.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseNormTest, - CSRRowNormalizeTestF, - ::testing::ValuesIn(csrnormalize_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseNormTest, - CSRRowNormalizeTestD, - ::testing::ValuesIn(csrnormalize_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormTestF, ::testing::ValuesIn(csrnorm_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormTestD, ::testing::ValuesIn(csrnorm_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/normalize.cu b/cpp/test/sparse/normalize.cu new file mode 100644 index 0000000000..91b7b09fcc --- /dev/null +++ b/cpp/test/sparse/normalize.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../test_utils.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace raft { +namespace sparse { + +enum NormalizeMethod { MAX, L1 }; + +template +struct CSRRowNormalizeInputs { + NormalizeMethod method; + std::vector ex_scan; + std::vector in_vals; + std::vector verify; +}; + +template +class CSRRowNormalizeTest : public ::testing::TestWithParam> { + public: + CSRRowNormalizeTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + in_vals(params.in_vals.size(), stream), + verify(params.verify.size(), stream), + ex_scan(params.ex_scan.size(), stream), + result(params.verify.size(), stream) + { + } + + protected: + void SetUp() override {} + + void Run() + { + Index_ n_rows = params.ex_scan.size(); + Index_ nnz = params.in_vals.size(); + + raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); + raft::update_device(in_vals.data(), params.in_vals.data(), nnz, stream); + raft::update_device(verify.data(), params.verify.data(), nnz, stream); + + switch (params.method) { + case MAX: + linalg::csr_row_normalize_max( + ex_scan.data(), in_vals.data(), nnz, n_rows, result.data(), stream); + break; + case L1: + linalg::csr_row_normalize_l1( + ex_scan.data(), in_vals.data(), nnz, n_rows, result.data(), stream); + break; + } + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); + } + + protected: + raft::device_resources handle; + cudaStream_t stream; + + CSRRowNormalizeInputs params; + rmm::device_uvector ex_scan; + rmm::device_uvector in_vals, result, verify; +}; + +using CSRRowNormalizeTestF = CSRRowNormalizeTest; +TEST_P(CSRRowNormalizeTestF, Result) { Run(); } + +using CSRRowNormalizeTestD = CSRRowNormalizeTest; +TEST_P(CSRRowNormalizeTestD, Result) { Run(); } + +const std::vector> csrnormalize_inputs_f = { + {MAX, + {0, 4, 8, 9}, + {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}}, + {L1, + {0, 4, 8, 9}, + {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, +}; +const std::vector> csrnormalize_inputs_d = { + {MAX, + {0, 4, 8, 9}, + {5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0}}, + {L1, + {0, 4, 8, 9}, + {1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0}, + {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, +}; + +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestF, + ::testing::ValuesIn(csrnormalize_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestD, + ::testing::ValuesIn(csrnormalize_inputs_d)); + +} // namespace sparse +} // namespace raft From c0964955753230ea30de2454f544efc61c96dffb Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 22 Feb 2023 20:31:27 +0000 Subject: [PATCH 06/20] add test and fix rbf --- .../detail/kernels/kernel_matrices.cuh | 16 +- cpp/test/CMakeLists.txt | 9 +- cpp/test/distance/gram.cu | 2 +- cpp/test/sparse/gram.cu | 342 ++++++++++++++++++ 4 files changed, 363 insertions(+), 6 deletions(-) create mode 100644 cpp/test/sparse/gram.cu diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index baaa7f5bbe..5079a87027 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -230,7 +230,7 @@ class TanhKernel : 
public GramMatrixBase { * @tparam math_t floating point type * @param gain * @param offset - * @param cublas_handle + * @param handle */ TanhKernel(math_t gain, math_t offset, const raft::device_resources& handle) : GramMatrixBase(handle), gain(gain), offset(offset) @@ -282,7 +282,7 @@ class RBFKernel : public GramMatrixBase { int n2 = is_row_major ? rows : cols; math_t* dot_n1 = is_row_major ? dot_x2 : dot_x1; math_t* dot_n2 = is_row_major ? dot_x1 : dot_x2; - rbf_kernel_expanded<<>>(inout, ld, n1, n2, dot_n1, dot_n2, gain); @@ -309,8 +309,16 @@ class RBFKernel : public GramMatrixBase { auto norm = raft::linalg::NormType::L2Norm; if (matrix.isDense()) { auto dense_matrix = matrix.asDense(); - raft::linalg::rowNorm( - target, dense_matrix->data, matrix.n_cols, matrix.n_rows, norm, false, stream); + int minor = dense_matrix->is_row_major ? matrix.n_cols : matrix.n_rows; + ASSERT(dense_matrix->ld == minor, + "RBF Kernel lazy rowNorm compute does not support ld parameter"); + raft::linalg::rowNorm(target, + dense_matrix->data, + matrix.n_cols, + matrix.n_rows, + norm, + dense_matrix->is_row_major, + stream); } else { auto csr_matrix = matrix.asCsr(); raft::sparse::linalg::rowNormCsr( diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index d08ef85e90..64d757e33a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -232,7 +232,14 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu OPTIONAL DIST + NAME + SPARSE_DIST_TEST + PATH + test/sparse/dist_coo_spmv.cu + test/sparse/distance.cu + test/sparse/gram.cu + OPTIONAL + DIST NN ) diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index c4c439e6da..7ea9cc3c7d 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -162,7 +162,7 @@ class GramMatrixTest : public ::testing::TestWithParam { DenseMatrix x2_dense( x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); DenseMatrix gram_dense( - x1.data(), params.n1, params.n2, params.is_row_major, params.ld_out); + gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); (*kernel)(x1_dense, x2_dense, gram_dense, stream); diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu new file mode 100644 index 0000000000..2cf880d23e --- /dev/null +++ b/cpp/test/sparse/gram.cu @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined RAFT_DISTANCE_COMPILED +#include +#endif + +#include "../test_utils.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::distance::kernels { + +using namespace raft::distance::matrix::detail; + +// Get the offset of element [i,k]. +HDI int get_offset(int i, int k, int ld, bool is_row_major) +{ + return is_row_major ? 
i * ld + k : i + k * ld; +} + +enum SparseType { DENSE, MIX, CSR }; + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // featuer vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + SparseType sparse_input; + KernelParams kernel; + int ld1; + int ld2; + int ld_out; + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? "RowMajor/" : "ColMajor/") + << (p.sparse_input == SparseType::DENSE + ? "DenseDense/" + : (p.sparse_input == SparseType::MIX ? "CsrDense/" : "CsrCsr/")) + << kernel_names[p.kernel.kernel] << "/ld_" << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +const std::vector inputs = { + {42, 137, 2, false, SparseType::DENSE, {KernelType::LINEAR}}, + {42, 137, 2, true, SparseType::DENSE, {KernelType::LINEAR}}, + {42, 137, 2, false, SparseType::DENSE, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, true, SparseType::DENSE, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, false, SparseType::MIX, {KernelType::LINEAR}}, + {42, 137, 2, true, SparseType::MIX, {KernelType::LINEAR}}, + {42, 137, 2, false, SparseType::MIX, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, true, SparseType::MIX, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, false, SparseType::CSR, {KernelType::LINEAR}}, + {42, 137, 2, true, SparseType::CSR, {KernelType::LINEAR}}, + // CSR does not support ld_out + {42, 137, 2, false, SparseType::CSR, {KernelType::LINEAR}, 64, 179, 0}, + {42, 137, 2, true, SparseType::CSR, {KernelType::LINEAR}, 64, 179, 0}, + {137, 42, 2, false, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, false, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, true, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, false, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, false, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, true, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, false, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + // CSR does not support ld_out + {137, 42, 2, false, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 0}, + {137, 42, 2, true, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 0}, + {42, 137, 2, false, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, false, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, + {42, 137, 2, true, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, + {42, 137, 2, false, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, false, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, + {42, 137, 2, true, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, + 
{42, 137, 2, false, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}}, + // CSR does not support ld_out + {42, 137, 2, false, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 0}, + {42, 137, 2, true, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 0}, + {3, 4, 2, false, SparseType::DENSE, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, SparseType::DENSE, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, SparseType::DENSE, {KernelType::RBF, 0, 0.5}}, + {3, 4, 2, false, SparseType::MIX, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, SparseType::MIX, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, SparseType::MIX, {KernelType::RBF, 0, 0.5}}, + {3, 4, 2, false, SparseType::CSR, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, SparseType::CSR, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, SparseType::CSR, {KernelType::RBF, 0, 0.5}}, + // Distance kernel does not support LD parameter yet. + //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49}, + //{42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143}, +}; + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + stream(0), + x1(0, stream), + x2(0, stream), + x1_csr_indptr(0, stream), + x1_csr_indices(0, stream), + x1_csr_data(0, stream), + x2_csr_indptr(0, stream), + x2_csr_indices(0, stream), + x2_csr_data(0, stream), + gram(0, stream), + gram_host(0) + { + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. + size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::Rng r(42137ULL); + r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream); + r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); + } + + ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } + + // Calculate the Gram matrix on the host. 
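  // For reference, the entries produced by naiveKernel() below are, with
  // d = <x1_i, x2_j> for the dot-product kernels and d = ||x1_i - x2_j||^2 for RBF:
  //   LINEAR:      K_ij = d
  //   POLYNOMIAL:  K_ij = (gamma * d + coef0)^degree
  //   TANH:        K_ij = tanh(gamma * d + coef0)
  //   RBF:         K_ij = exp(-gamma * d)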
+ void naiveKernel() + { + std::vector x1_host(x1.size()); + raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); + std::vector x2_host(x2.size()); + raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); + handle.sync_stream(stream); + + for (int i = 0; i < params.n1; i++) { + for (int j = 0; j < params.n2; j++) { + float d = 0; + for (int k = 0; k < params.n_cols; k++) { + if (params.kernel.kernel == KernelType::RBF) { + math_t diff = x1_host[get_offset(i, k, params.ld1, params.is_row_major)] - + x2_host[get_offset(j, k, params.ld2, params.is_row_major)]; + d += diff * diff; + } else { + d += x1_host[get_offset(i, k, params.ld1, params.is_row_major)] * + x2_host[get_offset(j, k, params.ld2, params.is_row_major)]; + } + } + int idx = get_offset(i, j, params.ld_out, params.is_row_major); + math_t v = 0; + switch (params.kernel.kernel) { + case (KernelType::LINEAR): gram_host[idx] = d; break; + case (KernelType::POLYNOMIAL): + v = params.kernel.gamma * d + params.kernel.coef0; + gram_host[idx] = std::pow(v, params.kernel.degree); + break; + case (KernelType::TANH): + gram_host[idx] = std::tanh(params.kernel.gamma * d + params.kernel.coef0); + break; + case (KernelType::RBF): gram_host[idx] = exp(-params.kernel.gamma * d); break; + } + } + } + } + + int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data) + { + int nnz = 0; + double eps = 1e-6; + int n_cols = params.n_cols; + bool is_row_major = params.is_row_major; + size_t dense_size = get_offset(n_rows - 1, n_cols - 1, ld, is_row_major) + 1; + + std::vector dense_host(dense_size); + raft::update_host(dense_host.data(), dense, dense_size, stream); + handle.sync_stream(stream); + + std::vector indptr_host(n_rows + 1); + std::vector indices_host(n_rows * n_cols); + std::vector data_host(n_rows * n_cols); + + // create csr matrix from dense (with threshold) + for (int i = 0; i < n_rows; ++i) { + indptr_host[i] = nnz; + for (int j = 0; j < n_cols; ++j) { + math_t value = dense_host[get_offset(i, j, ld, is_row_major)]; + if (value > eps) { + indices_host[nnz] = j; + data_host[nnz] = value; + nnz++; + } + } + } + indptr_host[n_rows] = nnz; + + // fill back dense matrix from CSR + std::fill(dense_host.data(), dense_host.data() + dense_size, 0); + for (int i = 0; i < n_rows; ++i) { + for (int idx = indptr_host[i]; idx < indptr_host[i + 1]; ++idx) { + dense_host[get_offset(i, indices_host[idx], ld, is_row_major)] = data_host[idx]; + } + } + + raft::update_device(dense, dense_host.data(), dense_size, stream); + raft::update_device(indptr, indptr_host.data(), n_rows + 1, stream); + raft::update_device(indices, indices_host.data(), nnz, stream); + raft::update_device(data, data_host.data(), nnz, stream); + handle.sync_stream(stream); + + return nnz; + } + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel, handle)); + + Matrix* x1_matrix = nullptr; + Matrix* x2_matrix = nullptr; + + if (params.sparse_input != SparseType::DENSE) { + x1_csr_indptr.reserve(params.n1 + 1, stream); + x1_csr_indices.reserve(params.n1 * params.n_cols, stream); + x1_csr_data.reserve(params.n1 * params.n_cols, stream); + int nnz = prepareCsr(x1.data(), + params.n1, + params.ld1, + x1_csr_indptr.data(), + x1_csr_indices.data(), + x1_csr_data.data()); + x1_matrix = new CsrMatrix(x1_csr_indptr.data(), + x1_csr_indices.data(), + x1_csr_data.data(), + nnz, + params.n1, + params.n_cols); + } else { + x1_matrix = new DenseMatrix( + x1.data(), params.n1, params.n_cols, 
params.is_row_major, params.ld1); + } + + if (params.sparse_input == SparseType::CSR) { + x2_csr_indptr.reserve(params.n2 + 1, stream); + x2_csr_indices.reserve(params.n2 * params.n_cols, stream); + x2_csr_data.reserve(params.n2 * params.n_cols, stream); + int nnz = prepareCsr(x2.data(), + params.n2, + params.ld2, + x2_csr_indptr.data(), + x2_csr_indices.data(), + x2_csr_data.data()); + x2_matrix = new CsrMatrix(x2_csr_indptr.data(), + x2_csr_indices.data(), + x2_csr_data.data(), + nnz, + params.n2, + params.n_cols); + } else { + x2_matrix = new DenseMatrix( + x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); + } + + DenseMatrix gram_dense( + gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); + + naiveKernel(); + + (*kernel)(*x1_matrix, *x2_matrix, gram_dense, stream); + handle.sync_stream(stream); + + ASSERT_TRUE(raft::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f))); + + delete x1_matrix; + delete x2_matrix; + } + + raft::device_resources handle; + cudaStream_t stream = 0; + GramMatrixInputs params; + + rmm::device_uvector x1; + rmm::device_uvector x2; + + rmm::device_uvector x1_csr_indptr; + rmm::device_uvector x1_csr_indices; + rmm::device_uvector x1_csr_data; + rmm::device_uvector x2_csr_indptr; + rmm::device_uvector x2_csr_indices; + rmm::device_uvector x2_csr_data; + + rmm::device_uvector gram; + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloat; +typedef GramMatrixTest GramMatrixTestDouble; + +TEST_P(GramMatrixTestFloat, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); +}; // end namespace raft::distance::kernels From 8174693cf2281eae90b976b18e49dec2c1ec8e75 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Sun, 12 Mar 2023 10:35:14 -0700 Subject: [PATCH 07/20] review suggestions --- .../distance/detail/kernels/gram_matrix.cuh | 19 +- .../detail/kernels/kernel_matrices.cuh | 81 ++++---- cpp/test/distance/gram.cu | 61 ++---- cpp/test/distance/gram_base.cuh | 87 ++++++++ cpp/test/sparse/gram.cu | 185 ++++++++---------- 5 files changed, 237 insertions(+), 196 deletions(-) create mode 100644 cpp/test/distance/gram_base.cuh diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 1a2b4d67f8..65961e3089 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -48,7 +48,6 @@ class GramMatrixBase { GramMatrixBase(const raft::device_resources& handle) : handle(handle){}; virtual ~GramMatrixBase(){}; - /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format * @@ -56,22 +55,22 @@ class GramMatrixBase { * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out (dense) device matrix to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream - * @param dot_x1 optional dot product of x1 for expanded computation within RBF. - * @param dot_x2 optional dot product of x2 for expanded computation within RBF. + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
*/ virtual void operator()(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - math_t* dot_x1 = nullptr, - math_t* dot_x2 = nullptr) + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) { ASSERT(x1.n_rows == out.n_rows, "GramMatrix input matrix dimensions for x1 and out do not match"); ASSERT(x2.n_rows == out.n_cols, "GramMatrix input matrix dimensions for x2 and out do not match"); ASSERT(x1.n_cols == x2.n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - evaluate(x1, x2, out, stream, dot_x1, dot_x2); + evaluate(x1, x2, out, stream, norm_x1, norm_x2); } /** Evaluate the Gram matrix for two vector sets using simple dot product. @@ -80,15 +79,15 @@ class GramMatrixBase { * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream - * @param dot_x1 optional dot product of x1 for expanded computation within RBF. - * @param dot_x2 optional dot product of x2 for expanded computation within RBF. + * @param norm_x1 unused. + * @param norm_x2 unused. */ virtual void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - math_t* dot_x1, - math_t* dot_x2) + math_t* norm_x1, + math_t* norm_x2) { linear(x1, x2, out, stream); } diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 5079a87027..8b7954214c 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -102,26 +102,33 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga } /** Epiloge function for rbf kernel using expansion. 
- * Calculates output_ij = exp(-gain * (norm_i + norm_j - 2*input_ij)); + * + * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); + * + * Intended usage + * - input is the product of two matrices X and Y input_ij = \sum_k X_ik * Y_jk + * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X + * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y + * * @param inout device vector in column major format, size [ld * cols] * @param ld leading dimension of the inout buffer * @param rows number of rows (rows <= ld) * @param cols number of columns - * @param dot_rows dot product for row indices - * @param dot_cols dot product for column indices + * @param norm_x l2-norm of X's rows + * @param norm_y l2-norm of Y's rows * @param gain */ template __global__ void rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* dot_rows, math_t* dot_cols, math_t gain) + math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; tidy += blockDim.y * gridDim.y) { - math_t norm_y = dot_cols[tidy]; + math_t norm_y_val = norm_y[tidy]; for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; tidx += blockDim.x * gridDim.x) { inout[tidx + tidy * ld] = - exp(-1.0 * gain * (dot_rows[tidx] + norm_y - inout[tidx + tidy * ld] * 2)); + exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); } } } @@ -181,15 +188,15 @@ class PolynomialKernel : public GramMatrixBase { * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream - * @param dot_x1 optional dot product of x1 for expanded computation within RBF. - * @param dot_x2 optional dot product of x2 for expanded computation within RBF. + * @param norm_x1 unused. + * @param norm_x2 unused. */ void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - math_t* dot_x1, - math_t* dot_x2) + math_t* norm_x1, + math_t* norm_x2) { GramMatrixBase::linear(x1, x2, out, stream); applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, stream); @@ -247,15 +254,15 @@ class TanhKernel : public GramMatrixBase { * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream - * @param dot_x1 optional dot product of x1 for expanded computation within RBF. - * @param dot_x2 optional dot product of x2 for expanded computation within RBF. + * @param norm_x1 unused. + * @param norm_x2 unused. */ void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - math_t* dot_x1, - math_t* dot_x2) + math_t* norm_x1, + math_t* norm_x2) { GramMatrixBase::linear(x1, x2, out, stream); applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, stream); @@ -273,19 +280,19 @@ class RBFKernel : public GramMatrixBase { int ld, int rows, int cols, - math_t* dot_x1, - math_t* dot_x2, + math_t* norm_x1, + math_t* norm_x2, bool is_row_major, cudaStream_t stream) { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* dot_n1 = is_row_major ? dot_x2 : dot_x1; - math_t* dot_n2 = is_row_major ? 
dot_x1 : dot_x2; + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; rbf_kernel_expanded<<>>(inout, ld, n1, n2, dot_n1, dot_n2, gain); + stream>>>(inout, ld, n1, n2, norm_n1, norm_n2, gain); } public: @@ -336,37 +343,37 @@ class RBFKernel : public GramMatrixBase { * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream - * @param dot_x1 optional dot product of x1 for expanded computation within RBF. - * @param dot_x2 optional dot product of x2 for expanded computation within RBF. + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, cudaStream_t stream, - math_t* dot_x1, - math_t* dot_x2) + math_t* norm_x1, + math_t* norm_x2) { - if (x1.isDense() && x2.isDense() && (dot_x1 == nullptr || dot_x2 == nullptr)) { + if (x1.isDense() && x2.isDense() && (norm_x1 == nullptr || norm_x2 == nullptr)) { auto x1_dense = x1.asDense(); auto x2_dense = x2.asDense(); distance_rbf(*x1_dense, *x2_dense, out, stream); } else { - rmm::device_uvector tmp_dot_x1(0, stream); - rmm::device_uvector tmp_dot_x2(0, stream); - if (dot_x1 == nullptr) { - tmp_dot_x1.reserve(x1.n_rows, stream); - dot_x1 = tmp_dot_x1.data(); - matrixDot(x1, dot_x1, stream); + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.n_rows, stream); + norm_x1 = tmp_norm_x1.data(); + matrixDot(x1, norm_x1, stream); } - if (dot_x2 == nullptr) { - tmp_dot_x2.reserve(x2.n_rows, stream); - dot_x2 = tmp_dot_x2.data(); - matrixDot(x2, dot_x2, stream); + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.n_rows, stream); + norm_x2 = tmp_norm_x2.data(); + matrixDot(x2, norm_x2, stream); } // compute L2expanded GramMatrixBase::linear(x1, x2, out, stream); applyExpandedRbfKernel( - out.data, out.ld, out.n_rows, out.n_cols, dot_x1, dot_x2, out.is_row_major, stream); + out.data, out.ld, out.n_rows, out.n_cols, norm_x1, norm_x2, out.is_row_major, stream); } } diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index 7ea9cc3c7d..6a93fed0ad 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -19,6 +19,7 @@ #endif #include "../test_utils.cuh" +#include "gram_base.cuh" #include #include #include @@ -34,12 +35,6 @@ namespace raft::distance::kernels { using namespace raft::distance::matrix::detail; -// Get the offset of element [i,k]. -HDI int get_offset(int i, int k, int ld, bool is_row_major) -{ - return is_row_major ? i * ld + k : i + k * ld; -} - struct GramMatrixInputs { int n1; // feature vectors in matrix 1 int n2; // featuer vectors in matrix 2 @@ -113,45 +108,6 @@ class GramMatrixTest : public ::testing::TestWithParam { ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } - // Calculate the Gram matrix on the host. 
- void naiveKernel() - { - std::vector x1_host(x1.size()); - raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); - std::vector x2_host(x2.size()); - raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); - handle.sync_stream(stream); - - for (int i = 0; i < params.n1; i++) { - for (int j = 0; j < params.n2; j++) { - float d = 0; - for (int k = 0; k < params.n_cols; k++) { - if (params.kernel.kernel == KernelType::RBF) { - math_t diff = x1_host[get_offset(i, k, params.ld1, params.is_row_major)] - - x2_host[get_offset(j, k, params.ld2, params.is_row_major)]; - d += diff * diff; - } else { - d += x1_host[get_offset(i, k, params.ld1, params.is_row_major)] * - x2_host[get_offset(j, k, params.ld2, params.is_row_major)]; - } - } - int idx = get_offset(i, j, params.ld_out, params.is_row_major); - math_t v = 0; - switch (params.kernel.kernel) { - case (KernelType::LINEAR): gram_host[idx] = d; break; - case (KernelType::POLYNOMIAL): - v = params.kernel.gamma * d + params.kernel.coef0; - gram_host[idx] = std::pow(v, params.kernel.degree); - break; - case (KernelType::TANH): - gram_host[idx] = std::tanh(params.kernel.gamma * d + params.kernel.coef0); - break; - case (KernelType::RBF): gram_host[idx] = exp(-params.kernel.gamma * d); break; - } - } - } - } - void runTest() { std::unique_ptr> kernel = @@ -166,7 +122,20 @@ class GramMatrixTest : public ::testing::TestWithParam { (*kernel)(x1_dense, x2_dense, gram_dense, stream); - naiveKernel(); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); + ASSERT_TRUE(raft::devArrMatchHost( gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f))); } diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh new file mode 100644 index 0000000000..8c0652bc16 --- /dev/null +++ b/cpp/test/distance/gram_base.cuh @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace distance { +namespace kernels { + +// Get the offset of element [i,k]. +HDI int get_offset(int i, int k, int ld, bool is_row_major) +{ + return is_row_major ? i * ld + k : i + k * ld; +} + +// Calculate the Gram matrix on the host. 
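// Both the tests and the reference below address possibly padded buffers through get_offset();
// a quick numeric check of that convention (values illustrative only):
//   get_offset(1, 2, /*ld=*/5, /*is_row_major=*/true)  == 1 * 5 + 2 == 7
//   get_offset(1, 2, /*ld=*/5, /*is_row_major=*/false) == 1 + 2 * 5 == 11
// i.e. ld is the stride between consecutive rows (row major) or columns (column major),
// so it may exceed the matrix extent when the storage is padded.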
+template +void naiveGramMatrixKernel(int n1, + int n2, + int n_cols, + const rmm::device_uvector& x1, + const rmm::device_uvector& x2, + math_t* gram_host, + int ld1, + int ld2, + int ld_out, + bool is_row_major, + KernelParams kernel, + cudaStream_t stream, + const raft::device_resources& handle) +{ + std::vector x1_host(x1.size()); + raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); + std::vector x2_host(x2.size()); + raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); + handle.sync_stream(stream); + + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + float d = 0; + for (int k = 0; k < n_cols; k++) { + if (kernel.kernel == KernelType::RBF) { + math_t diff = x1_host[get_offset(i, k, ld1, is_row_major)] - + x2_host[get_offset(j, k, ld2, is_row_major)]; + d += diff * diff; + } else { + d += x1_host[get_offset(i, k, ld1, is_row_major)] * + x2_host[get_offset(j, k, ld2, is_row_major)]; + } + } + int idx = get_offset(i, j, ld_out, is_row_major); + math_t v = 0; + switch (kernel.kernel) { + case (KernelType::LINEAR): gram_host[idx] = d; break; + case (KernelType::POLYNOMIAL): + v = kernel.gamma * d + kernel.coef0; + gram_host[idx] = std::pow(v, kernel.degree); + break; + case (KernelType::TANH): gram_host[idx] = std::tanh(kernel.gamma * d + kernel.coef0); break; + case (KernelType::RBF): gram_host[idx] = exp(-kernel.gamma * d); break; + } + } + } +} + +} // namespace kernels +} // namespace distance +} // namespace raft diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index 2cf880d23e..bd714d25b3 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -18,6 +18,7 @@ #include #endif +#include "../distance/gram_base.cuh" #include "../test_utils.cuh" #include #include @@ -29,18 +30,19 @@ #include #include #include +#include #include namespace raft::distance::kernels { using namespace raft::distance::matrix::detail; -// Get the offset of element [i,k]. -HDI int get_offset(int i, int k, int ld, bool is_row_major) -{ - return is_row_major ? 
i * ld + k : i + k * ld; -} - +/** + * Structure to describe structure of the input matrices: + * - DENSE: dense, dense + * - MIX: CSR, dense + * - CSR: CSR, CSR + */ enum SparseType { DENSE, MIX, CSR }; struct GramMatrixInputs { @@ -69,59 +71,56 @@ std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) return os; } -const std::vector inputs = { - {42, 137, 2, false, SparseType::DENSE, {KernelType::LINEAR}}, - {42, 137, 2, true, SparseType::DENSE, {KernelType::LINEAR}}, - {42, 137, 2, false, SparseType::DENSE, {KernelType::LINEAR}, 64, 179, 181}, - {42, 137, 2, true, SparseType::DENSE, {KernelType::LINEAR}, 64, 179, 181}, - {42, 137, 2, false, SparseType::MIX, {KernelType::LINEAR}}, - {42, 137, 2, true, SparseType::MIX, {KernelType::LINEAR}}, - {42, 137, 2, false, SparseType::MIX, {KernelType::LINEAR}, 64, 179, 181}, - {42, 137, 2, true, SparseType::MIX, {KernelType::LINEAR}, 64, 179, 181}, - {42, 137, 2, false, SparseType::CSR, {KernelType::LINEAR}}, - {42, 137, 2, true, SparseType::CSR, {KernelType::LINEAR}}, - // CSR does not support ld_out - {42, 137, 2, false, SparseType::CSR, {KernelType::LINEAR}, 64, 179, 0}, - {42, 137, 2, true, SparseType::CSR, {KernelType::LINEAR}, 64, 179, 0}, - {137, 42, 2, false, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, - {137, 42, 2, true, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, - {137, 42, 2, false, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, - {137, 42, 2, true, SparseType::DENSE, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, - {137, 42, 2, false, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, - {137, 42, 2, true, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, - {137, 42, 2, false, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, - {137, 42, 2, true, SparseType::MIX, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, - {137, 42, 2, false, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, - {137, 42, 2, true, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, - // CSR does not support ld_out - {137, 42, 2, false, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 0}, - {137, 42, 2, true, SparseType::CSR, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 0}, - {42, 137, 2, false, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}}, - {42, 137, 2, true, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}}, - {42, 137, 2, false, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, - {42, 137, 2, true, SparseType::DENSE, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, - {42, 137, 2, false, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}}, - {42, 137, 2, true, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}}, - {42, 137, 2, false, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, - {42, 137, 2, true, SparseType::MIX, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, - {42, 137, 2, false, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}}, - {42, 137, 2, true, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}}, - // CSR does not support ld_out - {42, 137, 2, false, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 0}, - {42, 137, 2, true, SparseType::CSR, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 0}, - {3, 4, 2, false, SparseType::DENSE, {KernelType::RBF, 0, 0.5}}, - {42, 137, 2, false, SparseType::DENSE, {KernelType::RBF, 0, 0.5}}, - {42, 137, 2, true, SparseType::DENSE, {KernelType::RBF, 0, 0.5}}, - {3, 4, 2, false, SparseType::MIX, {KernelType::RBF, 0, 0.5}}, - {42, 137, 
2, false, SparseType::MIX, {KernelType::RBF, 0, 0.5}}, - {42, 137, 2, true, SparseType::MIX, {KernelType::RBF, 0, 0.5}}, - {3, 4, 2, false, SparseType::CSR, {KernelType::RBF, 0, 0.5}}, - {42, 137, 2, false, SparseType::CSR, {KernelType::RBF, 0, 0.5}}, - {42, 137, 2, true, SparseType::CSR, {KernelType::RBF, 0, 0.5}}, - // Distance kernel does not support LD parameter yet. - //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49}, - //{42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143}, -}; +/*struct KernelParams { + // Kernel function parameters + KernelType kernel; //!< Type of the kernel function + int degree; //!< Degree of polynomial kernel (ignored by others) + double gamma; //!< multiplier in the + double coef0; //!< additive constant in poly and tanh kernels +};*/ + +// const KernelParams linear_kernel_params{.kernel=KernelType::LINEAR}; + +// {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, {KernelType::TANH, 0, 0.5, 2.4}, {KernelType::RBF, 0, 0.5} +const std::vector inputs = raft::util::itertools::product( + {42}, + {137}, + {2}, + {true, false}, + {SparseType::DENSE, SparseType::MIX, SparseType::CSR}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}, + KernelParams{KernelType::RBF, 0, 0.5}}); + +// (ld_1, ld_2, ld_out) not supported by RBF and CSR +const std::vector inputs_ld = raft::util::itertools::product( + {137}, + {42}, + {2}, + {true, false}, + {SparseType::DENSE, SparseType::MIX}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}}, + {159}, + {73}, + {144}); + +// (ld_1, ld_2) are supported by CSR +const std::vector inputs_ld_csr = + raft::util::itertools::product( + {42}, + {137}, + {2}, + {true, false}, + {SparseType::CSR, SparseType::MIX}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}}, + {64}, + {155}, + {0}); template class GramMatrixTest : public ::testing::TestWithParam { @@ -164,45 +163,6 @@ class GramMatrixTest : public ::testing::TestWithParam { ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } - // Calculate the Gram matrix on the host. 
- void naiveKernel() - { - std::vector x1_host(x1.size()); - raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); - std::vector x2_host(x2.size()); - raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); - handle.sync_stream(stream); - - for (int i = 0; i < params.n1; i++) { - for (int j = 0; j < params.n2; j++) { - float d = 0; - for (int k = 0; k < params.n_cols; k++) { - if (params.kernel.kernel == KernelType::RBF) { - math_t diff = x1_host[get_offset(i, k, params.ld1, params.is_row_major)] - - x2_host[get_offset(j, k, params.ld2, params.is_row_major)]; - d += diff * diff; - } else { - d += x1_host[get_offset(i, k, params.ld1, params.is_row_major)] * - x2_host[get_offset(j, k, params.ld2, params.is_row_major)]; - } - } - int idx = get_offset(i, j, params.ld_out, params.is_row_major); - math_t v = 0; - switch (params.kernel.kernel) { - case (KernelType::LINEAR): gram_host[idx] = d; break; - case (KernelType::POLYNOMIAL): - v = params.kernel.gamma * d + params.kernel.coef0; - gram_host[idx] = std::pow(v, params.kernel.degree); - break; - case (KernelType::TANH): - gram_host[idx] = std::tanh(params.kernel.gamma * d + params.kernel.coef0); - break; - case (KernelType::RBF): gram_host[idx] = exp(-params.kernel.gamma * d); break; - } - } - } - } - int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data) { int nnz = 0; @@ -303,7 +263,19 @@ class GramMatrixTest : public ::testing::TestWithParam { DenseMatrix gram_dense( gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); - naiveKernel(); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); (*kernel)(*x1_matrix, *x2_matrix, gram_dense, stream); handle.sync_stream(stream); @@ -333,10 +305,17 @@ class GramMatrixTest : public ::testing::TestWithParam { std::vector gram_host; }; -typedef GramMatrixTest GramMatrixTestFloat; -typedef GramMatrixTest GramMatrixTestDouble; +typedef GramMatrixTest GramMatrixTestFloatStandard; +typedef GramMatrixTest GramMatrixTestFloatLd; +typedef GramMatrixTest GramMatrixTestFloatLdCsr; -TEST_P(GramMatrixTestFloat, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatStandard, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatLd, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatLdCsr, Gram) { runTest(); } -INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatStandard, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatLd, ::testing::ValuesIn(inputs_ld)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, + GramMatrixTestFloatLdCsr, + ::testing::ValuesIn(inputs_ld_csr)); }; // end namespace raft::distance::kernels From 5bbcd0018372d112cfce2d3dbe13b421406c1732 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 14 Mar 2023 10:56:16 +0000 Subject: [PATCH 08/20] review comments norm --- cpp/include/raft/sparse/linalg/detail/norm.cuh | 16 ++++++++-------- cpp/include/raft/sparse/linalg/norm.cuh | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index 7dbea8c76c..5af7749c39 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -187,7 +187,7 @@ template __global__ void 
__launch_bounds__(Policy::ThreadsPerBlock) - csrReductionKernel(Type* dots, + csrReductionKernel(Type* norm, const IdxType* ia, const Type* data, IdxType N, @@ -204,7 +204,7 @@ __global__ void __launch_bounds__(Policy::ThreadsPerBlock) acc = reduce_op(acc, main_op(data[j])); } acc = raft::logicalWarpReduce(acc, reduce_op); - if (threadIdx.x == 0) { dots[i] = final_op(acc); } + if (threadIdx.x == 0) { norm[i] = final_op(acc); } } template -void csrReduction(Type* dots, +void csrReduction(Type* norm, const IdxType* ia, const Type* data, IdxType N, @@ -228,12 +228,12 @@ void csrReduction(Type* dots, dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); csrReductionKernel - <<>>(dots, ia, data, N, init, main_op, reduce_op, final_op); + <<>>(norm, ia, data, N, init, main_op, reduce_op, final_op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } template -void rowNormCsrCaller(Type* dots, +void rowNormCsrCaller(Type* norm, const IdxType* ia, const Type* data, IdxType nnz, @@ -246,15 +246,15 @@ void rowNormCsrCaller(Type* dots, switch (type) { case raft::linalg::NormType::L1Norm: csrReduction>( - dots, ia, data, N, (Type)0, stream, raft::abs_op(), raft::add_op(), fin_op); + norm, ia, data, N, (Type)0, stream, raft::abs_op(), raft::add_op(), fin_op); break; case raft::linalg::NormType::L2Norm: csrReduction>( - dots, ia, data, N, (Type)0, stream, raft::sq_op(), raft::add_op(), fin_op); + norm, ia, data, N, (Type)0, stream, raft::sq_op(), raft::add_op(), fin_op); break; case raft::linalg::NormType::LinfNorm: csrReduction>( - dots, ia, data, N, (Type)0, stream, raft::abs_op(), raft::max_op(), fin_op); + norm, ia, data, N, (Type)0, stream, raft::abs_op(), raft::max_op(), fin_op); break; default: THROW("Unsupported norm type: %d", type); }; diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index d504e735fb..6f01569a98 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -77,8 +77,8 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) * @tparam Type the data type * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing - * @param dots the output vector of row-wise dot products - * @param ia the input matrix row pointers + * @param norm the output vector of row-wise norm, size [N] + * @param ia the input matrix row index array * @param data the input matrix nnz data * @param N number of rows of data * @param type the type of norm to be applied @@ -86,7 +86,7 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) * @param fin_op the final lambda op */ template -void rowNormCsr(Type* dots, +void rowNormCsr(Type* norm, const IdxType* ia, const Type* data, IdxType nnz, @@ -95,7 +95,7 @@ void rowNormCsr(Type* dots, cudaStream_t stream, Lambda fin_op = raft::identity_op()) { - detail::rowNormCsrCaller(dots, ia, data, nnz, N, type, stream, fin_op); + detail::rowNormCsrCaller(norm, ia, data, nnz, N, type, stream, fin_op); } }; // end NAMESPACE linalg From 86a03148b32f0977720ff134448612130b37041e Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 14 Mar 2023 17:51:50 +0000 Subject: [PATCH 09/20] removed handle member, but re-introduced old API to ensure backwards compatibility until cuml is updated --- .../distance/detail/kernels/gram_matrix.cuh | 118 ++++++++++--- .../detail/kernels/kernel_factory.cuh | 18 +- .../detail/kernels/kernel_matrices.cuh | 165 +++++++++++++++--- 
3 files changed, 259 insertions(+), 42 deletions(-) diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 65961e3089..14113bc2a7 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -42,10 +42,14 @@ namespace raft::distance::kernels::detail { */ template class GramMatrixBase { - const raft::device_resources& handle; + protected: + cublasHandle_t cublas_handle; + bool legacy_interface; public: - GramMatrixBase(const raft::device_resources& handle) : handle(handle){}; + GramMatrixBase() : legacy_interface(false){}; + [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) + : cublas_handle(cublas_handle), legacy_interface(true){}; virtual ~GramMatrixBase(){}; /** Convenience function to evaluate the Gram matrix for two vector sets. @@ -54,14 +58,14 @@ class GramMatrixBase { * @param [in] x1 device matrix, size [n1*n_cols] * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out (dense) device matrix to store the Gram matrix, size [n1*n2] - * @param [in] stream cuda stream + * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ virtual void operator()(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, + const raft::device_resources& handle, math_t* norm_x1 = nullptr, math_t* norm_x2 = nullptr) { @@ -70,7 +74,7 @@ class GramMatrixBase { ASSERT(x2.n_rows == out.n_cols, "GramMatrix input matrix dimensions for x2 and out do not match"); ASSERT(x1.n_cols == x2.n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - evaluate(x1, x2, out, stream, norm_x1, norm_x2); + evaluate(x1, x2, out, handle, norm_x1, norm_x2); } /** Evaluate the Gram matrix for two vector sets using simple dot product. @@ -78,18 +82,18 @@ class GramMatrixBase { * @param [in] x1 device matrix, size [n1*n_cols] * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] stream cuda stream + * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. 
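+ *
+ * The base-class implementation simply forwards to linear(), which dispatches
+ * at run time on whether x1 and x2 are dense or CSR; derived kernels override
+ * evaluate() to apply their element-wise epilogue on top of the resulting
+ * dot-product matrix.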
*/ virtual void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, + const raft::device_resources& handle, math_t* norm_x1, math_t* norm_x2) { - linear(x1, x2, out, stream); + linear(x1, x2, out, handle); } // private: @@ -107,11 +111,13 @@ class GramMatrixBase { * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] stream cuda stream + + @param [in] handle raft handle */ void linear(const raft::distance::matrix::detail::DenseMatrix& x1, const raft::distance::matrix::detail::DenseMatrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream) + cudaStream_t stream, + cublasHandle_t cublas_handle) { ASSERT(x1.is_row_major == x2.is_row_major, "GramMatrix leading dimensions for x1 and x2 do not match"); @@ -122,7 +128,7 @@ class GramMatrixBase { math_t beta = 0.0; if (out.is_row_major) { // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, out.n_cols, @@ -139,7 +145,7 @@ class GramMatrixBase { stream)); } else { // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, out.n_rows, @@ -160,7 +166,8 @@ class GramMatrixBase { void linear(const raft::distance::matrix::detail::CsrMatrix& x1, const raft::distance::matrix::detail::DenseMatrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream) + cudaStream_t stream, + const cusparseHandle_t& cusparse_handle) { math_t alpha = 1.0; math_t beta = 0.0; @@ -194,7 +201,7 @@ class GramMatrixBase { auto opX2 = CUSPARSE_OPERATION_TRANSPOSE; size_t bufferSize; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle.get_cusparse_handle(), + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(cusparse_handle, opX1, opX2, &alpha, @@ -210,7 +217,7 @@ class GramMatrixBase { rmm::device_uvector tmp(bufferSize, stream); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle.get_cusparse_handle(), + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(cusparse_handle, opX1, opX2, &alpha, @@ -231,7 +238,7 @@ class GramMatrixBase { void linear(const raft::distance::matrix::detail::CsrMatrix& x1, const raft::distance::matrix::detail::CsrMatrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream) + const raft::device_resources& handle) { int minor_out = out.is_row_major ? 
out.n_cols : out.n_rows; ASSERT(out.ld == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); @@ -279,29 +286,100 @@ class GramMatrixBase { * @param [in] x1 device matrix, size [n1*n_cols] * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] stream cuda stream + * @param [in] handle raft handle */ void linear(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream) + const raft::device_resources& handle) { // dispatch if (x1.isDense()) { ASSERT(x2.isDense(), "GramMatrix input matrix does not allow Dense*Csr"); auto x1_dense = x1.asDense(); auto x2_dense = x2.asDense(); - linear(*x1_dense, *x2_dense, out, stream); + linear(*x1_dense, *x2_dense, out, handle.get_stream(), handle.get_cublas_handle()); } else { auto x1_csr = x1.asCsr(); if (x2.isDense()) { auto x2_dense = x2.asDense(); - linear(*x1_csr, *x2_dense, out, stream); + linear(*x1_csr, *x2_dense, out, handle.get_stream(), handle.get_cusparse_handle()); } else { auto x2_csr = x2.asCsr(); - linear(*x1_csr, *x2_csr, out, stream); + linear(*x1_csr, *x2_csr, out, handle); } } } + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + raft::distance::matrix::detail::DenseMatrix dense1( + const_cast(x1), n1, n_cols, is_row_major, ld1); + raft::distance::matrix::detail::DenseMatrix dense2( + const_cast(x2), n2, n_cols, is_row_major, ld2); + raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); + linear(dense1, dense2, dense_out, stream, cublas_handle); + } + + /** Convenience function to evaluate the Gram matrix for two vector sets. 
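+ * This pointer-based overload is kept only for backwards compatibility until
+ * cuML moves to the handle-based API. It requires the deprecated
+ * cublasHandle_t constructor, substitutes default leading dimensions when 0
+ * is passed, and then forwards to the deprecated evaluate() above.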
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + [[deprecated]] void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0) + { + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } }; + }; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index ad4a81c55a..7c74e231d7 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -26,7 +26,23 @@ namespace raft::distance::kernels::detail { template class KernelFactory { public: - static GramMatrixBase* create(KernelParams params, const raft::device_resources& handle) + static GramMatrixBase* create(KernelParams params) + { + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(); break; + case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; + case TANH: res = new TanhKernel(gamma, coef0); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; + } + + [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle) { GramMatrixBase* res; // KernelParams is not templated, we convert the parameters to math_t here: diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 8b7954214c..8836a3605b 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -171,9 +171,13 @@ class PolynomialKernel : public GramMatrixBase { * @param exponent * @param gain * @param offset - * @param handle */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset, const raft::device_resources& handle) + PolynomialKernel(exp_t exponent, math_t gain, math_t offset) + : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) + { + } + + [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) { } @@ -187,19 +191,58 @@ class PolynomialKernel : public GramMatrixBase { * @param [in] x1 device matrix, size [n1*n_cols] * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, 
size [n1*n2] - * @param [in] stream cuda stream + * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, + const raft::device_resources& handle, math_t* norm_x1, math_t* norm_x2) { - GramMatrixBase::linear(x1, x2, out, stream); - applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, stream); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, handle.get_stream()); + } + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + raft::distance::matrix::detail::DenseMatrix dense1( + const_cast(x1), n1, n_cols, is_row_major, ld1); + raft::distance::matrix::detail::DenseMatrix dense2( + const_cast(x2), n2, n_cols, is_row_major, ld2); + raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); + GramMatrixBase::linear( + dense1, dense2, dense_out, stream, GramMatrixBase::cublas_handle); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); } }; @@ -237,9 +280,10 @@ class TanhKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain * @param offset - * @param handle */ - TanhKernel(math_t gain, math_t offset, const raft::device_resources& handle) + TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} + + [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) : GramMatrixBase(handle), gain(gain), offset(offset) { } @@ -253,19 +297,58 @@ class TanhKernel : public GramMatrixBase { * @param [in] x1 device matrix, size [n1*n_cols] * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] stream cuda stream + * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. 
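+ *
+ * The computation has two steps: the dot-product matrix is formed by
+ * GramMatrixBase::linear(), and the element-wise tanh(gain * value + offset)
+ * epilogue is then applied by applyKernel().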
*/ void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, + const raft::device_resources& handle, math_t* norm_x1, math_t* norm_x2) { - GramMatrixBase::linear(x1, x2, out, stream); - applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, stream); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, handle.get_stream()); + } + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + raft::distance::matrix::detail::DenseMatrix dense1( + const_cast(x1), n1, n_cols, is_row_major, ld1); + raft::distance::matrix::detail::DenseMatrix dense2( + const_cast(x2), n2, n_cols, is_row_major, ld2); + raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); + GramMatrixBase::linear( + dense1, dense2, dense_out, stream, GramMatrixBase::cublas_handle); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); } }; @@ -304,14 +387,16 @@ class RBFKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain */ - RBFKernel(math_t gain, const raft::device_resources& handle) + RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} + + [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) : GramMatrixBase(handle), gain(gain) { } - void matrixDot(const raft::distance::matrix::detail::Matrix& matrix, - math_t* target, - cudaStream_t stream) + void matrixRowNormL2(const raft::distance::matrix::detail::Matrix& matrix, + math_t* target, + cudaStream_t stream) { auto norm = raft::linalg::NormType::L2Norm; if (matrix.isDense()) { @@ -342,17 +427,18 @@ class RBFKernel : public GramMatrixBase { * @param [in] x1 device matrix, size [n1*n_cols] * @param [in] x2 device matrix, size [n2*n_cols] * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] stream cuda stream + * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
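+ *
+ * When the row norms are available (either passed in or computed on the fly
+ * via matrixRowNormL2), the kernel is evaluated in expanded form,
+ * out[i,k] = exp(-gain * (||x1_i||^2 + ||x2_k||^2 - 2 * <x1_i, x2_k>)),
+ * reusing the linear Gram matrix as the dot-product building block.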
*/ void evaluate(const raft::distance::matrix::detail::Matrix& x1, const raft::distance::matrix::detail::Matrix& x2, raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, + const raft::device_resources& handle, math_t* norm_x1, math_t* norm_x2) { + cudaStream_t stream = handle.get_stream(); if (x1.isDense() && x2.isDense() && (norm_x1 == nullptr || norm_x2 == nullptr)) { auto x1_dense = x1.asDense(); auto x2_dense = x2.asDense(); @@ -363,15 +449,15 @@ class RBFKernel : public GramMatrixBase { if (norm_x1 == nullptr) { tmp_norm_x1.reserve(x1.n_rows, stream); norm_x1 = tmp_norm_x1.data(); - matrixDot(x1, norm_x1, stream); + matrixRowNormL2(x1, norm_x1, stream); } if (norm_x2 == nullptr) { tmp_norm_x2.reserve(x2.n_rows, stream); norm_x2 = tmp_norm_x2.data(); - matrixDot(x2, norm_x2, stream); + matrixRowNormL2(x2, norm_x2, stream); } // compute L2expanded - GramMatrixBase::linear(x1, x2, out, stream); + GramMatrixBase::linear(x1, x2, out, handle); applyExpandedRbfKernel( out.data, out.ld, out.n_rows, out.n_cols, norm_x1, norm_x2, out.is_row_major, stream); } @@ -415,6 +501,43 @@ class RBFKernel : public GramMatrixBase { fin_op, out.is_row_major); } + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + raft::distance::matrix::detail::DenseMatrix dense1( + const_cast(x1), n1, n_cols, is_row_major, ld1); + raft::distance::matrix::detail::DenseMatrix dense2( + const_cast(x2), n2, n_cols, is_row_major, ld2); + raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); + distance_rbf(dense1, dense2, dense_out, stream); + } }; }; // end namespace raft::distance::kernels::detail From 591b77dcd75937f1cea0079d2304b47d2f9cadeb Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 20 Mar 2023 16:56:55 +0000 Subject: [PATCH 10/20] changed GramMatrix API to support device_mdspan/device_csr_matrix_view as input --- cpp/include/raft/core/device_mdspan.hpp | 30 + .../distance/detail/kernels/gram_matrix.cuh | 609 ++++++++++++------ .../detail/kernels/kernel_matrices.cuh | 440 +++++++++---- .../raft/distance/detail/matrix/matrix.hpp | 99 --- cpp/test/distance/gram.cu | 14 +- cpp/test/sparse/gram.cu | 95 ++- 6 files changed, 794 insertions(+), 493 deletions(-) delete mode 100644 cpp/include/raft/distance/detail/matrix/matrix.hpp diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index f72ae36d64..ace7ea0f2c 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -259,6 +259,36 @@ auto make_device_matrix_view(ElementType* 
ptr, IndexType n_rows, IndexType n_col return device_matrix_view{ptr, extents}; } +/** + * @brief Create a 2-dim mdspan instance for device pointer with a strided layout + * that is restricted to stride 1 in the trailing dimension. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + * @param[in] is_row_major whether the data is in row major format (column major otherwise) + * @param[in] ld leading dimension / stride of data + */ +template +auto make_device_matrix_view( + ElementType* ptr, IndexType n_rows, IndexType n_cols, bool is_row_major, IndexType ld) +{ + IndexType stride0 = is_row_major ? (ld > 0 ? ld : n_cols) : 1; + IndexType stride1 = is_row_major ? 1 : (ld > 0 ? ld : n_rows); + + assert(is_row_major ? stride0 >= n_cols : stride1 >= n_rows); + + matrix_extent extents{n_rows, n_cols}; + std::array strides{stride0, stride1}; + using mapping_type = typename layout_stride::template mapping>; + mapping_type layout = {extents, strides}; + + return device_matrix_view{ptr, layout}; +} + /** * @brief Create a 1-dim mdspan instance for device pointer. * @tparam ElementType the data type of the vector elements diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 14113bc2a7..9cce6cf5ee 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include #include @@ -28,6 +28,13 @@ namespace raft::distance::kernels::detail { +template +using dense_input_matrix_view_t = raft::device_matrix_view; +template +using dense_output_matrix_view_t = raft::device_matrix_view; +template +using csr_input_matrix_view_t = raft::device_csr_matrix_view; + /** * Base class for general Gram matrices * A Gram matrix is the Hermitian matrix of inner probucts G_ik = @@ -52,147 +59,410 @@ class GramMatrixBase { : cublas_handle(cublas_handle), legacy_interface(true){}; virtual ~GramMatrixBase(){}; + /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] - * @param [out] out (dense) device matrix to store the Gram matrix, size [n1*n2] + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
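+ *
+ * A minimal calling sketch (buffer names are hypothetical; the data is
+ * assumed to be row-major, padding-free and already resident on the device):
+ *
+ *   raft::device_resources handle;
+ *   auto x1  = raft::make_device_matrix_view<const float, int>(x1_ptr, n1, n_cols, true, 0);
+ *   auto x2  = raft::make_device_matrix_view<const float, int>(x2_ptr, n2, n_cols, true, 0);
+ *   auto out = raft::make_device_matrix_view<float, int>(out_ptr, n1, n2, true, 0);
+ *   GramMatrixBase<float> kernel;
+ *   kernel(x1, x2, out, handle);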
*/ - virtual void operator()(const raft::distance::matrix::detail::Matrix& x1, - const raft::distance::matrix::detail::Matrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) + void operator()(dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) + { + evaluate(x1, x2, out, handle, norm_x1, norm_x2); + } + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) + { + evaluate(x1, x2, out, handle, norm_x1, norm_x2); + } + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) { - ASSERT(x1.n_rows == out.n_rows, - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.n_rows == out.n_cols, - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x1.n_cols == x2.n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); evaluate(x1, x2, out, handle, norm_x1, norm_x2); } + // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + /** Evaluate the Gram matrix for two vector sets using simple dot product. * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + linear(x1, x2, out, handle); + } + /** Evaluate the Gram matrix for two vector sets using simple dot product. 
+ * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + linear(x1, x2, out, handle); + } + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - virtual void evaluate(const raft::distance::matrix::detail::Matrix& x1, - const raft::distance::matrix::detail::Matrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle, + virtual void evaluate(csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { linear(x1, x2, out, handle); } - // private: - // The following methods should be private, they are kept public to avoid: - // "error: The enclosing parent function ("distance") for an extended - // __device__ lambda cannot have private or protected access within its class" + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + /** Convenience function to evaluate the Gram matrix for two vector sets. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + [[deprecated]] void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0) + { + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + protected: /** Calculates the Gram matrix using simple dot product between vector sets. * * out = x1 * x2 * * Can be used as a building block for more complex kernel functions. * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format * @param [in] stream cuda stream - + @param [in] handle raft handle + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out */ - void linear(const raft::distance::matrix::detail::DenseMatrix& x1, - const raft::distance::matrix::detail::DenseMatrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, - cublasHandle_t cublas_handle) + [[deprecated]] void linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) { - ASSERT(x1.is_row_major == x2.is_row_major, - "GramMatrix leading dimensions for x1 and x2 do not match"); - ASSERT(x2.is_row_major == out.is_row_major, - "GramMatrix leading dimensions for x2 and out do not match"); - math_t alpha = 1.0; math_t beta = 0.0; - if (out.is_row_major) { + if (is_row_major) { // #TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, - out.n_cols, - out.n_rows, - x1.n_cols, + n2, + n1, + n_cols, &alpha, - x2.data, - x2.ld, - x1.data, - x1.ld, + x2, + ld2, + x1, + ld1, &beta, - out.data, - out.ld, + out, + ld_out, stream)); } else { // #TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, - out.n_rows, - out.n_cols, - x1.n_cols, + n1, + n2, + n_cols, &alpha, - x1.data, - x1.ld, - x2.data, - x2.ld, + x1, + ld1, + x2, + ld2, &beta, - out.data, - out.ld, + out, + ld_out, stream)); } } - void linear(const raft::distance::matrix::detail::CsrMatrix& x1, - const 
raft::distance::matrix::detail::DenseMatrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream, - const cusparseHandle_t& cusparse_handle) + protected: + bool get_is_row_major(dense_output_matrix_view_t matrix) + { + ASSERT(matrix.stride(0) == 1 || matrix.stride(1) == 1, + "GramMatrix matrix layout minor stride needs to be 1"); + return (matrix.stride(1) == 1); + } + + bool get_is_row_major(dense_input_matrix_view_t matrix) + { + ASSERT(matrix.stride(0) == 1 || matrix.stride(1) == 1, + "GramMatrix matrix layout minor stride needs to be 1"); + return (matrix.stride(1) == 1); + } + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + */ + void linear(dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle) { + // check is_row_major consistency + bool is_row_major = get_is_row_major(out); + ASSERT(is_row_major ? (x1.stride(1) == 1) : (x1.stride(0) == 1), + "GramMatrix leading dimensions for x1 and out do not match"); + ASSERT(is_row_major ? (x2.stride(1) == 1) : (x2.stride(0) == 1), + "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + math_t alpha = 1.0; math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + handle.get_stream())); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + handle.get_stream())); + } + } - ASSERT(x2.is_row_major == out.is_row_major, + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + */ + void linear(csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle) + { + // check is_row_major consistency + bool is_row_major = get_is_row_major(out); + ASSERT(is_row_major ? 
(x2.stride(1) == 1) : (x2.stride(0) == 1), "GramMatrix leading dimensions for x2 and out do not match"); + // check dimensions + auto x1_structure = x1.get_structure(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + cusparseSpMatDescr_t descrX1; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr(&descrX1, - x1.n_rows, - x1.n_cols, - x1.nnz, - const_cast(x1.indptr), - const_cast(x1.indices), - const_cast(x1.data))); + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsecreatecsr(&descrX1, + x1_structure.get_n_rows(), + x1_structure.get_n_cols(), + x1_structure.get_nnz(), + const_cast(x1_structure.get_indptr().data()), + const_cast(x1_structure.get_indices().data()), + const_cast(x1.get_elements().data()))); - auto order = out.is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; + auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; cusparseDnMatDescr_t descrX2; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( - &descrX2, x2.n_rows, x2.n_cols, x2.ld, const_cast(x2.data), order)); + &descrX2, x2.extent(0), x2.extent(1), ld2, const_cast(x2.data_handle()), order)); cusparseDnMatDescr_t descrOut; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( - &descrOut, out.n_rows, out.n_cols, out.ld, const_cast(out.data), order)); + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsecreatednmat(&descrOut, + out.extent(0), + out.extent(1), + ld_out, + const_cast(out.data_handle()), + order)); auto alg = order == CUSPARSE_ORDER_COL ? CUSPARSE_SPMM_CSR_ALG1 : CUSPARSE_SPMM_CSR_ALG2; @@ -201,7 +471,7 @@ class GramMatrixBase { auto opX2 = CUSPARSE_OPERATION_TRANSPOSE; size_t bufferSize; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(cusparse_handle, + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle.get_cusparse_handle(), opX1, opX2, &alpha, @@ -211,13 +481,13 @@ class GramMatrixBase { descrOut, alg, &bufferSize, - stream)); + handle.get_stream())); - raft::interruptible::synchronize(stream); + raft::interruptible::synchronize(handle.get_stream()); - rmm::device_uvector tmp(bufferSize, stream); + rmm::device_uvector tmp(bufferSize, handle.get_stream()); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(cusparse_handle, + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle.get_cusparse_handle(), opX1, opX2, &alpha, @@ -227,7 +497,7 @@ class GramMatrixBase { descrOut, alg, tmp.data(), - stream)); + handle.get_stream())); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descrX1)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descrX2)); @@ -235,150 +505,63 @@ class GramMatrixBase { RAFT_CUDA_TRY(cudaPeekAtLastError()); } - void linear(const raft::distance::matrix::detail::CsrMatrix& x1, - const raft::distance::matrix::detail::CsrMatrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle) - { - int minor_out = out.is_row_major ? 
out.n_cols : out.n_rows; - ASSERT(out.ld == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); - raft::sparse::distance::distances_config_t dist_config(handle); - - // switch a,b based on is_row_major - if (!out.is_row_major) { - dist_config.a_nrows = x2.n_rows; - dist_config.a_ncols = x2.n_cols; - dist_config.a_nnz = x2.nnz; - dist_config.a_indptr = const_cast(x2.indptr); - dist_config.a_indices = const_cast(x2.indices); - dist_config.a_data = const_cast(x2.data); - dist_config.b_nrows = x1.n_rows; - dist_config.b_ncols = x1.n_cols; - dist_config.b_nnz = x1.nnz; - dist_config.b_indptr = const_cast(x1.indptr); - dist_config.b_indices = const_cast(x1.indices); - dist_config.b_data = const_cast(x1.data); - } else { - dist_config.a_nrows = x1.n_rows; - dist_config.a_ncols = x1.n_cols; - dist_config.a_nnz = x1.nnz; - dist_config.a_indptr = const_cast(x1.indptr); - dist_config.a_indices = const_cast(x1.indices); - dist_config.a_data = const_cast(x1.data); - dist_config.b_nrows = x2.n_rows; - dist_config.b_ncols = x2.n_cols; - dist_config.b_nnz = x2.nnz; - dist_config.b_indptr = const_cast(x2.indptr); - dist_config.b_indices = const_cast(x2.indices); - dist_config.b_data = const_cast(x2.data); - } - - raft::sparse::distance::pairwiseDistance( - out.data, dist_config, raft::distance::DistanceType::InnerProduct, 0.0); - } - /** Calculates the Gram matrix using simple dot product between vector sets. * * out = x1 * x2 * * Can be used as a building block for more complex kernel functions. * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] * @param [in] handle raft handle */ - void linear(const raft::distance::matrix::detail::Matrix& x1, - const raft::distance::matrix::detail::Matrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle) + void linear(csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle) { - // dispatch - if (x1.isDense()) { - ASSERT(x2.isDense(), "GramMatrix input matrix does not allow Dense*Csr"); - auto x1_dense = x1.asDense(); - auto x2_dense = x2.asDense(); - linear(*x1_dense, *x2_dense, out, handle.get_stream(), handle.get_cublas_handle()); + // check is_row_major consistency + bool is_row_major = get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + int minor_out = is_row_major ? 
out.extent(1) : out.extent(0); + ASSERT(ld_out == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); + + auto x1_structure = x1.get_structure(); + auto x2_structure = x2.get_structure(); + raft::sparse::distance::distances_config_t dist_config(handle); + + // switch a,b based on is_row_major + if (!is_row_major) { + dist_config.a_nrows = x2_structure.get_n_rows(); + dist_config.a_ncols = x2_structure.get_n_cols(); + dist_config.a_nnz = x2_structure.get_nnz(); + dist_config.a_indptr = const_cast(x2_structure.get_indptr().data()); + dist_config.a_indices = const_cast(x2_structure.get_indices().data()); + dist_config.a_data = const_cast(x2.get_elements().data()); + dist_config.b_nrows = x1_structure.get_n_rows(); + dist_config.b_ncols = x1_structure.get_n_cols(); + dist_config.b_nnz = x1_structure.get_nnz(); + dist_config.b_indptr = const_cast(x1_structure.get_indptr().data()); + dist_config.b_indices = const_cast(x1_structure.get_indices().data()); + dist_config.b_data = const_cast(x1.get_elements().data()); } else { - auto x1_csr = x1.asCsr(); - if (x2.isDense()) { - auto x2_dense = x2.asDense(); - linear(*x1_csr, *x2_dense, out, handle.get_stream(), handle.get_cusparse_handle()); - } else { - auto x2_csr = x2.asCsr(); - linear(*x1_csr, *x2_csr, out, handle); - } + dist_config.a_nrows = x1_structure.get_n_rows(); + dist_config.a_ncols = x1_structure.get_n_cols(); + dist_config.a_nnz = x1_structure.get_nnz(); + dist_config.a_indptr = const_cast(x1_structure.get_indptr().data()); + dist_config.a_indices = const_cast(x1_structure.get_indices().data()); + dist_config.a_data = const_cast(x1.get_elements().data()); + dist_config.b_nrows = x2_structure.get_n_rows(); + dist_config.b_ncols = x2_structure.get_n_cols(); + dist_config.b_nnz = x2_structure.get_nnz(); + dist_config.b_indptr = const_cast(x2_structure.get_indptr().data()); + dist_config.b_indices = const_cast(x2_structure.get_indices().data()); + dist_config.b_data = const_cast(x2.get_elements().data()); } - } - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - raft::distance::matrix::detail::DenseMatrix dense1( - const_cast(x1), n1, n_cols, is_row_major, ld1); - raft::distance::matrix::detail::DenseMatrix dense2( - const_cast(x2), n2, n_cols, is_row_major, ld2); - raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); - linear(dense1, dense2, dense_out, stream, cublas_handle); - } - /** Convenience function to evaluate the Gram matrix for two vector sets. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + raft::sparse::distance::pairwiseDistance( + out.data_handle(), dist_config, raft::distance::DistanceType::InnerProduct, 0.0); } }; diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 8836a3605b..cb93ee3cf8 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -188,22 +188,79 @@ class PolynomialKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. 
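+   *
+   * A minimal usage sketch; the concrete view types, kernel template arguments and
+   * constructor arguments below are illustrative assumptions, not mandated by this overload:
+   * @code{.cpp}
+   *   // x1_csr : e.g. raft::device_csr_matrix_view<float, int, int, int>, n1 x n_cols
+   *   // x2     : e.g. raft::device_matrix_view<float, int, raft::layout_stride>, n2 x n_cols
+   *   // out    : dense n1 x n2 view with a layout matching x2
+   *   PolynomialKernel<float, int> poly(3, 1.0f, 1.0f);  // assumed (exponent, gain, offset) ctor
+   *   poly.evaluate(x1_csr, x2, out, handle, nullptr, nullptr);  // norm arguments unused here
+   * @endcode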
*/ - void evaluate(const raft::distance::matrix::detail::Matrix& x1, - const raft::distance::matrix::detail::Matrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle, + void evaluate(csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); GramMatrixBase::linear(x1, x2, out, handle); - applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, handle.get_stream()); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } /** Evaluate the Gram matrix using the legacy interface. @@ -235,13 +292,8 @@ class PolynomialKernel : public GramMatrixBase { { ASSERT(GramMatrixBase::legacy_interface, "Legacy interface can only be used with legacy ctor."); - raft::distance::matrix::detail::DenseMatrix dense1( - const_cast(x1), n1, n_cols, is_row_major, ld1); - raft::distance::matrix::detail::DenseMatrix dense2( - const_cast(x2), n2, n_cols, is_row_major, ld2); - raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); GramMatrixBase::linear( - dense1, dense2, dense_out, stream, GramMatrixBase::cublas_handle); + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); applyKernel(out, ld_out, n1, n2, is_row_major, stream); } }; @@ -294,22 +346,79 @@ class TanhKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. 
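+   *
+   * For illustration only; the view types and the (gain, offset) constructor shown
+   * here are assumptions rather than part of this overload's contract:
+   * @code{.cpp}
+   *   // x1, x2 : e.g. raft::device_matrix_view<float, int, raft::layout_stride>
+   *   // out    : dense n1 x n2 view with the same layout as x1 and x2
+   *   TanhKernel<float> tanh_kernel(0.5f, 1.0f);  // assumed (gain, offset) ctor
+   *   tanh_kernel.evaluate(x1, x2, out, handle, nullptr, nullptr);
+   * @endcode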
*/ - void evaluate(const raft::distance::matrix::detail::Matrix& x1, - const raft::distance::matrix::detail::Matrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle, + void evaluate(dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); GramMatrixBase::linear(x1, x2, out, handle); - applyKernel(out.data, out.ld, out.n_rows, out.n_cols, out.is_row_major, handle.get_stream()); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } /** Evaluate the Gram matrix using the legacy interface. 
@@ -341,13 +450,8 @@ class TanhKernel : public GramMatrixBase { { ASSERT(GramMatrixBase::legacy_interface, "Legacy interface can only be used with legacy ctor."); - raft::distance::matrix::detail::DenseMatrix dense1( - const_cast(x1), n1, n_cols, is_row_major, ld1); - raft::distance::matrix::detail::DenseMatrix dense2( - const_cast(x2), n2, n_cols, is_row_major, ld2); - raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); GramMatrixBase::linear( - dense1, dense2, dense_out, stream, GramMatrixBase::cublas_handle); + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); applyKernel(out, ld_out, n1, n2, is_row_major, stream); } }; @@ -359,14 +463,14 @@ template class RBFKernel : public GramMatrixBase { math_t gain; - void applyExpandedRbfKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) + void applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream) { int n1 = is_row_major ? cols : rows; int n2 = is_row_major ? rows : cols; @@ -394,28 +498,83 @@ class RBFKernel : public GramMatrixBase { { } - void matrixRowNormL2(const raft::distance::matrix::detail::Matrix& matrix, + void matrixRowNormL2(dense_input_matrix_view_t matrix, math_t* target, cudaStream_t stream) { - auto norm = raft::linalg::NormType::L2Norm; - if (matrix.isDense()) { - auto dense_matrix = matrix.asDense(); - int minor = dense_matrix->is_row_major ? matrix.n_cols : matrix.n_rows; - ASSERT(dense_matrix->ld == minor, - "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - dense_matrix->data, - matrix.n_cols, - matrix.n_rows, - norm, - dense_matrix->is_row_major, - stream); - } else { - auto csr_matrix = matrix.asCsr(); - raft::sparse::linalg::rowNormCsr( - target, csr_matrix->indptr, csr_matrix->data, csr_matrix->nnz, matrix.n_rows, norm, stream); + bool is_row_major = GramMatrixBase::get_is_row_major(matrix); + int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); + int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); + ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); + raft::linalg::rowNorm(target, + matrix.data_handle(), + matrix.extent(1), + matrix.extent(0), + raft::linalg::NormType::L2Norm, + is_row_major, + stream); + } + + void matrixRowNormL2(csr_input_matrix_view_t matrix, math_t* target, cudaStream_t stream) + { + auto matrix_structure = matrix.get_structure(); + raft::sparse::linalg::rowNormCsr(target, + matrix_structure.get_indptr().data(), + matrix.get_elements().data(), + matrix_structure.get_nnz(), + matrix_structure.get_n_rows(), + raft::linalg::NormType::L2Norm, + stream); + } + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
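+   *
+   * Example call with assumed view types and a gain-only constructor; passing
+   * nullptr for both norms lets the kernel compute the row norms lazily:
+   * @code{.cpp}
+   *   // x1, x2 : e.g. raft::device_matrix_view<float, int, raft::layout_stride>
+   *   // out    : dense n1 x n2 view with the same layout as x1 and x2
+   *   RBFKernel<float> rbf(1.0f);  // assumed gain-only ctor
+   *   rbf.evaluate(x1, x2, out, handle, nullptr, nullptr);
+   * @endcode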
+ */ + void evaluate(dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) + { + cudaStream_t stream = handle.get_stream(); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.extent(0), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(x1, norm_x1, stream); } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(x2, norm_x2, stream); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + handle.get_stream()); } /** Evaluate kernel matrix using RBF kernel. @@ -424,82 +583,98 @@ class RBFKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and | | euclidean distance. * - * @param [in] x1 device matrix, size [n1*n_cols] - * @param [in] x2 device matrix, size [n2*n_cols] - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
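+   *
+   * Illustrative call (the CSR view type and constructor are assumptions); the row
+   * norms are again computed lazily when nullptr is passed:
+   * @code{.cpp}
+   *   // x1_csr : e.g. raft::device_csr_matrix_view<float, int, int, int>, n1 x n_cols
+   *   // x2     : e.g. raft::device_matrix_view<float, int, raft::layout_stride>, n2 x n_cols
+   *   RBFKernel<float> rbf(1.0f);
+   *   rbf.evaluate(x1_csr, x2, out, handle, nullptr, nullptr);
+   * @endcode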
*/ - void evaluate(const raft::distance::matrix::detail::Matrix& x1, - const raft::distance::matrix::detail::Matrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - const raft::device_resources& handle, + void evaluate(csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { cudaStream_t stream = handle.get_stream(); - if (x1.isDense() && x2.isDense() && (norm_x1 == nullptr || norm_x2 == nullptr)) { - auto x1_dense = x1.asDense(); - auto x2_dense = x2.asDense(); - distance_rbf(*x1_dense, *x2_dense, out, stream); - } else { - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.n_rows, stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(x1, norm_x1, stream); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.n_rows, stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(x2, norm_x2, stream); - } - // compute L2expanded - GramMatrixBase::linear(x1, x2, out, handle); - applyExpandedRbfKernel( - out.data, out.ld, out.n_rows, out.n_cols, norm_x1, norm_x2, out.is_row_major, stream); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.get_structure().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(x1, norm_x1, stream); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(x2, norm_x2, stream); } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + handle.get_stream()); } - /** Customize distance function withe RBF epilogue */ - void distance_rbf(const raft::distance::matrix::detail::DenseMatrix& x1, - const raft::distance::matrix::detail::DenseMatrix& x2, - raft::distance::matrix::detail::DenseMatrix& out, - cudaStream_t stream) + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param [in] handle raft handle + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + raft::device_resources const& handle, + math_t* norm_x1, + math_t* norm_x2) { - int minor1 = x1.is_row_major ? x1.n_cols : x1.n_rows; - int minor2 = x2.is_row_major ? x2.n_cols : x2.n_rows; - int minor_out = out.is_row_major ? 
out.n_cols : out.n_rows; - ASSERT(x1.ld == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(x2.ld == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(out.ld == minor_out, "RBF Kernel distance does not support ld_out parameter"); - ASSERT(x1.is_row_major == x2.is_row_major, - "GramMatrix leading dimensions for x1 and x2 do not match"); - ASSERT(x2.is_row_major == out.is_row_major, - "GramMatrix leading dimensions for x2 and out do not match"); + cudaStream_t stream = handle.get_stream(); - math_t gain = this->gain; - using index_t = int64_t; + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.get_structure().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(x1, norm_x1, stream); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.get_structure().get_n_rows(), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(x2, norm_x2, stream); + } - auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); }; - raft::distance::distance(device_resources(stream), - const_cast(x1.data), - const_cast(x2.data), - out.data, - out.n_rows, - out.n_cols, - x1.n_cols, - NULL, - 0, - fin_op, - out.is_row_major); + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(x1, x2, out, handle); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + handle.get_stream()); } /** Evaluate the Gram matrix using the legacy interface. @@ -531,12 +706,33 @@ class RBFKernel : public GramMatrixBase { { ASSERT(GramMatrixBase::legacy_interface, "Legacy interface can only be used with legacy ctor."); - raft::distance::matrix::detail::DenseMatrix dense1( - const_cast(x1), n1, n_cols, is_row_major, ld1); - raft::distance::matrix::detail::DenseMatrix dense2( - const_cast(x2), n2, n_cols, is_row_major, ld2); - raft::distance::matrix::detail::DenseMatrix dense_out(out, n1, n2, is_row_major, ld_out); - distance_rbf(dense1, dense2, dense_out, stream); + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? n2 : n1; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + + math_t gain = this->gain; + using index_t = int64_t; + + auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); }; + raft::distance::distance(device_resources(stream), + const_cast(x1), + const_cast(x2), + out, + n1, + n2, + n_cols, + NULL, + 0, + fin_op, + is_row_major); } }; diff --git a/cpp/include/raft/distance/detail/matrix/matrix.hpp b/cpp/include/raft/distance/detail/matrix/matrix.hpp deleted file mode 100644 index d4a0dda691..0000000000 --- a/cpp/include/raft/distance/detail/matrix/matrix.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::distance::matrix::detail { - -template -class DenseMatrix; -template -class CsrMatrix; - -/* - * Thin matrix wrapper to allow single API for different matrix representations - */ -template -class Matrix { - public: - Matrix(int rows, int cols) : n_rows(rows), n_cols(cols){}; - virtual bool isDense() const = 0; - virtual ~Matrix(){}; - - DenseMatrix* asDense() - { - DenseMatrix* cast = dynamic_cast*>(this); - ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); - return cast; - }; - - CsrMatrix* asCsr() - { - CsrMatrix* cast = dynamic_cast*>(this); - ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); - return cast; - }; - - const DenseMatrix* asDense() const - { - const DenseMatrix* cast = dynamic_cast*>(this); - ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); - return cast; - }; - - const CsrMatrix* asCsr() const - { - const CsrMatrix* cast = dynamic_cast*>(this); - ASSERT(cast != nullptr, "Invalid cast! Please check for isDense() before casting."); - return cast; - }; - - int n_rows; - int n_cols; -}; - -template -class DenseMatrix : public Matrix { - public: - DenseMatrix(math_t* data, int rows, int cols, bool row_major = false, int ld_in = 0) - : Matrix(rows, cols), data(data), is_row_major(row_major), ld(ld_in) - { - if (ld <= 0) ld = is_row_major ? 
cols : rows; - } - bool isDense() const { return true; } - math_t* data; - bool is_row_major; - int ld; -}; - -template -class CsrMatrix : public Matrix { - public: - CsrMatrix(int* indptr, int* indices, math_t* data, int nnz, int rows, int cols) - : Matrix(rows, cols), indptr(indptr), indices(indices), data(data), nnz(nnz) - { - } - bool isDense() const { return false; } - - int nnz; - int* indptr; - int* indices; - math_t* data; -}; - -} // namespace raft::distance::matrix::detail \ No newline at end of file diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index 6a93fed0ad..4adc07b240 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -33,8 +32,6 @@ namespace raft::distance::kernels { -using namespace raft::distance::matrix::detail; - struct GramMatrixInputs { int n1; // feature vectors in matrix 1 int n2; // featuer vectors in matrix 2 @@ -111,16 +108,15 @@ class GramMatrixTest : public ::testing::TestWithParam { void runTest() { std::unique_ptr> kernel = - std::unique_ptr>(KernelFactory::create(params.kernel, handle)); + std::unique_ptr>(KernelFactory::create(params.kernel)); - DenseMatrix x1_dense( + auto x1_span = raft::make_device_matrix_view( x1.data(), params.n1, params.n_cols, params.is_row_major, params.ld1); - DenseMatrix x2_dense( + auto x2_span = raft::make_device_matrix_view( x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); - DenseMatrix gram_dense( + auto out_span = raft::make_device_matrix_view( gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); - - (*kernel)(x1_dense, x2_dense, gram_dense, stream); + (*kernel)(x1_span, x2_span, out_span, handle); naiveGramMatrixKernel(params.n1, params.n2, diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index bd714d25b3..22f5e3b991 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,8 +34,6 @@ namespace raft::distance::kernels { -using namespace raft::distance::matrix::detail; - /** * Structure to describe structure of the input matrices: * - DENSE: dense, dense @@ -213,56 +210,58 @@ class GramMatrixTest : public ::testing::TestWithParam { void runTest() { std::unique_ptr> kernel = - std::unique_ptr>(KernelFactory::create(params.kernel, handle)); + std::unique_ptr>(KernelFactory::create(params.kernel)); - Matrix* x1_matrix = nullptr; - Matrix* x2_matrix = nullptr; + auto x1_span = raft::make_device_matrix_view( + x1.data(), params.n1, params.n_cols, params.is_row_major, params.ld1); + auto x2_span = raft::make_device_matrix_view( + x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); + auto out_span = raft::make_device_matrix_view( + gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); - if (params.sparse_input != SparseType::DENSE) { + if (params.sparse_input == SparseType::DENSE) { + (*kernel)(x1_span, x2_span, out_span, handle); + } else { x1_csr_indptr.reserve(params.n1 + 1, stream); x1_csr_indices.reserve(params.n1 * params.n_cols, stream); x1_csr_data.reserve(params.n1 * params.n_cols, stream); - int nnz = prepareCsr(x1.data(), - params.n1, - params.ld1, - x1_csr_indptr.data(), - x1_csr_indices.data(), - x1_csr_data.data()); - x1_matrix = new CsrMatrix(x1_csr_indptr.data(), - x1_csr_indices.data(), - x1_csr_data.data(), - nnz, - params.n1, - params.n_cols); - } else { - x1_matrix = new DenseMatrix( - x1.data(), 
params.n1, params.n_cols, params.is_row_major, params.ld1); - } - - if (params.sparse_input == SparseType::CSR) { - x2_csr_indptr.reserve(params.n2 + 1, stream); - x2_csr_indices.reserve(params.n2 * params.n_cols, stream); - x2_csr_data.reserve(params.n2 * params.n_cols, stream); - int nnz = prepareCsr(x2.data(), - params.n2, - params.ld2, - x2_csr_indptr.data(), - x2_csr_indices.data(), - x2_csr_data.data()); - x2_matrix = new CsrMatrix(x2_csr_indptr.data(), - x2_csr_indices.data(), - x2_csr_data.data(), - nnz, - params.n2, - params.n_cols); - } else { - x2_matrix = new DenseMatrix( - x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); + int x1_nnz = prepareCsr(x1.data(), + params.n1, + params.ld1, + x1_csr_indptr.data(), + x1_csr_indices.data(), + x1_csr_data.data()); + + auto x1_csr_structure = raft::make_device_csr_structure_view( + x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz); + + auto x1_csr = raft::device_csr_matrix_view( + raft::device_span(x1_csr_data.data(), x1_csr_structure.get_nnz()), + x1_csr_structure); + + if (params.sparse_input == SparseType::MIX) { + (*kernel)(x1_csr, x2_span, out_span, handle); + } else { + x2_csr_indptr.reserve(params.n2 + 1, stream); + x2_csr_indices.reserve(params.n2 * params.n_cols, stream); + x2_csr_data.reserve(params.n2 * params.n_cols, stream); + int x2_nnz = prepareCsr(x2.data(), + params.n2, + params.ld2, + x2_csr_indptr.data(), + x2_csr_indices.data(), + x2_csr_data.data()); + + auto x2_csr_structure = raft::make_device_csr_structure_view( + x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz); + auto x2_csr = raft::device_csr_matrix_view( + raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()), + x2_csr_structure); + + (*kernel)(x1_csr, x2_csr, out_span, handle); + } } - DenseMatrix gram_dense( - gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); - naiveGramMatrixKernel(params.n1, params.n2, params.n_cols, @@ -277,14 +276,10 @@ class GramMatrixTest : public ::testing::TestWithParam { stream, handle); - (*kernel)(*x1_matrix, *x2_matrix, gram_dense, stream); handle.sync_stream(stream); ASSERT_TRUE(raft::devArrMatchHost( gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f))); - - delete x1_matrix; - delete x2_matrix; } raft::device_resources handle; From 2403b2d73f5040e7b15204047861ae7e5443bcc4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 30 Mar 2023 18:39:50 +0000 Subject: [PATCH 11/20] utilize public API for spmm, gemm --- cpp/include/raft/core/device_mdspan.hpp | 21 +-- .../distance/detail/kernels/gram_matrix.cuh | 166 ++++++------------ .../raft/sparse/linalg/detail/spmm.hpp | 147 ++++++++++++++++ cpp/include/raft/sparse/linalg/spmm.cuh | 76 ++++++++ cpp/test/distance/gram.cu | 25 ++- cpp/test/sparse/gram.cu | 24 ++- 6 files changed, 325 insertions(+), 134 deletions(-) create mode 100644 cpp/include/raft/sparse/linalg/detail/spmm.hpp create mode 100644 cpp/include/raft/sparse/linalg/spmm.cuh diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index ace7ea0f2c..c4a493503e 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -266,26 +266,27 @@ auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_col * pointer. 
* @tparam ElementType the data type of the matrix elements * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering * @param[in] ptr on device to wrap * @param[in] n_rows number of rows in pointer * @param[in] n_cols number of columns in pointer * @param[in] is_row_major whether the data is in row major format (column major otherwise) - * @param[in] ld leading dimension / stride of data + * @param[in] stride leading dimension / stride of data */ -template -auto make_device_matrix_view( - ElementType* ptr, IndexType n_rows, IndexType n_cols, bool is_row_major, IndexType ld) +template +auto make_device_strided_matrix_view(ElementType* ptr, + IndexType n_rows, + IndexType n_cols, + IndexType stride) { - IndexType stride0 = is_row_major ? (ld > 0 ? ld : n_cols) : 1; - IndexType stride1 = is_row_major ? 1 : (ld > 0 ? ld : n_rows); + constexpr auto is_row_major = std::is_same_v; + IndexType stride0 = is_row_major ? (stride > 0 ? stride : n_cols) : 1; + IndexType stride1 = is_row_major ? 1 : (stride > 0 ? stride : n_rows); assert(is_row_major ? stride0 >= n_cols : stride1 >= n_rows); - matrix_extent extents{n_rows, n_cols}; - std::array strides{stride0, stride1}; - using mapping_type = typename layout_stride::template mapping>; - mapping_type layout = {extents, strides}; + auto layout = make_strided_layout(extents, std::array{stride0, stride1}); return device_matrix_view{ptr, layout}; } diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 9cce6cf5ee..31feb75e05 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -20,8 +20,9 @@ #include #include #include -#include +//#include #include +#include #include #include @@ -315,18 +316,24 @@ class GramMatrixBase { protected: bool get_is_row_major(dense_output_matrix_view_t matrix) { - ASSERT(matrix.stride(0) == 1 || matrix.stride(1) == 1, - "GramMatrix matrix layout minor stride needs to be 1"); return (matrix.stride(1) == 1); } bool get_is_row_major(dense_input_matrix_view_t matrix) { - ASSERT(matrix.stride(0) == 1 || matrix.stride(1) == 1, - "GramMatrix matrix layout minor stride needs to be 1"); return (matrix.stride(1) == 1); } + bool get_is_col_major(dense_output_matrix_view_t matrix) + { + return (matrix.stride(0) == 1); + } + + bool get_is_col_major(dense_input_matrix_view_t matrix) + { + return (matrix.stride(0) == 1); + } + /** Calculates the Gram matrix using simple dot product between vector sets. * * out = x1 * x2 @@ -344,11 +351,10 @@ class GramMatrixBase { raft::device_resources const& handle) { // check is_row_major consistency - bool is_row_major = get_is_row_major(out); - ASSERT(is_row_major ? (x1.stride(1) == 1) : (x1.stride(0) == 1), - "GramMatrix leading dimensions for x1 and out do not match"); - ASSERT(is_row_major ? 
(x2.stride(1) == 1) : (x2.stride(0) == 1), - "GramMatrix leading dimensions for x2 and out do not match"); + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); // check dimensions int n1 = out.extent(0); @@ -366,39 +372,41 @@ class GramMatrixBase { math_t alpha = 1.0; math_t beta = 0.0; if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - handle.get_stream())); + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + handle.get_stream()); } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - handle.get_stream())); + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + handle.get_stream()); } } @@ -419,8 +427,9 @@ class GramMatrixBase { raft::device_resources const& handle) { // check is_row_major consistency - bool is_row_major = get_is_row_major(out); - ASSERT(is_row_major ? (x2.stride(1) == 1) : (x2.stride(0) == 1), + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match"); // check dimensions @@ -432,77 +441,10 @@ class GramMatrixBase { ASSERT(x2.extent(1) == x1_structure.get_n_cols(), "GramMatrix input matrix dimensions for x1 and x2 do not match"); - // extract major stride - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - math_t alpha = 1.0; math_t beta = 0.0; - cusparseSpMatDescr_t descrX1; - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsecreatecsr(&descrX1, - x1_structure.get_n_rows(), - x1_structure.get_n_cols(), - x1_structure.get_nnz(), - const_cast(x1_structure.get_indptr().data()), - const_cast(x1_structure.get_indices().data()), - const_cast(x1.get_elements().data()))); - - auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; - - cusparseDnMatDescr_t descrX2; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( - &descrX2, x2.extent(0), x2.extent(1), ld2, const_cast(x2.data_handle()), order)); - - cusparseDnMatDescr_t descrOut; - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsecreatednmat(&descrOut, - out.extent(0), - out.extent(1), - ld_out, - const_cast(out.data_handle()), - order)); - - auto alg = order == CUSPARSE_ORDER_COL ? 
CUSPARSE_SPMM_CSR_ALG1 : CUSPARSE_SPMM_CSR_ALG2; - - // compute X1*X2^T - auto opX1 = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto opX2 = CUSPARSE_OPERATION_TRANSPOSE; - - size_t bufferSize; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle.get_cusparse_handle(), - opX1, - opX2, - &alpha, - descrX1, - descrX2, - &beta, - descrOut, - alg, - &bufferSize, - handle.get_stream())); - - raft::interruptible::synchronize(handle.get_stream()); - - rmm::device_uvector tmp(bufferSize, handle.get_stream()); - - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle.get_cusparse_handle(), - opX1, - opX2, - &alpha, - descrX1, - descrX2, - &beta, - descrOut, - alg, - tmp.data(), - handle.get_stream())); - - RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descrX1)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descrX2)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descrOut)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); } /** Calculates the Gram matrix using simple dot product between vector sets. diff --git a/cpp/include/raft/sparse/linalg/detail/spmm.hpp b/cpp/include/raft/sparse/linalg/detail/spmm.hpp new file mode 100644 index 0000000000..ec5328f72e --- /dev/null +++ b/cpp/include/raft/sparse/linalg/detail/spmm.hpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft { +namespace sparse { +namespace linalg { +namespace detail { + +/** + * @brief create a cuSparse dense descriptor + * @tparam ValueType Data type of dense_view (float/double) + * @tparam IndexType Type of dense_view + * @tparam LayoutPolicy layout of dense_view + * @param[in] handle raft handle + * @param[in] dense_view input raft::device_matrix_view + * @returns dense matrix descriptor to be used by cuSparse API + */ +template +cusparseDnMatDescr_t create_descriptor( + raft::device_matrix_view& dense_view) +{ + ASSERT(dense_view.stride(0) == 1 || dense_view.stride(1) == 1, "Smallest stride needs to be 1"); + bool is_row_major = dense_view.stride(1) == 1; + auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; + IndexType ld = is_row_major ? 
dense_view.stride(0) : dense_view.stride(1); + cusparseDnMatDescr_t descr; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( + &descr, + dense_view.extent(0), + dense_view.extent(1), + ld, + const_cast*>(dense_view.data_handle()), + order)); + return descr; +} + +/** + * @brief create a cuSparse sparse descriptor + * @tparam ValueType Data type of sparse_view (float/double) + * @tparam NZType Type of sparse_view + * @param[in] handle raft handle + * @param[in] sparse_view input raft::device_csr_matrix_view of size M rows x K columns + * @returns sparse matrix descriptor to be used by cuSparse API + */ +template +cusparseSpMatDescr_t create_descriptor( + raft::device_csr_matrix_view& sparse_view) +{ + cusparseSpMatDescr_t descr; + auto csr_structure = sparse_view.get_structure(); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr( + &descr, + csr_structure.get_n_rows(), + csr_structure.get_n_cols(), + csr_structure.get_nnz(), + const_cast(csr_structure.get_indptr().data()), + const_cast(csr_structure.get_indices().data()), + const_cast*>(sparse_view.get_elements().data()))); + return descr; +} + +/** + * @brief SPMM function designed for handling all CSR * DENSE + * combinations of operand layouts for cuSparse. + * It computes the following equation: Z = alpha . X * Y + beta . Z + * where X is a CSR device matrix view and Y,Z are device matrix views + * @tparam ValueType Data type of input/output matrices (float/double) + * @tparam IndexType Type of Y and Z + * @tparam NZType Type of X + * @tparam LayoutPolicyY layout of Y + * @tparam LayoutPolicyZ layout of Z + * @param[in] handle raft handle + * @param[in] trans_x transpose operation for X + * @param[in] trans_y transpose operation for Y + * @param[in] alpha scalar + * @param[in] descr_x input sparse descriptor + * @param[in] descr_y input dense descriptor + * @param[in] beta scalar + * @param[out] descr_z output dense descriptor + */ +template +void spmm(raft::device_resources const& handle, + const bool trans_x, + const bool trans_y, + const ValueType* alpha, + cusparseSpMatDescr_t& descr_x, + cusparseDnMatDescr_t& descr_y, + const ValueType* beta, + cusparseDnMatDescr_t& descr_z) +{ + auto opX = trans_x ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + auto opY = trans_y ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + auto alg = CUSPARSE_SPMM_CSR_ALG1; + size_t bufferSize; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle.get_cusparse_handle(), + opX, + opY, + alpha, + descr_x, + descr_y, + beta, + descr_z, + alg, + &bufferSize, + handle.get_stream())); + + raft::interruptible::synchronize(handle.get_stream()); + + rmm::device_uvector tmp(bufferSize, handle.get_stream()); + + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(handle.get_cusparse_handle(), + opX, + opY, + alpha, + descr_x, + descr_y, + beta, + descr_z, + alg, + tmp.data(), + handle.get_stream())); +} + +} // end namespace detail +} // end namespace linalg +} // end namespace sparse +} // end namespace raft diff --git a/cpp/include/raft/sparse/linalg/spmm.cuh b/cpp/include/raft/sparse/linalg/spmm.cuh new file mode 100644 index 0000000000..95396309bc --- /dev/null +++ b/cpp/include/raft/sparse/linalg/spmm.cuh @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __SPMM_H +#define __SPMM_H + +#pragma once + +#include "detail/spmm.hpp" + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @brief SPMM function designed for handling all CSR * DENSE + * combinations of operand layouts for cuSparse. + * It computes the following equation: Z = alpha . X * Y + beta . Z + * where X is a CSR device matrix view and Y,Z are device matrix views + * @tparam ValueType Data type of input/output matrices (float/double) + * @tparam IndexType Type of Y and Z + * @tparam NZType Type of X + * @tparam LayoutPolicyY layout of Y + * @tparam LayoutPolicyZ layout of Z + * @param[in] handle raft handle + * @param[in] trans_x transpose operation for X + * @param[in] trans_y transpose operation for Y + * @param[in] alpha scalar + * @param[in] x input raft::device_csr_matrix_view + * @param[in] y input raft::device_matrix_view + * @param[in] beta scalar + * @param[out] z output raft::device_matrix_view + */ +template +void spmm(raft::device_resources const& handle, + const bool trans_x, + const bool trans_y, + const ValueType* alpha, + raft::device_csr_matrix_view x, + raft::device_matrix_view y, + const ValueType* beta, + raft::device_matrix_view z) +{ + auto descr_x = detail::create_descriptor(x); + auto descr_y = detail::create_descriptor(y); + auto descr_z = detail::create_descriptor(z); + + detail::spmm(handle, trans_x, trans_y, alpha, descr_x, descr_y, beta, descr_z); + + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_z)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // end namespace linalg +} // end namespace sparse +} // end namespace raft + +#endif diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index 810dbbc45b..c4277c7c98 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -110,12 +110,25 @@ class GramMatrixTest : public ::testing::TestWithParam { std::unique_ptr> kernel = std::unique_ptr>(KernelFactory::create(params.kernel)); - auto x1_span = raft::make_device_matrix_view( - x1.data(), params.n1, params.n_cols, params.is_row_major, params.ld1); - auto x2_span = raft::make_device_matrix_view( - x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); - auto out_span = raft::make_device_matrix_view( - gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? 
raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + (*kernel)(x1_span, x2_span, out_span, handle); naiveGramMatrixKernel(params.n1, diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index 22f5e3b991..cf0ddfc921 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -212,12 +212,24 @@ class GramMatrixTest : public ::testing::TestWithParam { std::unique_ptr> kernel = std::unique_ptr>(KernelFactory::create(params.kernel)); - auto x1_span = raft::make_device_matrix_view( - x1.data(), params.n1, params.n_cols, params.is_row_major, params.ld1); - auto x2_span = raft::make_device_matrix_view( - x2.data(), params.n2, params.n_cols, params.is_row_major, params.ld2); - auto out_span = raft::make_device_matrix_view( - gram.data(), params.n1, params.n2, params.is_row_major, params.ld_out); + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); if (params.sparse_input == SparseType::DENSE) { (*kernel)(x1_span, x2_span, out_span, handle); From f57be138f72361e9bf49b5f0c2133620cf9d47ed Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 30 Mar 2023 20:08:59 +0000 Subject: [PATCH 12/20] refactored rowNormCsr to utilize csr_row_op --- .../detail/kernels/kernel_matrices.cuh | 30 +++--- .../raft/sparse/linalg/detail/norm.cuh | 97 +++++++------------ cpp/include/raft/sparse/linalg/norm.cuh | 18 ++-- cpp/test/sparse/norm.cu | 2 +- 4 files changed, 60 insertions(+), 87 deletions(-) diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index cb93ee3cf8..1117165c76 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -498,9 +498,9 @@ class RBFKernel : public GramMatrixBase { { } - void matrixRowNormL2(dense_input_matrix_view_t matrix, - math_t* target, - cudaStream_t stream) + void matrixRowNormL2(raft::device_resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target) { bool is_row_major = GramMatrixBase::get_is_row_major(matrix); int minor = is_row_major ? 
matrix.extent(1) : matrix.extent(0); @@ -512,19 +512,21 @@ class RBFKernel : public GramMatrixBase { matrix.extent(0), raft::linalg::NormType::L2Norm, is_row_major, - stream); + handle.get_stream()); } - void matrixRowNormL2(csr_input_matrix_view_t matrix, math_t* target, cudaStream_t stream) + void matrixRowNormL2(raft::device_resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target) { auto matrix_structure = matrix.get_structure(); - raft::sparse::linalg::rowNormCsr(target, + raft::sparse::linalg::rowNormCsr(handle, matrix_structure.get_indptr().data(), matrix.get_elements().data(), matrix_structure.get_nnz(), matrix_structure.get_n_rows(), - raft::linalg::NormType::L2Norm, - stream); + target, + raft::linalg::NormType::L2Norm); } /** Evaluate kernel matrix using RBF kernel. @@ -555,12 +557,12 @@ class RBFKernel : public GramMatrixBase { if (norm_x1 == nullptr) { tmp_norm_x1.reserve(x1.extent(0), stream); norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(x1, norm_x1, stream); + matrixRowNormL2(handle, x1, norm_x1); } if (norm_x2 == nullptr) { tmp_norm_x2.reserve(x2.extent(0), stream); norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(x2, norm_x2, stream); + matrixRowNormL2(handle, x2, norm_x2); } // compute L2expanded @@ -605,12 +607,12 @@ class RBFKernel : public GramMatrixBase { if (norm_x1 == nullptr) { tmp_norm_x1.reserve(x1.get_structure().get_n_rows(), stream); norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(x1, norm_x1, stream); + matrixRowNormL2(handle, x1, norm_x1); } if (norm_x2 == nullptr) { tmp_norm_x2.reserve(x2.extent(0), stream); norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(x2, norm_x2, stream); + matrixRowNormL2(handle, x2, norm_x2); } // compute L2expanded @@ -655,12 +657,12 @@ class RBFKernel : public GramMatrixBase { if (norm_x1 == nullptr) { tmp_norm_x1.reserve(x1.get_structure().get_n_rows(), stream); norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(x1, norm_x1, stream); + matrixRowNormL2(handle, x1, norm_x1); } if (norm_x2 == nullptr) { tmp_norm_x2.reserve(x2.get_structure().get_n_rows(), stream); norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(x2, norm_x2, stream); + matrixRowNormL2(handle, x2, norm_x2); } // compute L2expanded diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index 5af7749c39..1e66af3d10 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -24,6 +24,8 @@ #include #include +#include + #include #include @@ -173,88 +175,57 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) RAFT_CUDA_TRY(cudaGetLastError()); } -template -struct CsrReductionPolicy { - static constexpr int LogicalWarpSize = warpSize; - static constexpr int RowsPerBlock = rpb; - static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; -}; - -template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) - csrReductionKernel(Type* norm, - const IdxType* ia, - const Type* data, - IdxType N, - Type init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op) -{ - IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); - if (i >= N) return; - - Type acc = init; - for (IdxType j = ia[i] + threadIdx.x; j < ia[i + 1]; j += Policy::LogicalWarpSize) { - acc = reduce_op(acc, main_op(data[j])); - } - acc = raft::logicalWarpReduce(acc, reduce_op); - if (threadIdx.x == 0) { norm[i] = final_op(acc); } -} - -template -void csrReduction(Type* norm, - const IdxType* ia, - const 
Type* data, - IdxType N, - Type init, - cudaStream_t stream, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) +void csr_row_op_wrapper(const IdxType* ia, + const Type* data, + IdxType nnz, + IdxType N, + Type init, + Type* norm, + cudaStream_t stream, + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) { - common::nvtx::range fun_scope( - "csrReduction<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); - dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); - dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); - csrReductionKernel - <<>>(norm, ia, data, N, init, main_op, reduce_op, final_op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + op::csr_row_op( + ia, + N, + nnz, + [data, init, norm, main_op, reduce_op, final_op] __device__( + IdxType row, IdxType start_idx, IdxType stop_idx) { + norm[row] = init; + for (IdxType i = start_idx; i < stop_idx; i++) + norm[row] = final_op(reduce_op(norm[row], main_op(data[i]))); + }, + stream); } template -void rowNormCsrCaller(Type* norm, - const IdxType* ia, +void rowNormCsrCaller(const IdxType* ia, const Type* data, IdxType nnz, IdxType N, + Type* norm, raft::linalg::NormType type, - cudaStream_t stream, - Lambda fin_op) + Lambda fin_op, + cudaStream_t stream) { - // TODO: dispatch nnz to Policy? switch (type) { case raft::linalg::NormType::L1Norm: - csrReduction>( - norm, ia, data, N, (Type)0, stream, raft::abs_op(), raft::add_op(), fin_op); + csr_row_op_wrapper( + ia, data, nnz, N, (Type)0, norm, stream, raft::abs_op(), raft::add_op(), fin_op); break; case raft::linalg::NormType::L2Norm: - csrReduction>( - norm, ia, data, N, (Type)0, stream, raft::sq_op(), raft::add_op(), fin_op); + csr_row_op_wrapper( + ia, data, nnz, N, (Type)0, norm, stream, raft::sq_op(), raft::add_op(), fin_op); break; case raft::linalg::NormType::LinfNorm: - csrReduction>( - norm, ia, data, N, (Type)0, stream, raft::abs_op(), raft::max_op(), fin_op); + csr_row_op_wrapper( + ia, data, nnz, N, (Type)0, norm, stream, raft::abs_op(), raft::max_op(), fin_op); break; default: THROW("Unsupported norm type: %d", type); }; diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index 6f01569a98..6ddaca0cd6 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -77,25 +77,25 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) * @tparam Type the data type * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing - * @param norm the output vector of row-wise norm, size [N] + * @param stream cuda stream where to launch work * @param ia the input matrix row index array * @param data the input matrix nnz data - * @param N number of rows of data + * @param nnz number of elements in data + * @param N number of rows + * @param norm the output vector of row-wise norm, size [N] * @param type the type of norm to be applied - * @param stream cuda stream where to launch work - * @param fin_op the final lambda op */ template -void rowNormCsr(Type* norm, +void rowNormCsr(raft::device_resources const& handle, const IdxType* ia, const Type* data, - IdxType nnz, - IdxType N, + const IdxType nnz, + const IdxType N, + Type* norm, raft::linalg::NormType type, - cudaStream_t stream, Lambda fin_op = raft::identity_op()) { - detail::rowNormCsrCaller(norm, ia, data, nnz, N, type, stream, 
fin_op); + detail::rowNormCsrCaller(ia, data, nnz, N, norm, type, fin_op, handle.get_stream()); } }; // end NAMESPACE linalg diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index f1328fa52d..65d857652c 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -62,7 +62,7 @@ class CSRRowNormTest : public ::testing::TestWithParam Date: Thu, 30 Mar 2023 20:47:29 +0000 Subject: [PATCH 13/20] changed order of arguments according to best practice --- .../distance/detail/kernels/gram_matrix.cuh | 72 +++++++++--------- .../detail/kernels/kernel_matrices.cuh | 74 +++++++++---------- cpp/test/distance/gram.cu | 2 +- cpp/test/sparse/gram.cu | 6 +- 4 files changed, 77 insertions(+), 77 deletions(-) diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 31feb75e05..f03f746161 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -64,118 +64,118 @@ class GramMatrixBase { /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format * + * @param [in] handle raft handle * @param [in] x1 dense device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ - void operator()(dense_input_matrix_view_t x1, + void operator()(raft::device_resources const& handle, + dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1 = nullptr, math_t* norm_x2 = nullptr) { - evaluate(x1, x2, out, handle, norm_x1, norm_x2); + evaluate(handle, x1, x2, out, norm_x1, norm_x2); } /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ - void operator()(csr_input_matrix_view_t x1, + void operator()(raft::device_resources const& handle, + csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1 = nullptr, math_t* norm_x2 = nullptr) { - evaluate(x1, x2, out, handle, norm_x1, norm_x2); + evaluate(handle, x1, x2, out, norm_x1, norm_x2); } /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 csr device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
*/ - void operator()(csr_input_matrix_view_t x1, + void operator()(raft::device_resources const& handle, + csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1 = nullptr, math_t* norm_x2 = nullptr) { - evaluate(x1, x2, out, handle, norm_x1, norm_x2); + evaluate(handle, x1, x2, out, norm_x1, norm_x2); } // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual /** Evaluate the Gram matrix for two vector sets using simple dot product. * + * @param [in] handle raft handle * @param [in] x1 dense device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - virtual void evaluate(dense_input_matrix_view_t x1, + virtual void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { - linear(x1, x2, out, handle); + linear(handle, x1, x2, out); } /** Evaluate the Gram matrix for two vector sets using simple dot product. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - virtual void evaluate(csr_input_matrix_view_t x1, + virtual void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { - linear(x1, x2, out, handle); + linear(handle, x1, x2, out); } /** Evaluate the Gram matrix for two vector sets using simple dot product. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 csr device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - virtual void evaluate(csr_input_matrix_view_t x1, + virtual void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { - linear(x1, x2, out, handle); + linear(handle, x1, x2, out); } /** Evaluate the Gram matrix for two vector sets using simple dot product. @@ -340,15 +340,15 @@ class GramMatrixBase { * * Can be used as a building block for more complex kernel functions. 
* + * @param [in] handle raft handle * @param [in] x1 dense device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle */ - void linear(dense_input_matrix_view_t x1, + void linear(raft::device_resources const& handle, + dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - raft::device_resources const& handle) + dense_output_matrix_view_t out) { // check is_row_major consistency bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); @@ -416,15 +416,15 @@ class GramMatrixBase { * * Can be used as a building block for more complex kernel functions. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle */ - void linear(csr_input_matrix_view_t x1, + void linear(raft::device_resources const& handle, + csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - raft::device_resources const& handle) + dense_output_matrix_view_t out) { // check is_row_major consistency bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); @@ -453,15 +453,15 @@ class GramMatrixBase { * * Can be used as a building block for more complex kernel functions. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 csr device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle */ - void linear(csr_input_matrix_view_t x1, + void linear(raft::device_resources const& handle, + csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - raft::device_resources const& handle) + dense_output_matrix_view_t out) { // check is_row_major consistency bool is_row_major = get_is_row_major(out); diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 1117165c76..785c66a3a2 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -188,23 +188,23 @@ class PolynomialKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * + * @param [in] handle raft handle * @param [in] x1 dense device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - void evaluate(dense_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel( out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } @@ -215,23 +215,23 @@ class PolynomialKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - void evaluate(csr_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel( out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } @@ -242,23 +242,23 @@ class PolynomialKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 csr device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - void evaluate(csr_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel( out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } @@ -346,23 +346,23 @@ class TanhKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * + * @param [in] handle raft handle * @param [in] x1 dense device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - void evaluate(dense_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel( out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } @@ -373,23 +373,23 @@ class TanhKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - void evaluate(csr_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel( out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } @@ -400,23 +400,23 @@ class TanhKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 csr device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 unused. * @param norm_x2 unused. */ - void evaluate(csr_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel( out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } @@ -535,17 +535,17 @@ class RBFKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and | | euclidean distance. * - * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ - void evaluate(dense_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { @@ -568,7 +568,7 @@ class RBFKernel : public GramMatrixBase { // compute L2expanded bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel(out.data_handle(), ld_out, out.extent(0), @@ -585,17 +585,17 @@ class RBFKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and | | euclidean distance. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 dense device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ - void evaluate(csr_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { @@ -618,7 +618,7 @@ class RBFKernel : public GramMatrixBase { // compute L2expanded bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel(out.data_handle(), ld_out, out.extent(0), @@ -635,17 +635,17 @@ class RBFKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and | | euclidean distance. * + * @param [in] handle raft handle * @param [in] x1 csr device matrix view, size [n1*n_cols] * @param [in] x2 csr device matrix view, size [n2*n_cols] * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param [in] handle raft handle * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. */ - void evaluate(csr_input_matrix_view_t x1, + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, dense_output_matrix_view_t out, - raft::device_resources const& handle, math_t* norm_x1, math_t* norm_x2) { @@ -668,7 +668,7 @@ class RBFKernel : public GramMatrixBase { // compute L2expanded bool is_row_major = GramMatrixBase::get_is_row_major(out); int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - GramMatrixBase::linear(x1, x2, out, handle); + GramMatrixBase::linear(handle, x1, x2, out); applyKernel(out.data_handle(), ld_out, out.extent(0), diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index c4277c7c98..47da201465 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -129,7 +129,7 @@ class GramMatrixTest : public ::testing::TestWithParam { : raft::make_device_strided_matrix_view( gram.data(), params.n1, params.n2, params.ld_out); - (*kernel)(x1_span, x2_span, out_span, handle); + (*kernel)(handle, x1_span, x2_span, out_span); naiveGramMatrixKernel(params.n1, params.n2, diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index cf0ddfc921..e0bfb94f94 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -232,7 +232,7 @@ class GramMatrixTest : public ::testing::TestWithParam { gram.data(), params.n1, params.n2, params.ld_out); if (params.sparse_input == SparseType::DENSE) { - (*kernel)(x1_span, x2_span, out_span, handle); + (*kernel)(handle, x1_span, x2_span, out_span); } else { x1_csr_indptr.reserve(params.n1 + 1, stream); x1_csr_indices.reserve(params.n1 * params.n_cols, stream); @@ -252,7 +252,7 @@ class GramMatrixTest : public ::testing::TestWithParam { x1_csr_structure); if (params.sparse_input == SparseType::MIX) { - (*kernel)(x1_csr, x2_span, out_span, handle); + (*kernel)(handle, x1_csr, x2_span, out_span); } else { x2_csr_indptr.reserve(params.n2 + 1, stream); x2_csr_indices.reserve(params.n2 * params.n_cols, stream); @@ -270,7 +270,7 @@ class GramMatrixTest : public ::testing::TestWithParam { raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()), x2_csr_structure); - (*kernel)(x1_csr, x2_csr, out_span, handle); + (*kernel)(handle, x1_csr, x2_csr, out_span); } } From 2b6090a860e6fe36c6c63beb50939bceca13d6f2 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 30 Mar 2023 22:18:31 +0000 Subject: [PATCH 14/20] moved kernel computation to public section --- cpp/CMakeLists.txt | 17 +++++----- cpp/include/raft/distance/kernels.cuh | 12 ++----- .../{detail => }/kernels/gram_matrix.cuh | 10 +++--- .../{detail => }/kernels/kernel_factory.cuh | 4 +-- .../{detail => }/kernels/kernel_matrices.cuh | 4 +-- .../specializations/detail/kernels.cuh | 31 ------------------- .../distance/specializations/distance.cuh | 2 +- .../raft/distance/specializations/kernels.cuh | 30 ++++++++++++++++++ .../kernels/gram_matrix_base_double.cu | 4 +-- .../kernels/gram_matrix_base_float.cu | 4 +-- .../kernels/polynomial_kernel_double_int.cu | 4 +-- .../kernels/polynomial_kernel_float_int.cu | 4 +-- .../{detail => }/kernels/rbf_kernel_double.cu | 4 +-- .../{detail => }/kernels/rbf_kernel_float.cu | 4 +-- .../kernels/tanh_kernel_double.cu | 4 +-- .../{detail => }/kernels/tanh_kernel_float.cu | 4 +-- 16 files changed, 65 insertions(+), 77 deletions(-) rename cpp/include/raft/distance/{detail => }/kernels/gram_matrix.cuh (99%) rename cpp/include/raft/distance/{detail => }/kernels/kernel_factory.cuh (95%) rename cpp/include/raft/distance/{detail => }/kernels/kernel_matrices.cuh (99%) delete mode 100644 cpp/include/raft/distance/specializations/detail/kernels.cuh create mode 100644 cpp/include/raft/distance/specializations/kernels.cuh rename cpp/src/distance/specializations/{detail => }/kernels/gram_matrix_base_double.cu (83%) rename cpp/src/distance/specializations/{detail => }/kernels/gram_matrix_base_float.cu (83%) rename cpp/src/distance/specializations/{detail => 
}/kernels/polynomial_kernel_double_int.cu (82%) rename cpp/src/distance/specializations/{detail => }/kernels/polynomial_kernel_float_int.cu (82%) rename cpp/src/distance/specializations/{detail => }/kernels/rbf_kernel_double.cu (83%) rename cpp/src/distance/specializations/{detail => }/kernels/rbf_kernel_float.cu (84%) rename cpp/src/distance/specializations/{detail => }/kernels/tanh_kernel_double.cu (83%) rename cpp/src/distance/specializations/{detail => }/kernels/tanh_kernel_float.cu (83%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bb458c44a..bb771d5e26 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -304,16 +304,7 @@ if(RAFT_COMPILE_LIBRARY) src/distance/specializations/detail/inner_product_double_double_double_int.cu src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu - src/distance/specializations/detail/kernels/gram_matrix_base_double.cu - src/distance/specializations/detail/kernels/gram_matrix_base_float.cu - src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu - src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu - # These are somehow missing a kernel definition which is causing a compile error. - # src/distance/specializations/detail/kernels/rbf_kernel_double.cu - # src/distance/specializations/detail/kernels/rbf_kernel_float.cu src/neighbors/brute_force_knn_int64_t_float.cu - src/distance/specializations/detail/kernels/tanh_kernel_double.cu - src/distance/specializations/detail/kernels/tanh_kernel_float.cu src/distance/specializations/detail/kl_divergence_float_float_float_int.cu src/distance/specializations/detail/kl_divergence_double_double_double_int.cu src/distance/specializations/detail/l1_float_float_float_int.cu @@ -332,6 +323,14 @@ if(RAFT_COMPILE_LIBRARY) src/distance/specializations/fused_l2_nn_double_int64.cu src/distance/specializations/fused_l2_nn_float_int.cu src/distance/specializations/fused_l2_nn_float_int64.cu + src/distance/specializations/kernels/gram_matrix_base_double.cu + src/distance/specializations/kernels/gram_matrix_base_float.cu + src/distance/specializations/kernels/polynomial_kernel_double_int.cu + src/distance/specializations/kernels/polynomial_kernel_float_int.cu + src/distance/specializations/kernels/rbf_kernel_double.cu + src/distance/specializations/kernels/rbf_kernel_float.cu + src/distance/specializations/kernels/tanh_kernel_double.cu + src/distance/specializations/kernels/tanh_kernel_float.cu src/matrix/specializations/detail/select_k_float_uint32_t.cu src/matrix/specializations/detail/select_k_float_int64_t.cu src/matrix/specializations/detail/select_k_half_uint32_t.cu diff --git a/cpp/include/raft/distance/kernels.cuh b/cpp/include/raft/distance/kernels.cuh index 86f9f82406..86a2107f82 100644 --- a/cpp/include/raft/distance/kernels.cuh +++ b/cpp/include/raft/distance/kernels.cuh @@ -16,17 +16,9 @@ #pragma once -#include -#include +#include +#include #include #include #include - -namespace raft::distance::kernels { - -// TODO: Need to expose formal APIs for this that are more consistent w/ other APIs in RAFT -using raft::distance::kernels::detail::GramMatrixBase; -using raft::distance::kernels::detail::KernelFactory; - -}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/kernels/gram_matrix.cuh similarity index 99% rename from 
cpp/include/raft/distance/detail/kernels/gram_matrix.cuh rename to cpp/include/raft/distance/kernels/gram_matrix.cuh index f03f746161..bdd02be1b1 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/kernels/gram_matrix.cuh @@ -20,14 +20,12 @@ #include #include #include -//#include -#include -#include - #include #include +#include +#include -namespace raft::distance::kernels::detail { +namespace raft::distance::kernels { template using dense_input_matrix_view_t = raft::device_matrix_view; @@ -507,4 +505,4 @@ class GramMatrixBase { } }; -}; // end namespace raft::distance::kernels::detail +}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/kernels/kernel_factory.cuh similarity index 95% rename from cpp/include/raft/distance/detail/kernels/kernel_factory.cuh rename to cpp/include/raft/distance/kernels/kernel_factory.cuh index 7c74e231d7..9999b29d85 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/kernels/kernel_factory.cuh @@ -21,7 +21,7 @@ #include #include -namespace raft::distance::kernels::detail { +namespace raft::distance::kernels { template class KernelFactory { @@ -61,4 +61,4 @@ class KernelFactory { } }; -}; // end namespace raft::distance::kernels::detail +}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/kernels/kernel_matrices.cuh similarity index 99% rename from cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh rename to cpp/include/raft/distance/kernels/kernel_matrices.cuh index 785c66a3a2..5bf011bd7a 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/kernels/kernel_matrices.cuh @@ -23,7 +23,7 @@ #include #include -namespace raft::distance::kernels::detail { +namespace raft::distance::kernels { /** Epiloge function for polynomial kernel without padding. * Calculates output = (gain*in + offset)^exponent @@ -738,4 +738,4 @@ class RBFKernel : public GramMatrixBase { } }; -}; // end namespace raft::distance::kernels::detail +}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh deleted file mode 100644 index 75c9c023e8..0000000000 --- a/cpp/include/raft/distance/specializations/detail/kernels.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -extern template class raft::distance::kernels::detail::GramMatrixBase; -extern template class raft::distance::kernels::detail::GramMatrixBase; - -extern template class raft::distance::kernels::detail::PolynomialKernel; -extern template class raft::distance::kernels::detail::PolynomialKernel; - -extern template class raft::distance::kernels::detail::TanhKernel; -extern template class raft::distance::kernels::detail::TanhKernel; - -// These are somehow missing a kernel definition which is causing a compile error -// extern template class raft::distance::kernels::detail::RBFKernel; -// extern template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index a34f696e9e..c2324a24cd 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -32,3 +31,4 @@ #include #include #include +#include diff --git a/cpp/include/raft/distance/specializations/kernels.cuh b/cpp/include/raft/distance/specializations/kernels.cuh new file mode 100644 index 0000000000..f213aeaf9a --- /dev/null +++ b/cpp/include/raft/distance/specializations/kernels.cuh @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +extern template class raft::distance::kernels::GramMatrixBase; +extern template class raft::distance::kernels::GramMatrixBase; + +extern template class raft::distance::kernels::PolynomialKernel; +extern template class raft::distance::kernels::PolynomialKernel; + +extern template class raft::distance::kernels::TanhKernel; +extern template class raft::distance::kernels::TanhKernel; + +extern template class raft::distance::kernels::RBFKernel; +extern template class raft::distance::kernels::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/kernels/gram_matrix_base_double.cu similarity index 83% rename from cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu rename to cpp/src/distance/specializations/kernels/gram_matrix_base_double.cu index 7c80eb29d0..c86bb2796f 100644 --- a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu +++ b/cpp/src/distance/specializations/kernels/gram_matrix_base_double.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include -template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file +template class raft::distance::kernels::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/kernels/gram_matrix_base_float.cu similarity index 83% rename from cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu rename to cpp/src/distance/specializations/kernels/gram_matrix_base_float.cu index d777e73dc9..6c160f7e9a 100644 --- a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu +++ b/cpp/src/distance/specializations/kernels/gram_matrix_base_float.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file +template class raft::distance::kernels::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/kernels/polynomial_kernel_double_int.cu similarity index 82% rename from cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu rename to cpp/src/distance/specializations/kernels/polynomial_kernel_double_int.cu index 28306d0c21..ae08ae9fef 100644 --- a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu +++ b/cpp/src/distance/specializations/kernels/polynomial_kernel_double_int.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file +template class raft::distance::kernels::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/kernels/polynomial_kernel_float_int.cu similarity index 82% rename from cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu rename to cpp/src/distance/specializations/kernels/polynomial_kernel_float_int.cu index 6609de69ac..7bcbe645e9 100644 --- a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu +++ b/cpp/src/distance/specializations/kernels/polynomial_kernel_float_int.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file +template class raft::distance::kernels::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/kernels/rbf_kernel_double.cu similarity index 83% rename from cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu rename to cpp/src/distance/specializations/kernels/rbf_kernel_double.cu index 7ea4b60e09..411c4b879f 100644 --- a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu +++ b/cpp/src/distance/specializations/kernels/rbf_kernel_double.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include -template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file +template class raft::distance::kernels::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/kernels/rbf_kernel_float.cu similarity index 84% rename from cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu rename to cpp/src/distance/specializations/kernels/rbf_kernel_float.cu index 423613dcd1..0a1ed92f4e 100644 --- a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu +++ b/cpp/src/distance/specializations/kernels/rbf_kernel_float.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file +template class raft::distance::kernels::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/kernels/tanh_kernel_double.cu similarity index 83% rename from cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu rename to cpp/src/distance/specializations/kernels/tanh_kernel_double.cu index ab818db73b..7b58343367 100644 --- a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu +++ b/cpp/src/distance/specializations/kernels/tanh_kernel_double.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::detail::TanhKernel; \ No newline at end of file +template class raft::distance::kernels::TanhKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/kernels/tanh_kernel_float.cu similarity index 83% rename from cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu rename to cpp/src/distance/specializations/kernels/tanh_kernel_float.cu index f7825e577a..8cc73bb81f 100644 --- a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu +++ b/cpp/src/distance/specializations/kernels/tanh_kernel_float.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include -template class raft::distance::kernels::detail::TanhKernel; \ No newline at end of file +template class raft::distance::kernels::TanhKernel; \ No newline at end of file From 563032c7f193e3e29906accb58c76658d33eaab4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 5 Apr 2023 06:41:18 -0700 Subject: [PATCH 15/20] removed outdated docstring --- cpp/include/raft/core/device_mdspan.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index c4a493503e..1b9992212e 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -270,7 +270,6 @@ auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_col * @param[in] ptr on device to wrap * @param[in] n_rows number of rows in pointer * @param[in] n_cols number of columns in pointer - * @param[in] is_row_major whether the data is in row major format (column major otherwise) * @param[in] stride leading dimension / stride of data */ template From 23e308da27eebc3a6a5c073ee9c4b99c6e41714c Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 5 Apr 2023 15:07:10 +0000 Subject: [PATCH 16/20] fix row-major algorithm selection for cusparse spmm --- .../raft/distance/kernels/kernel_matrices.cuh | 2 +- .../raft/sparse/linalg/detail/spmm.hpp | 35 ++++++++++++++----- cpp/include/raft/sparse/linalg/spmm.cuh | 8 +++-- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/distance/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/kernels/kernel_matrices.cuh index 5bf011bd7a..592406876d 100644 --- a/cpp/include/raft/distance/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/kernels/kernel_matrices.cuh @@ -106,7 +106,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); * * Intended usage - * - input is the product of two matrices X and Y input_ij = \sum_k X_ik * Y_jk + * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y * diff --git a/cpp/include/raft/sparse/linalg/detail/spmm.hpp b/cpp/include/raft/sparse/linalg/detail/spmm.hpp index ec5328f72e..75ed3d135b 100644 --- a/cpp/include/raft/sparse/linalg/detail/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/detail/spmm.hpp @@ -26,23 +26,41 @@ namespace sparse { namespace linalg { namespace detail { +/** + * @brief determine common data layout for both dense matrices + * @tparam ValueType Data type of Y,Z (float/double) + * @tparam IndexType Type of Y,Z + * @tparam LayoutPolicyY layout of Y + * @tparam LayoutPolicyZ layout of Z + * @param[in] x input raft::device_matrix_view + * @param[in] y input raft::device_matrix_view + * @returns dense matrix descriptor to be used by cuSparse API + */ +template +bool is_row_major(raft::device_matrix_view& y, + raft::device_matrix_view& z) +{ + bool is_row_major = z.stride(1) == 1 && y.stride(1) == 1; + bool is_col_major = z.stride(0) == 1 && y.stride(0) == 1; + ASSERT(is_row_major || is_col_major, "Both matrices need to be either row or col major"); + return is_row_major; +} + /** * @brief create a cuSparse dense descriptor * @tparam ValueType Data type of dense_view (float/double) * @tparam IndexType Type of dense_view * @tparam LayoutPolicy layout of dense_view - * @param[in] handle raft handle * 
@param[in] dense_view input raft::device_matrix_view + * @param[in] is_row_major data layout of raft::device_matrix_view * @returns dense matrix descriptor to be used by cuSparse API */ template cusparseDnMatDescr_t create_descriptor( - raft::device_matrix_view& dense_view) + raft::device_matrix_view& dense_view, const bool is_row_major) { - ASSERT(dense_view.stride(0) == 1 || dense_view.stride(1) == 1, "Smallest stride needs to be 1"); - bool is_row_major = dense_view.stride(1) == 1; - auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; - IndexType ld = is_row_major ? dense_view.stride(0) : dense_view.stride(1); + auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; + IndexType ld = is_row_major ? dense_view.stride(0) : dense_view.stride(1); cusparseDnMatDescr_t descr; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednmat( &descr, @@ -58,7 +76,6 @@ cusparseDnMatDescr_t create_descriptor( * @brief create a cuSparse sparse descriptor * @tparam ValueType Data type of sparse_view (float/double) * @tparam NZType Type of sparse_view - * @param[in] handle raft handle * @param[in] sparse_view input raft::device_csr_matrix_view of size M rows x K columns * @returns sparse matrix descriptor to be used by cuSparse API */ @@ -92,6 +109,7 @@ cusparseSpMatDescr_t create_descriptor( * @param[in] handle raft handle * @param[in] trans_x transpose operation for X * @param[in] trans_y transpose operation for Y + * @param[in] is_row_major data layout of Y,Z * @param[in] alpha scalar * @param[in] descr_x input sparse descriptor * @param[in] descr_y input dense descriptor @@ -102,6 +120,7 @@ template void spmm(raft::device_resources const& handle, const bool trans_x, const bool trans_y, + const bool is_row_major, const ValueType* alpha, cusparseSpMatDescr_t& descr_x, cusparseDnMatDescr_t& descr_y, @@ -110,7 +129,7 @@ void spmm(raft::device_resources const& handle, { auto opX = trans_x ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; auto opY = trans_y ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - auto alg = CUSPARSE_SPMM_CSR_ALG1; + auto alg = is_row_major ? 
CUSPARSE_SPMM_CSR_ALG2 : CUSPARSE_SPMM_CSR_ALG1; size_t bufferSize; RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize(handle.get_cusparse_handle(), opX, diff --git a/cpp/include/raft/sparse/linalg/spmm.cuh b/cpp/include/raft/sparse/linalg/spmm.cuh index 95396309bc..73170cfc70 100644 --- a/cpp/include/raft/sparse/linalg/spmm.cuh +++ b/cpp/include/raft/sparse/linalg/spmm.cuh @@ -57,11 +57,13 @@ void spmm(raft::device_resources const& handle, const ValueType* beta, raft::device_matrix_view z) { + bool is_row_major = detail::is_row_major(y, z); + auto descr_x = detail::create_descriptor(x); - auto descr_y = detail::create_descriptor(y); - auto descr_z = detail::create_descriptor(z); + auto descr_y = detail::create_descriptor(y, is_row_major); + auto descr_z = detail::create_descriptor(z, is_row_major); - detail::spmm(handle, trans_x, trans_y, alpha, descr_x, descr_y, beta, descr_z); + detail::spmm(handle, trans_x, trans_y, is_row_major, alpha, descr_x, descr_y, beta, descr_z); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x)); RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y)); From a5ee783341160e663cac502824ecb0014558051c Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 5 Apr 2023 19:48:29 +0000 Subject: [PATCH 17/20] fixed doc build --- cpp/include/raft/sparse/linalg/norm.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index 6ddaca0cd6..95831f395e 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -77,13 +77,14 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) * @tparam Type the data type * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing - * @param stream cuda stream where to launch work + * @param handle raft handle * @param ia the input matrix row index array * @param data the input matrix nnz data * @param nnz number of elements in data * @param N number of rows * @param norm the output vector of row-wise norm, size [N] * @param type the type of norm to be applied + * @param fin_op the final lambda op */ template void rowNormCsr(raft::device_resources const& handle, From d7d2f5b7c24bfbcd84d8ed7373adc20a3fb824fe Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 18 Apr 2023 14:56:37 -0700 Subject: [PATCH 18/20] reverted changeset 2b6090a860e6fe36c6c63beb50939bceca13d6f2 --- cpp/CMakeLists.txt | 17 +++++----- .../{ => detail}/kernels/gram_matrix.cuh | 10 +++--- .../{ => detail}/kernels/kernel_factory.cuh | 4 +-- .../{ => detail}/kernels/kernel_matrices.cuh | 4 +-- cpp/include/raft/distance/kernels.cuh | 12 +++++-- .../specializations/detail/kernels.cuh | 31 +++++++++++++++++++ .../distance/specializations/distance.cuh | 2 +- .../raft/distance/specializations/kernels.cuh | 30 ------------------ .../kernels/gram_matrix_base_double.cu | 4 +-- .../kernels/gram_matrix_base_float.cu | 4 +-- .../kernels/polynomial_kernel_double_int.cu | 4 +-- .../kernels/polynomial_kernel_float_int.cu | 4 +-- .../{ => detail}/kernels/rbf_kernel_double.cu | 4 +-- .../{ => detail}/kernels/rbf_kernel_float.cu | 4 +-- .../kernels/tanh_kernel_double.cu | 4 +-- .../{ => detail}/kernels/tanh_kernel_float.cu | 4 +-- 16 files changed, 77 insertions(+), 65 deletions(-) rename cpp/include/raft/distance/{ => detail}/kernels/gram_matrix.cuh (99%) rename cpp/include/raft/distance/{ => detail}/kernels/kernel_factory.cuh (95%) rename cpp/include/raft/distance/{ 
=> detail}/kernels/kernel_matrices.cuh (99%) create mode 100644 cpp/include/raft/distance/specializations/detail/kernels.cuh delete mode 100644 cpp/include/raft/distance/specializations/kernels.cuh rename cpp/src/distance/specializations/{ => detail}/kernels/gram_matrix_base_double.cu (83%) rename cpp/src/distance/specializations/{ => detail}/kernels/gram_matrix_base_float.cu (83%) rename cpp/src/distance/specializations/{ => detail}/kernels/polynomial_kernel_double_int.cu (82%) rename cpp/src/distance/specializations/{ => detail}/kernels/polynomial_kernel_float_int.cu (82%) rename cpp/src/distance/specializations/{ => detail}/kernels/rbf_kernel_double.cu (83%) rename cpp/src/distance/specializations/{ => detail}/kernels/rbf_kernel_float.cu (84%) rename cpp/src/distance/specializations/{ => detail}/kernels/tanh_kernel_double.cu (83%) rename cpp/src/distance/specializations/{ => detail}/kernels/tanh_kernel_float.cu (83%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cafa981ad6..144f58c4d6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -302,7 +302,16 @@ if(RAFT_COMPILE_LIBRARY) src/distance/specializations/detail/inner_product_double_double_double_int.cu src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu + src/distance/specializations/detail/kernels/gram_matrix_base_double.cu + src/distance/specializations/detail/kernels/gram_matrix_base_float.cu + src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu + src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu + # These are somehow missing a kernel definition which is causing a compile error. + # src/distance/specializations/detail/kernels/rbf_kernel_double.cu + # src/distance/specializations/detail/kernels/rbf_kernel_float.cu src/neighbors/brute_force_knn_int64_t_float.cu + src/distance/specializations/detail/kernels/tanh_kernel_double.cu + src/distance/specializations/detail/kernels/tanh_kernel_float.cu src/distance/specializations/detail/kl_divergence_float_float_float_int.cu src/distance/specializations/detail/kl_divergence_double_double_double_int.cu src/distance/specializations/detail/l1_float_float_float_int.cu @@ -321,14 +330,6 @@ if(RAFT_COMPILE_LIBRARY) src/distance/specializations/fused_l2_nn_double_int64.cu src/distance/specializations/fused_l2_nn_float_int.cu src/distance/specializations/fused_l2_nn_float_int64.cu - src/distance/specializations/kernels/gram_matrix_base_double.cu - src/distance/specializations/kernels/gram_matrix_base_float.cu - src/distance/specializations/kernels/polynomial_kernel_double_int.cu - src/distance/specializations/kernels/polynomial_kernel_float_int.cu - src/distance/specializations/kernels/rbf_kernel_double.cu - src/distance/specializations/kernels/rbf_kernel_float.cu - src/distance/specializations/kernels/tanh_kernel_double.cu - src/distance/specializations/kernels/tanh_kernel_float.cu src/matrix/specializations/detail/select_k_float_uint32_t.cu src/matrix/specializations/detail/select_k_float_int64_t.cu src/matrix/specializations/detail/select_k_half_uint32_t.cu diff --git a/cpp/include/raft/distance/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh similarity index 99% rename from cpp/include/raft/distance/kernels/gram_matrix.cuh rename to cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index bdd02be1b1..f03f746161 100644 --- a/cpp/include/raft/distance/kernels/gram_matrix.cuh +++ 
b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -20,12 +20,14 @@ #include #include #include -#include -#include +//#include #include #include -namespace raft::distance::kernels { +#include +#include + +namespace raft::distance::kernels::detail { template using dense_input_matrix_view_t = raft::device_matrix_view; @@ -505,4 +507,4 @@ class GramMatrixBase { } }; -}; // end namespace raft::distance::kernels +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh similarity index 95% rename from cpp/include/raft/distance/kernels/kernel_factory.cuh rename to cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index 9999b29d85..7c74e231d7 100644 --- a/cpp/include/raft/distance/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -21,7 +21,7 @@ #include #include -namespace raft::distance::kernels { +namespace raft::distance::kernels::detail { template class KernelFactory { @@ -61,4 +61,4 @@ class KernelFactory { } }; -}; // end namespace raft::distance::kernels +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh similarity index 99% rename from cpp/include/raft/distance/kernels/kernel_matrices.cuh rename to cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 592406876d..20893dfce9 100644 --- a/cpp/include/raft/distance/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -23,7 +23,7 @@ #include #include -namespace raft::distance::kernels { +namespace raft::distance::kernels::detail { /** Epiloge function for polynomial kernel without padding. * Calculates output = (gain*in + offset)^exponent @@ -738,4 +738,4 @@ class RBFKernel : public GramMatrixBase { } }; -}; // end namespace raft::distance::kernels +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/kernels.cuh b/cpp/include/raft/distance/kernels.cuh index 86a2107f82..86f9f82406 100644 --- a/cpp/include/raft/distance/kernels.cuh +++ b/cpp/include/raft/distance/kernels.cuh @@ -16,9 +16,17 @@ #pragma once -#include -#include +#include +#include #include #include #include + +namespace raft::distance::kernels { + +// TODO: Need to expose formal APIs for this that are more consistent w/ other APIs in RAFT +using raft::distance::kernels::detail::GramMatrixBase; +using raft::distance::kernels::detail::KernelFactory; + +}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh new file mode 100644 index 0000000000..75c9c023e8 --- /dev/null +++ b/cpp/include/raft/distance/specializations/detail/kernels.cuh @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +extern template class raft::distance::kernels::detail::GramMatrixBase; +extern template class raft::distance::kernels::detail::GramMatrixBase; + +extern template class raft::distance::kernels::detail::PolynomialKernel; +extern template class raft::distance::kernels::detail::PolynomialKernel; + +extern template class raft::distance::kernels::detail::TanhKernel; +extern template class raft::distance::kernels::detail::TanhKernel; + +// These are somehow missing a kernel definition which is causing a compile error +// extern template class raft::distance::kernels::detail::RBFKernel; +// extern template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index c2324a24cd..a34f696e9e 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -31,4 +32,3 @@ #include #include #include -#include diff --git a/cpp/include/raft/distance/specializations/kernels.cuh b/cpp/include/raft/distance/specializations/kernels.cuh deleted file mode 100644 index f213aeaf9a..0000000000 --- a/cpp/include/raft/distance/specializations/kernels.cuh +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -extern template class raft::distance::kernels::GramMatrixBase; -extern template class raft::distance::kernels::GramMatrixBase; - -extern template class raft::distance::kernels::PolynomialKernel; -extern template class raft::distance::kernels::PolynomialKernel; - -extern template class raft::distance::kernels::TanhKernel; -extern template class raft::distance::kernels::TanhKernel; - -extern template class raft::distance::kernels::RBFKernel; -extern template class raft::distance::kernels::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu similarity index 83% rename from cpp/src/distance/specializations/kernels/gram_matrix_base_double.cu rename to cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu index c86bb2796f..7c80eb29d0 100644 --- a/cpp/src/distance/specializations/kernels/gram_matrix_base_double.cu +++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include -template class raft::distance::kernels::GramMatrixBase; \ No newline at end of file +template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu similarity index 83% rename from cpp/src/distance/specializations/kernels/gram_matrix_base_float.cu rename to cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu index 6c160f7e9a..d777e73dc9 100644 --- a/cpp/src/distance/specializations/kernels/gram_matrix_base_float.cu +++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::GramMatrixBase; \ No newline at end of file +template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu similarity index 82% rename from cpp/src/distance/specializations/kernels/polynomial_kernel_double_int.cu rename to cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu index ae08ae9fef..28306d0c21 100644 --- a/cpp/src/distance/specializations/kernels/polynomial_kernel_double_int.cu +++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::PolynomialKernel; \ No newline at end of file +template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu similarity index 82% rename from cpp/src/distance/specializations/kernels/polynomial_kernel_float_int.cu rename to cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu index 7bcbe645e9..6609de69ac 100644 --- a/cpp/src/distance/specializations/kernels/polynomial_kernel_float_int.cu +++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include -template class raft::distance::kernels::PolynomialKernel; \ No newline at end of file +template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu similarity index 83% rename from cpp/src/distance/specializations/kernels/rbf_kernel_double.cu rename to cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu index 411c4b879f..7ea4b60e09 100644 --- a/cpp/src/distance/specializations/kernels/rbf_kernel_double.cu +++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu @@ -14,7 +14,7 @@ * limitations under the License. 
 */
-#include
+#include
-template class raft::distance::kernels::RBFKernel<double>;
\ No newline at end of file
+template class raft::distance::kernels::detail::RBFKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
similarity index 84%
rename from cpp/src/distance/specializations/kernels/rbf_kernel_float.cu
rename to cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
index 0a1ed92f4e..423613dcd1 100644
--- a/cpp/src/distance/specializations/kernels/rbf_kernel_float.cu
+++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
@@ -14,7 +14,7 @@
 * limitations under the License.
 */
-#include
+#include
-template class raft::distance::kernels::RBFKernel<float>;
\ No newline at end of file
+template class raft::distance::kernels::detail::RBFKernel<float>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
similarity index 83%
rename from cpp/src/distance/specializations/kernels/tanh_kernel_double.cu
rename to cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
index 7b58343367..ab818db73b 100644
--- a/cpp/src/distance/specializations/kernels/tanh_kernel_double.cu
+++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
@@ -14,7 +14,7 @@
 * limitations under the License.
 */
-#include
+#include
-template class raft::distance::kernels::TanhKernel<double>;
\ No newline at end of file
+template class raft::distance::kernels::detail::TanhKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
similarity index 83%
rename from cpp/src/distance/specializations/kernels/tanh_kernel_float.cu
rename to cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
index 8cc73bb81f..f7825e577a 100644
--- a/cpp/src/distance/specializations/kernels/tanh_kernel_float.cu
+++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
@@ -14,7 +14,7 @@
 * limitations under the License.
*/ -#include +#include #include -template class raft::distance::kernels::TanhKernel; \ No newline at end of file +template class raft::distance::kernels::detail::TanhKernel; \ No newline at end of file From f2ebd76dc53b508e96550e069f85791f6ef6bdd6 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 20 Apr 2023 05:14:43 -0700 Subject: [PATCH 19/20] merge API conflicts with recent updates to sparse structures --- cpp/include/raft/distance/detail/kernels/gram_matrix.cuh | 6 +++--- .../raft/distance/detail/kernels/kernel_matrices.cuh | 8 ++++---- cpp/include/raft/sparse/linalg/detail/spmm.hpp | 2 +- cpp/test/sparse/gram.cu | 5 ++--- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index f03f746161..a68b904470 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -433,7 +433,7 @@ class GramMatrixBase { "GramMatrix leading dimensions for x2 and out do not match"); // check dimensions - auto x1_structure = x1.get_structure(); + auto x1_structure = x1.structure_view(); ASSERT(x1_structure.get_n_rows() == out.extent(0), "GramMatrix input matrix dimensions for x1 and out do not match"); ASSERT(x2.extent(0) == out.extent(1), @@ -469,8 +469,8 @@ class GramMatrixBase { int minor_out = is_row_major ? out.extent(1) : out.extent(0); ASSERT(ld_out == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); - auto x1_structure = x1.get_structure(); - auto x2_structure = x2.get_structure(); + auto x1_structure = x1.structure_view(); + auto x2_structure = x2.structure_view(); raft::sparse::distance::distances_config_t dist_config(handle); // switch a,b based on is_row_major diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 20893dfce9..4b000add21 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -519,7 +519,7 @@ class RBFKernel : public GramMatrixBase { csr_input_matrix_view_t matrix, math_t* target) { - auto matrix_structure = matrix.get_structure(); + auto matrix_structure = matrix.structure_view(); raft::sparse::linalg::rowNormCsr(handle, matrix_structure.get_indptr().data(), matrix.get_elements().data(), @@ -605,7 +605,7 @@ class RBFKernel : public GramMatrixBase { rmm::device_uvector tmp_norm_x1(0, stream); rmm::device_uvector tmp_norm_x2(0, stream); if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.get_structure().get_n_rows(), stream); + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); norm_x1 = tmp_norm_x1.data(); matrixRowNormL2(handle, x1, norm_x1); } @@ -655,12 +655,12 @@ class RBFKernel : public GramMatrixBase { rmm::device_uvector tmp_norm_x1(0, stream); rmm::device_uvector tmp_norm_x2(0, stream); if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.get_structure().get_n_rows(), stream); + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); norm_x1 = tmp_norm_x1.data(); matrixRowNormL2(handle, x1, norm_x1); } if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.get_structure().get_n_rows(), stream); + tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); norm_x2 = tmp_norm_x2.data(); matrixRowNormL2(handle, x2, norm_x2); } diff --git a/cpp/include/raft/sparse/linalg/detail/spmm.hpp b/cpp/include/raft/sparse/linalg/detail/spmm.hpp index 
75ed3d135b..b61b561a12 100644 --- a/cpp/include/raft/sparse/linalg/detail/spmm.hpp +++ b/cpp/include/raft/sparse/linalg/detail/spmm.hpp @@ -84,7 +84,7 @@ cusparseSpMatDescr_t create_descriptor( raft::device_csr_matrix_view& sparse_view) { cusparseSpMatDescr_t descr; - auto csr_structure = sparse_view.get_structure(); + auto csr_structure = sparse_view.structure_view(); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr( &descr, csr_structure.get_n_rows(), diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index e0bfb94f94..86a2e0cf43 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -244,9 +244,8 @@ class GramMatrixTest : public ::testing::TestWithParam { x1_csr_indices.data(), x1_csr_data.data()); - auto x1_csr_structure = raft::make_device_csr_structure_view( + auto x1_csr_structure = raft::make_device_compressed_structure_view( x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz); - auto x1_csr = raft::device_csr_matrix_view( raft::device_span(x1_csr_data.data(), x1_csr_structure.get_nnz()), x1_csr_structure); @@ -264,7 +263,7 @@ class GramMatrixTest : public ::testing::TestWithParam { x2_csr_indices.data(), x2_csr_data.data()); - auto x2_csr_structure = raft::make_device_csr_structure_view( + auto x2_csr_structure = raft::make_device_compressed_structure_view( x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz); auto x2_csr = raft::device_csr_matrix_view( raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()), From ae8fbb515835e5a36fb52cf421b0e1687928dd65 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 24 Apr 2023 18:29:29 -0400 Subject: [PATCH 20/20] Fixing build --- cpp/bench/prims/distance/fused_l2_nn.cu | 1 + cpp/include/raft/core/detail/nvtx.hpp | 2 +- cpp/include/raft/distance/detail/kernels/kernel_factory.cuh | 2 +- cpp/include/raft/sparse/linalg/detail/norm.cuh | 2 +- cpp/include/raft/sparse/linalg/norm.cuh | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu index 1c45572782..a5115407dd 100644 --- a/cpp/bench/prims/distance/fused_l2_nn.cu +++ b/cpp/bench/prims/distance/fused_l2_nn.cu @@ -16,6 +16,7 @@ #include #include +#include #include #if defined RAFT_COMPILED #include diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp index adbf3a3666..e0f985cb73 100644 --- a/cpp/include/raft/core/detail/nvtx.hpp +++ b/cpp/include/raft/core/detail/nvtx.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index 7c74e231d7..bb3ff1c2f5 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index 1e66af3d10..56ca2ebfa7 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index 95831f395e..2bd48c6dc6 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
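[Editor's note, not part of the patch] For context on the sparse path touched above, the sketch below shows how the CSR views consumed by the gram-matrix kernels might be built from raw device pointers, following the pattern of the updated test in cpp/test/sparse/gram.cu. The explicit template arguments, header names, and the helper function itself are illustrative assumptions rather than part of this change.

// Sketch only: assumes float values, int offsets/indices, and valid device
// pointers csr_indptr (n_rows + 1), csr_indices (nnz) and csr_data (nnz).
#include <raft/core/device_csr_matrix.hpp>  // header names assumed
#include <raft/core/device_span.hpp>

auto make_csr_view(int* csr_indptr, int* csr_indices, float* csr_data,
                   int n_rows, int n_cols, int nnz)
{
  // Structural (indptr/indices) view, mirroring the construction in gram.cu.
  auto csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
    csr_indptr, csr_indices, n_rows, n_cols, nnz);

  // Value view over the same structure; the kernels in
  // raft::distance::kernels::detail obtain indptr/indices from it via
  // structure_view(), as the gram_matrix.cuh hunks above show.
  return raft::device_csr_matrix_view<float, int, int, int>(
    raft::device_span<float>(csr_data, csr_structure.get_nnz()), csr_structure);
}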