From 9d8f977636a09dc417b7bb2d941c1f4da6198ae7 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 23 Jul 2020 16:37:39 -0500
Subject: [PATCH 1/4] Added cusparseCsrmvEx. Added the possibility of
 rectangular sparse _matrix_t.

---
 cpp/include/raft/sparse/cusparse_wrappers.h   | 71 +++++++++++++++++++
 cpp/include/raft/spectral/matrix_wrappers.hpp | 25 +++++--
 2 files changed, 92 insertions(+), 4 deletions(-)
diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index 1190d92c94..9dfd803267 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -514,5 +514,76 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
 }
 /** @} */
 
+/**
+ * @defgroup CsrmvEx cusparse csrmvex operations
+ * @{
+ */
+template <typename T>
+cusparseStatus_t cusparsecsrmvex_bufferSize(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
+  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
+  const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
+  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const float* x, const float* beta, float* y, size_t* bufferSizeInBytes,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(
+    handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA,
+    CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y,
+    CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
+  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const double* x, const double* beta, double* y, size_t* bufferSizeInBytes,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(
+    handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA,
+    CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y,
+    CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes);
+}
+
+template <typename T>
+cusparseStatus_t cusparsecsrmvex(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
+  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
+  const T* beta, T* y, T* buffer, cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
+  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const float* x, const float* beta, float* y, float* buffer,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F,
+                         descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x,
+                         CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F,
+                         CUDA_R_32F, buffer);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
+  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const double* x, const double* beta, double* y, double* buffer,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F,
+                         descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x,
+                         CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F,
+                         CUDA_R_64F, buffer);
+}
+/** @} */
+
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 2923c76902..d8704f1417 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -109,6 +109,18 @@ class vector_t {
 
 template <typename index_type, typename value_type>
 struct sparse_matrix_t {
+  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
+                  index_type const* col_indices, value_type const* values,
+                  index_type const nrows, index_type const ncols,
+                  index_type const nnz)
+    : handle_(raft_handle),
+      row_offsets_(row_offsets),
+      col_indices_(col_indices),
+      values_(values),
+      nrows_(nrows),
+      ncols_(ncols),
+      nnz_(nnz) {}
+
   sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
                   index_type const* col_indices, value_type const* values,
                   index_type const nrows, index_type const nnz)
@@ -117,6 +129,7 @@ struct sparse_matrix_t {
       col_indices_(col_indices),
       values_(values),
       nrows_(nrows),
+      ncols_(nrows),
       nnz_(nnz) {}
 
   template <typename CSRView>
@@ -126,6 +139,7 @@ struct sparse_matrix_t {
       col_indices_(csr_view.indices),
       values_(csr_view.edge_data),
       nrows_(csr_view.number_of_vertices),
+      ncols_(csr_view.number_of_vertices),
       nnz_(csr_view.number_of_edges) {}
 
   virtual ~sparse_matrix_t(void) =
@@ -152,18 +166,20 @@ struct sparse_matrix_t {
         CUSPARSE_OPERATION_NON_TRANSPOSE;         //non-transpose
 
 #if __CUDACC_VER_MAJOR__ > 10
+    auto size_x = transpose ? nrows_ : ncols_;
+    auto size_y = transpose ? ncols_ : nrows_;
 
     //create descriptors:
     //
     cusparseSpMatDescr_t matA;
-    CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, nrows_, nnz_, row_offsets_,
+    CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, ncols_, nnz_, row_offsets_,
                                      col_indices_, values_));
 
     cusparseDnVecDescr_t vecX;
-    CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, x));
+    CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x));
 
     cusparseDnVecDescr_t vecY;
-    CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y));
+    CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y));
 
     //get (scratch) external device buffer size:
     //
@@ -199,7 +215,7 @@ struct sparse_matrix_t {
       CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     }
     CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
-    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_,
+    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_,
                                  &alpha, descr, values_, row_offsets_,
                                  col_indices_, x, &beta, y, stream));
     CUSPARSE_CHECK(cusparseDestroyMatDescr(descr));
@@ -215,6 +231,7 @@ struct sparse_matrix_t {
   index_type const* col_indices_;
   value_type const* values_;
   index_type const nrows_;
+  index_type const ncols_;
   index_type const nnz_;
 };
 

From a8fa761bc234a24d12a9814c6d8987068d13ecda Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 24 Jul 2020 17:56:05 -0500
Subject: [PATCH 2/4] Transition to cusparseSpMV().

---
 cpp/include/raft/sparse/cusparse_wrappers.h   | 12 +++++-------
 cpp/include/raft/spectral/matrix_wrappers.hpp | 10 +++++++---
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index 9dfd803267..98189f15bc 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -213,7 +213,7 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
 }
 /** @} */
 
-#if __CUDACC_VER_MAJOR__ > 10
+#if __CUDACC_VER_MAJOR__ >= 10 and __CUDACC_VER_MINOR__ > 0
 /**
  * @defgroup cusparse Create CSR operations
  * @{
@@ -226,9 +226,8 @@ cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
                                           int64_t rows, int64_t cols,
-                                          int64_t nnz, int32_t* csrRowOffsets,
-                                          int32_t* csrColInd,
-                                          float* csrValues) {
+                                          int64_t nnz, int* csrRowOffsets,
+                                          int* csrColInd, float* csrValues) {
   return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
                            csrColInd, csrValues, CUSPARSE_INDEX_32I,
                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
@@ -237,9 +236,8 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
                                           int64_t rows, int64_t cols,
-                                          int64_t nnz, int32_t* csrRowOffsets,
-                                          int32_t* csrColInd,
-                                          double* csrValues) {
+                                          int64_t nnz, int* csrRowOffsets,
+                                          int* csrColInd, double* csrValues) {
   return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
                            csrColInd, csrValues, CUSPARSE_INDEX_32I,
                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index d8704f1417..99749f9315 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -165,15 +165,19 @@ struct sparse_matrix_t {
       transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
         CUSPARSE_OPERATION_NON_TRANSPOSE;         //non-transpose
 
-#if __CUDACC_VER_MAJOR__ > 10
+#if __CUDACC_VER_MAJOR__ >= 10 and __CUDACC_VER_MINOR__ > 0
     auto size_x = transpose ? nrows_ : ncols_;
     auto size_y = transpose ? ncols_ : nrows_;
 
     //create descriptors:
+    //(below casts are necessary, because
+    // cusparseCreateCsr(...) takes non-const
+    // void*; the casts should be harmless)
     //
     cusparseSpMatDescr_t matA;
-    CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, ncols_, nnz_, row_offsets_,
-                                     col_indices_, values_));
+    CUSPARSE_CHECK(cusparsecreatecsr(
+      &matA, nrows_, ncols_, nnz_, const_cast<index_type*>(row_offsets_),
+      const_cast<index_type*>(col_indices_), const_cast<value_type*>(values_)));
 
     cusparseDnVecDescr_t vecX;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x));

From 21eda5bba1f86ad93d5d206b4724e62172609dff Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Sun, 26 Jul 2020 18:08:29 -0500
Subject: [PATCH 3/4] Added alg selection to SpMv, as part of migration to
 bettet cusparseSpMV().

---
 cpp/include/raft/spectral/matrix_wrappers.hpp | 55 ++++++++++++++-----
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 99749f9315..4b531231f0 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -38,6 +38,16 @@ namespace matrix {
 
 using size_type = int;  // for now; TODO: move it in appropriate header
 
+// specifies type of algorithm used
+// for SpMv:
+//
+enum struct sparse_mv_alg_t : int {
+  SPARSE_MV_UNDEFINED = -1,
+  SPARSE_MV_ALG_DEFAULT,  // generic, for any sparse matrix
+  SPARSE_MV_ALG1,         // typical for CSR
+  SPARSE_MV_ALG2  // may provide better performamce for irregular sparse matrices
+};
+
 // Vector "view"-like aggregate for linear algebra purposes
 //
 template <typename value_type>
@@ -151,8 +161,9 @@ struct sparse_matrix_t {
   // down is dangerous)
   //
   virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
-                  value_type* __restrict__ y, bool transpose = false,
-                  bool symmetric = false) const {
+                  value_type* __restrict__ y,
+                  sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
+                  bool transpose = false, bool symmetric = false) const {
     using namespace sparse;
 
     RAFT_EXPECTS(x != nullptr, "Null x buffer.");
@@ -169,6 +180,8 @@ struct sparse_matrix_t {
     auto size_x = transpose ? nrows_ : ncols_;
     auto size_y = transpose ? ncols_ : nrows_;
 
+    cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg);
+
     //create descriptors:
     //(below casts are necessary, because
     // cusparseCreateCsr(...) takes non-const
@@ -188,9 +201,9 @@ struct sparse_matrix_t {
     //get (scratch) external device buffer size:
     //
     size_t bufferSize;
-    CUSPARSE_CHECK(
-      cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, vecX, &beta,
-                              vecY, CUSPARSE_CSRMV_ALG1, &bufferSize, stream));
+    CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA,
+                                           vecX, &beta, vecY, spmv_alg,
+                                           &bufferSize, stream));
 
     //allocate external buffer:
     //
@@ -199,8 +212,7 @@ struct sparse_matrix_t {
     //finally perform SpMV:
     //
     CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta,
-                                vecY, CUSPARSE_CSRMV_ALG1,
-                                external_buffer.raw(), stream));
+                                vecY, spmv_alg, external_buffer.raw(), stream));
 
     //free descriptors:
     //(TODO: maybe wrap them in a RAII struct?)
@@ -228,6 +240,19 @@ struct sparse_matrix_t {
 
   handle_t const& get_handle(void) const { return handle_; }
 
+#if __CUDACC_VER_MAJOR__ >= 10 and __CUDACC_VER_MINOR__ > 0
+  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const {
+    switch (alg) {
+      case sparse_mv_alg_t::SPARSE_MV_ALG1:
+        return CUSPARSE_CSRMV_ALG1;
+      case sparse_mv_alg_t::SPARSE_MV_ALG2:
+        return CUSPARSE_CSRMV_ALG2;
+      default:
+        return CUSPARSE_MV_ALG_DEFAULT;
+    }
+  }
+#endif
+
   //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate
 
   handle_t const& handle_;
@@ -273,8 +298,9 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
   // y = alpha*A*x + beta*y
   //
   void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
-          value_type* __restrict__ y, bool transpose = false,
-          bool symmetric = false) const override {
+          value_type* __restrict__ y,
+          sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
+          bool transpose = false, bool symmetric = false) const override {
     constexpr int BLOCK_SIZE = 1024;
     auto n = sparse_matrix_t<index_type, value_type>::nrows_;
 
@@ -303,7 +329,8 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply adjacency matrix
     //
-    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y);
+    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose,
+                                                symmetric);
   }
 
   vector_t<value_type> diagonal_;
@@ -337,8 +364,9 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
   // y = alpha*A*x + beta*y
   //
   void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
-          value_type* __restrict__ y, bool transpose = false,
-          bool symmetric = false) const override {
+          value_type* __restrict__ y,
+          sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
+          bool transpose = false, bool symmetric = false) const override {
     auto n = sparse_matrix_t<index_type, value_type>::nrows_;
 
     auto cublas_h =
@@ -348,7 +376,8 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
 
     // y = A*x
     //
-    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y);
+    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose,
+                                                symmetric);
     value_type dot_res;
 
     // gamma = d'*x

From 6e210f3923f1a462919f235cc179cedbc4c3253a Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Sun, 26 Jul 2020 18:27:16 -0500
Subject: [PATCH 4/4] Updated CHANGELOG.md.

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0eb61cc062..a55599eb4f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - PR #22: Preserve order in comms workers for rank initialization
 - PR #38: Remove #include <cudart_utils.h> from `raft/mr/`
 - PR #39: Adding a virtual destructor to `raft::handle_t` and `raft::comms::comms_t`
+- PR #41: Upgrade to `cusparseSpMV()`, alg selection, and rectangular matrices.
 
 ## Bug Fixes
 - PR #17: Make destructor inline to avoid redeclaration error