Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Lang] Support sparse matrix on GPU #5185

Merged
merged 34 commits into from
Aug 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
87ca8bc
cusparse loaded
Hanke98 Mar 31, 2022
e3b1c7c
load cusolver
Hanke98 May 4, 2022
1b8a8c7
add driver base class
Hanke98 May 4, 2022
e53279a
update comments
Hanke98 May 4, 2022
4082d57
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 4, 2022
39e0563
make CUDADriver a derived class from CUDADriverBase
Hanke98 May 17, 2022
933589e
clean code
Hanke98 May 17, 2022
8d46826
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 17, 2022
222abef
clean code
Hanke98 May 17, 2022
f830583
Merge branch 'master' into gpu_sm
FantasyVR May 27, 2022
09c2ede
create sparse matrix using cusparse
FantasyVR May 27, 2022
2f4b0f1
spmv with bugs
FantasyVR May 27, 2022
64b19be
bug fix: int -> size_t
FantasyVR Jun 14, 2022
8952ef1
clean
FantasyVR Jun 14, 2022
d7035f1
add safe loader
FantasyVR Jun 16, 2022
214d56a
separate sparse matrix maker and spmv func
FantasyVR Jun 16, 2022
457a9f1
refactor
FantasyVR Jun 16, 2022
9ae6722
fix parameter bug
FantasyVR Jun 16, 2022
73ebb92
fix test bug
FantasyVR Jun 16, 2022
bc857d9
fix
FantasyVR Jun 16, 2022
f0333f7
Merge remote-tracking branch 'origin/master' into gpu_sm
FantasyVR Jun 16, 2022
ae7581c
fix merge conflicts
FantasyVR Jul 28, 2022
c0b395b
fix mac/windows failed tests
FantasyVR Jul 28, 2022
f60e675
fix
FantasyVR Jul 28, 2022
2e1edda
add tests for gpu sparse matrix
FantasyVR Jul 29, 2022
e060c3b
fix test
FantasyVR Jul 29, 2022
ea2dbf4
Merge branch 'master' into gpu_sm
FantasyVR Jul 29, 2022
8e9021c
fix
FantasyVR Aug 4, 2022
b7d1a70
fix cuMemAlloc_v2 bug and windows dll name bug
FantasyVR Aug 15, 2022
6fc1e75
fix ci
FantasyVR Aug 15, 2022
eada0f5
csr datatype checking
FantasyVR Aug 16, 2022
8f5ef40
Apply suggestions from code review
FantasyVR Aug 16, 2022
4c39e80
arch fix
FantasyVR Aug 16, 2022
2dc766a
format
FantasyVR Aug 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion python/taichi/linalg/sparse_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from taichi.lang.impl import get_runtime
from taichi.lang.matrix import Ndarray
from taichi.lang.util import warning
from taichi.types import annotations, f32
from taichi.types import annotations, f32, i32


class SparseMatrix:
Expand Down Expand Up @@ -198,6 +198,54 @@ def build_from_ndarray(self, ndarray):
'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
)

def build_csr_cusparse(self, data, indices, indptr):
    """Build a csr format sparse matrix using cuSparse where the column indices
    for row i are stored in ``indices[indptr[i]:indptr[i+1]]``
    and their corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.

    Args:
        data (ti.ndarray): CSR format data array of the matrix.
        indices (ti.ndarray): CSR format index array of the matrix.
        indptr (ti.ndarray): CSR format index pointer array of the matrix.
    """
    # Guard clause 1: all three CSR arrays must be taichi ndarrays.
    if any(not isinstance(arr, Ndarray) for arr in (data, indices, indptr)):
        raise TaichiRuntimeError(
            'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray].'
        )
    # Guard clause 2: cuSparse build below is hard-coded to f32 values and
    # i32 index arrays, so reject anything else up front.
    if data.dtype != f32 or indices.dtype != i32 or indptr.dtype != i32:
        raise TaichiRuntimeError(
            'Sparse matrix only supports building from float32 data and int32 indices/indptr.'
        )
    # Hand the raw arrays to the native cuSparse builder. Note the argument
    # order expected by the runtime: row offsets, column indices, values.
    get_runtime().prog.make_sparse_matrix_from_ndarray_cusparse(
        self.matrix, indptr.arr, indices.arr, data.arr)

def spmv(self, x, y):
    """Sparse matrix-vector multiplication using cuSparse.

    Computes ``y = A @ x`` on the GPU, where ``A`` is this sparse matrix.

    Args:
        x (ti.ndarray): the vector to be multiplied.
        y (ti.ndarray): the result of matrix-vector multiplication.

    Example::
        >>> x = ti.ndarray(shape=4, dtype=val_dt)
        >>> y = ti.ndarray(shape=4, dtype=val_dt)
        >>> A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
        >>> A.build_csr_cusparse(value_csr, col_csr, row_csr)
        >>> A.spmv(x, y)
    """
    if not isinstance(x, Ndarray) or not isinstance(y, Ndarray):
        raise TaichiRuntimeError(
            'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
        )
    # A is (n, m); x must have m entries for A @ x to be defined.
    if self.m != x.shape[0]:
        raise TaichiRuntimeError(
            f"Dimension mismatch between sparse matrix ({self.n}, {self.m}) and vector ({x.shape})"
        )

    # Delegate to the native CuSparseMatrix::spmv, passing raw ndarray
    # handles; the result is written in place into y.
    self.matrix.spmv(get_runtime().prog, x.arr, y.arr)


class SparseMatrixBuilder:
"""A python wrap around sparse matrix builder.
Expand Down
77 changes: 77 additions & 0 deletions taichi/program/sparse_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,13 @@ std::unique_ptr<SparseMatrix> make_sparse_matrix(
storage_format);
}

// Factory for a cuSPARSE-backed sparse matrix, returned through the
// SparseMatrix base-class interface.
std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
                                                    int cols,
                                                    DataType dt) {
  // std::make_unique<CuSparseMatrix> already yields a unique_ptr that
  // converts implicitly to unique_ptr<SparseMatrix>; the extra
  // unique_ptr<SparseMatrix>(...) wrapper in the original was redundant.
  return std::make_unique<CuSparseMatrix>(rows, cols, dt);
}

template <typename T>
void build_ndarray_template(SparseMatrix &sm,
intptr_t data_ptr,
Expand Down Expand Up @@ -191,5 +198,75 @@ void make_sparse_matrix_from_ndarray(Program *prog,
}
}

// Builds the underlying cuSPARSE CSR descriptor (matrix_) from device
// pointers: `csr_ptr` holds the row offsets, `csr_indices_ptr` the column
// indices, `csr_values_ptr` the nonzero values, and `nnz` is the number of
// stored nonzeros. No-op when Taichi is built without CUDA support.
// (The original span contained stray review-UI text between #endif and the
// closing brace, which would not compile; removed.)
void CuSparseMatrix::build_csr(void *csr_ptr,
                               void *csr_indices_ptr,
                               void *csr_values_ptr,
                               int nnz) {
#if defined(TI_WITH_CUDA)
  // 32-bit zero-based indices and float32 values — matches the dtype
  // checks enforced on the Python side (build_csr_cusparse).
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &matrix_, rows_, cols_, nnz, csr_ptr, csr_indices_ptr, csr_values_ptr,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      CUDA_R_32F);
#endif
}
// Releases the cuSPARSE descriptor created by build_csr().
CuSparseMatrix::~CuSparseMatrix() {
#if defined(TI_WITH_CUDA)
  // Guard against destroying a descriptor that was never created — the
  // original called cpDestroySpMat unconditionally even when build_csr()
  // had not run. NOTE(review): this relies on matrix_ being initialized to
  // nullptr in the class declaration; verify that initializer is present.
  if (matrix_ != nullptr) {
    CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
  }
#endif
}
// Builds `sm` (a CuSparseMatrix) from three device-resident ndarrays in CSR
// layout: row offsets, column indices, and nonzero values. Lazily loads the
// cuSPARSE shared library on first use. No-op without CUDA support.
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
                                              SparseMatrix &sm,
                                              const Ndarray &row_offsets,
                                              const Ndarray &col_indices,
                                              const Ndarray &values) {
#if defined(TI_WITH_CUDA)
  // Removed unused local `sdtype` (data_type_name result was never read).
  if (!CUSPARSEDriver::get_instance().is_loaded()) {
    bool load_success = CUSPARSEDriver::get_instance().load_cusparse();
    if (!load_success) {
      TI_ERROR("Failed to load cusparse library!");
    }
  }
  // Device addresses of the three CSR arrays, obtained as integers from the
  // runtime and reinterpreted as raw pointers for cuSPARSE.
  size_t row_csr = prog->get_ndarray_data_ptr_as_int(&row_offsets);
  size_t col_csr = prog->get_ndarray_data_ptr_as_int(&col_indices);
  size_t values_csr = prog->get_ndarray_data_ptr_as_int(&values);
  int nnz = values.get_nelement();
  sm.build_csr(reinterpret_cast<void *>(row_csr),
               reinterpret_cast<void *>(col_csr),
               reinterpret_cast<void *>(values_csr), nnz);
#endif
}

// Computes y = A * x on the GPU via cuSPARSE SpMV (CSR, float32).
// `x` must have cols_ entries and `y` rows_ entries; the result is written
// in place into y's device buffer. No-op without CUDA support.
void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) {
#if defined(TI_WITH_CUDA)
  // Raw device addresses of the input and output vectors.
  size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
  size_t dY = prog->get_ndarray_data_ptr_as_int(&y);

  // Wrap the device buffers in cuSPARSE dense-vector descriptors.
  cusparseDnVecDescr_t vecX;
  cusparseDnVecDescr_t vecY;
  CUSPARSEDriver::get_instance().cpCreateDnVec(
      &vecX, cols_, reinterpret_cast<void *>(dX), CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateDnVec(
      &vecY, rows_, reinterpret_cast<void *>(dY), CUDA_R_32F);

  cusparseHandle_t cusparse_handle;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);

  // y = alpha * A * x + beta * y, with alpha = 1 and beta = 0.
  float alpha = 1.0f, beta = 0.0f;

  // Query the scratch-buffer size required by this SpMV configuration.
  size_t bufferSize = 0;
  CUSPARSEDriver::get_instance().cpSpMV_bufferSize(
      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);

  void *dBuffer = nullptr;  // nullptr (not NULL) per modern C++ style
  if (bufferSize > 0)
    CUDADriver::get_instance().malloc(&dBuffer, bufferSize);
  CUSPARSEDriver::get_instance().cpSpMV(
      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);

  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecX);
  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  // Only free when a buffer was actually allocated; the original passed a
  // null pointer to mem_free when bufferSize == 0.
  if (dBuffer != nullptr)
    CUDADriver::get_instance().mem_free(dBuffer);
#endif
}

} // namespace lang
} // namespace taichi
38 changes: 37 additions & 1 deletion taichi/program/sparse_matrix.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include "taichi/rhi/cuda/cuda_driver.h"
#include "taichi/common/core.h"
#include "taichi/inc/constants.h"
#include "taichi/ir/type_utils.h"
Expand Down Expand Up @@ -58,7 +59,16 @@ class SparseMatrix {
}
virtual ~SparseMatrix() = default;

virtual void build_triplets(void *triplets_adr){};
virtual void build_triplets(void *triplets_adr) {
TI_NOT_IMPLEMENTED;
};

virtual void build_csr(void *csr_ptr,
void *csr_indices_ptr,
void *csr_values_ptr,
int nnz) {
TI_NOT_IMPLEMENTED;
};

inline const int num_rows() const {
return rows_;
Expand Down Expand Up @@ -189,14 +199,40 @@ class EigenSparseMatrix : public SparseMatrix {
EigenMatrix matrix_;
};

class CuSparseMatrix : public SparseMatrix {
public:
explicit CuSparseMatrix(int rows, int cols, DataType dt)
: SparseMatrix(rows, cols, dt) {
}

virtual ~CuSparseMatrix();
void build_csr(void *csr_ptr,
void *csr_indices_ptr,
void *csr_values_ptr,
int nnz) override;

void spmv(Program *prog, const Ndarray &x, Ndarray &y);

private:
cusparseSpMatDescr_t matrix_;
};

std::unique_ptr<SparseMatrix> make_sparse_matrix(
int rows,
int cols,
DataType dt,
const std::string &storage_format);
std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
int cols,
DataType dt);

void make_sparse_matrix_from_ndarray(Program *prog,
SparseMatrix &sm,
const Ndarray &ndarray);
void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
SparseMatrix &sm,
const Ndarray &row_offsets,
const Ndarray &col_indices,
const Ndarray &values);
} // namespace lang
} // namespace taichi
29 changes: 23 additions & 6 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -374,23 +374,37 @@ void export_lang(py::module &m) {
[](Program *program, int n, int m, uint64 max_num_entries,
DataType dtype, const std::string &storage_format) {
TI_ERROR_IF(!arch_is_cpu(program->config.arch),
"SparseMatrix only supports CPU for now.");
"SparseMatrix Builder only supports CPU for now.");
return SparseMatrixBuilder(n, m, max_num_entries, dtype,
storage_format);
})
.def("create_sparse_matrix",
[](Program *program, int n, int m, DataType dtype,
std::string storage_format) {
TI_ERROR_IF(!arch_is_cpu(program->config.arch),
"SparseMatrix only supports CPU for now.");
return make_sparse_matrix(n, m, dtype, storage_format);
TI_ERROR_IF(!arch_is_cpu(program->config.arch) &&
!arch_is_cuda(program->config.arch),
"SparseMatrix only supports CPU and CUDA for now.");
if (arch_is_cpu(program->config.arch))
return make_sparse_matrix(n, m, dtype, storage_format);
else
return make_cu_sparse_matrix(n, m, dtype);
})
.def("make_sparse_matrix_from_ndarray",
[](Program *program, SparseMatrix &sm, const Ndarray &ndarray) {
TI_ERROR_IF(!arch_is_cpu(program->config.arch),
"SparseMatrix only supports CPU for now.");
TI_ERROR_IF(!arch_is_cpu(program->config.arch) &&
!arch_is_cuda(program->config.arch),
"SparseMatrix only supports CPU and CUDA for now.");
return make_sparse_matrix_from_ndarray(program, sm, ndarray);
})
.def("make_sparse_matrix_from_ndarray_cusparse",
[](Program *program, CuSparseMatrix &sm, const Ndarray &row_csr,
const Ndarray &col_csr, const Ndarray &val_csr) {
TI_ERROR_IF(
!arch_is_cuda(program->config.arch),
"SparseMatrix based on GPU only supports CUDA for now.");
return make_sparse_matrix_from_ndarray_cusparse(
program, sm, row_csr, col_csr, val_csr);
})
.def("no_activate",
[](Program *program, SNode *snode) {
// TODO(#2193): Also apply to @ti.func?
Expand Down Expand Up @@ -1163,6 +1177,9 @@ void export_lang(py::module &m) {
MAKE_SPARSE_MATRIX(64, ColMajor, d);
MAKE_SPARSE_MATRIX(64, RowMajor, d);

py::class_<CuSparseMatrix>(m, "CuSparseMatrix")
.def("spmv", &CuSparseMatrix::spmv);

py::class_<SparseSolver>(m, "SparseSolver")
.def("compute", &SparseSolver::compute)
.def("analyze_pattern", &SparseSolver::analyze_pattern)
Expand Down
4 changes: 4 additions & 0 deletions taichi/rhi/arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ bool arch_is_cpu(Arch arch) {
}
}

// Returns true iff the given backend is the CUDA arch.
bool arch_is_cuda(Arch arch) {
  switch (arch) {
    case Arch::cuda:
      return true;
    default:
      return false;
  }
}

bool arch_uses_llvm(Arch arch) {
return (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::cuda ||
arch == Arch::wasm);
Expand Down
2 changes: 2 additions & 0 deletions taichi/rhi/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Arch arch_from_name(const std::string &arch);

bool arch_is_cpu(Arch arch);

bool arch_is_cuda(Arch arch);

bool arch_uses_llvm(Arch arch);

bool arch_is_gpu(Arch arch);
Expand Down
65 changes: 53 additions & 12 deletions taichi/rhi/cuda/cuda_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,39 @@ std::string get_cuda_error_message(uint32 err) {
return fmt::format("CUDA Error {}: {}", err_name_ptr, err_string_ptr);
}

bool CUDADriver::detected() {
return !disabled_by_env_ && cuda_version_valid_ && loader_->loaded();
}

CUDADriver::CUDADriver() {
// Reads the TI_ENABLE_CUDA environment variable (default: enabled) and
// records whether the CUDA driver is explicitly disabled.
CUDADriverBase::CUDADriverBase() {
  disabled_by_env_ = (get_environ_config("TI_ENABLE_CUDA", 1) == 0);
  if (disabled_by_env_) {
    // Fixed typo in trace message: "enviroment" -> "environment".
    TI_TRACE(
        "CUDA driver disabled by environment variable \"TI_ENABLE_CUDA\".");
  }
}

// Loads the platform-appropriate shared library (lib_linux on Linux,
// lib_windows on Windows) into loader_. Returns true when the library was
// found and loaded, false otherwise (with a warning).
bool CUDADriverBase::load_lib(std::string lib_linux, std::string lib_windows) {
#if defined(TI_PLATFORM_LINUX)
  auto lib_name = lib_linux;
#elif defined(TI_PLATFORM_WINDOWS)
  auto lib_name = lib_windows;
#else
  static_assert(false, "Taichi CUDA driver supports only Windows and Linux.");
#endif

  loader_ = std::make_unique<DynamicLoader>(lib_name);
  // Early return on failure instead of the original if/else.
  if (!loader_->loaded()) {
    TI_WARN("{} lib not found.", lib_name);
    return false;
  }
  TI_TRACE("{} loaded!", lib_name);
  return true;
}

// True when CUDA is usable: not disabled via TI_ENABLE_CUDA, the driver
// version check passed, and the shared library loaded successfully.
bool CUDADriver::detected() {
  if (disabled_by_env_)
    return false;
  if (!cuda_version_valid_)
    return false;
  return loader_->loaded();
}

CUDADriver::CUDADriver() {
if (!load_lib("libcuda.so", "nvcuda.dll"))
return;

loader_->load_function("cuGetErrorName", get_error_name);
loader_->load_function("cuGetErrorString", get_error_string);
Expand Down Expand Up @@ -79,4 +88,36 @@ CUDADriver &CUDADriver::get_instance() {
return get_instance_without_context();
}

// Default constructor is intentionally empty: the cuSPARSE library is
// loaded lazily through load_cusparse(), not at construction time.
CUSPARSEDriver::CUSPARSEDriver() {
}

// Returns the process-wide CUSPARSEDriver singleton.
// NOTE(review): the instance is heap-allocated and never freed —
// presumably to sidestep static-destruction-order issues; confirm this is
// intentional before "fixing" it to a Meyers singleton.
CUSPARSEDriver &CUSPARSEDriver::get_instance() {
  static CUSPARSEDriver *instance = new CUSPARSEDriver();
  return *instance;
}

// Dynamically loads the cuSPARSE shared library and resolves every entry
// point listed in cusparse_functions.inc.h. Returns true on success;
// records the result in cusparse_loaded_ so is_loaded() can report it.
bool CUSPARSEDriver::load_cusparse() {
  cusparse_loaded_ = load_lib("libcusparse.so", "cusparse64_11.dll");

  if (!cusparse_loaded_) {
    return false;
  }
// For each function in the .inc.h list: bind the dynamic symbol, attach the
// shared lock, and record both the wrapper and symbol names for diagnostics.
#define PER_CUSPARSE_FUNCTION(name, symbol_name, ...) \
  name.set(loader_->load_function(#symbol_name)); \
  name.set_lock(&lock_); \
  name.set_names(#name, #symbol_name);
#include "taichi/rhi/cuda/cusparse_functions.inc.h"
#undef PER_CUSPARSE_FUNCTION
  return cusparse_loaded_;
}

// Eagerly loads the cuSOLVER shared library at construction time. The
// return value is ignored; load_lib() already warns on failure.
// NOTE(review): the Windows name "cusolver.dll" lacks the versioned suffix
// used for cuSPARSE ("cusparse64_11.dll") — presumably it should be
// "cusolver64_11.dll"; verify against a Windows CUDA toolkit install.
CUSOLVERDriver::CUSOLVERDriver() {
  load_lib("libcusolver.so", "cusolver.dll");
}

// Returns the process-wide CUSOLVERDriver singleton.
// NOTE(review): heap-allocated and never freed, mirroring
// CUSPARSEDriver::get_instance(); presumably intentional — confirm.
CUSOLVERDriver &CUSOLVERDriver::get_instance() {
  static CUSOLVERDriver *instance = new CUSOLVERDriver();
  return *instance;
}

TLANG_NAMESPACE_END
Loading