diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py
index f04ddeb786340..e729052b05f05 100644
--- a/python/taichi/linalg/sparse_matrix.py
+++ b/python/taichi/linalg/sparse_matrix.py
@@ -6,7 +6,7 @@
 from taichi.lang.impl import get_runtime
 from taichi.lang.matrix import Ndarray
 from taichi.lang.util import warning
-from taichi.types import annotations, f32
+from taichi.types import annotations, f32, i32
 
 
 class SparseMatrix:
@@ -198,6 +198,54 @@ def build_from_ndarray(self, ndarray):
                 'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
             )
 
+    def build_csr_cusparse(self, data, indices, indptr):
+        """Build a CSR-format sparse matrix using cuSparse, where the column indices
+        for row i are stored in ``indices[indptr[i]:indptr[i+1]]``
+        and their corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
+
+        Args:
+            data (ti.ndarray): CSR format data array of the matrix.
+            indices (ti.ndarray): CSR format index array of the matrix.
+            indptr (ti.ndarray): CSR format index pointer array of the matrix.
+        """
+        if not isinstance(data, Ndarray) or not isinstance(
+                indices, Ndarray) or not isinstance(indptr, Ndarray):
+            raise TaichiRuntimeError(
+                'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray].'
+            )
+        elif data.dtype != f32 or indices.dtype != i32 or indptr.dtype != i32:
+            raise TaichiRuntimeError(
+                'Sparse matrix only supports building from float32 data and int32 indices/indptr.'
+            )
+        else:
+            get_runtime().prog.make_sparse_matrix_from_ndarray_cusparse(
+                self.matrix, indptr.arr, indices.arr, data.arr)
+
+    def spmv(self, x, y):
+        """Sparse matrix-vector multiplication (y = A @ x) using cuSparse.
+
+        Args:
+            x (ti.ndarray): the vector to be multiplied.
+            y (ti.ndarray): the result of the matrix-vector multiplication.
+
+        Example::
+            >>> x = ti.ndarray(shape=4, dtype=ti.f32)
+            >>> y = ti.ndarray(shape=4, dtype=ti.f32)
+            >>> A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
+            >>> A.build_csr_cusparse(value_csr, col_csr, row_csr)
+            >>> A.spmv(x, y)
+        """
+        if not isinstance(x, Ndarray) or not isinstance(y, Ndarray):
+            raise TaichiRuntimeError(
+                'Sparse matrix-vector multiplication only supports [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray] operands.'
+            )
+        if self.m != x.shape[0]:
+            raise TaichiRuntimeError(
+                f"Dimension mismatch between sparse matrix ({self.n}, {self.m}) and vector ({x.shape})"
+            )
+
+        self.matrix.spmv(get_runtime().prog, x.arr, y.arr)
+
 
 class SparseMatrixBuilder:
     """A python wrap around sparse matrix builder.
diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp
index 975d6b344f538..bdf646ee7594b 100644
--- a/taichi/program/sparse_matrix.cpp
+++ b/taichi/program/sparse_matrix.cpp
@@ -162,6 +162,13 @@ std::unique_ptr<SparseMatrix> make_sparse_matrix(
                                                  storage_format);
 }
 
+std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
+                                                    int cols,
+                                                    DataType dt) {
+  return std::unique_ptr<SparseMatrix>(
+      std::make_unique<CuSparseMatrix>(rows, cols, dt));
+}
+
 template <typename T>
 void build_ndarray_template(SparseMatrix &sm,
                             intptr_t data_ptr,
@@ -191,5 +198,75 @@ void make_sparse_matrix_from_ndarray(Program *prog,
   }
 }
 
+void CuSparseMatrix::build_csr(void *csr_ptr,
+                               void *csr_indices_ptr,
+                               void *csr_values_ptr,
+                               int nnz) {
+#if defined(TI_WITH_CUDA)
+  CUSPARSEDriver::get_instance().cpCreateCsr(
+      &matrix_, rows_, cols_, nnz, csr_ptr, csr_indices_ptr, csr_values_ptr,
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+      CUDA_R_32F);
+#endif
+}
+CuSparseMatrix::~CuSparseMatrix() {
+#if defined(TI_WITH_CUDA)
+  CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
+#endif
+}
+void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
+                                              SparseMatrix &sm,
+                                              const Ndarray &row_offsets,
+                                              const Ndarray &col_indices,
+                                              const Ndarray &values) {
+#if defined(TI_WITH_CUDA)
+  std::string sdtype = taichi::lang::data_type_name(sm.get_data_type());
+  if (!CUSPARSEDriver::get_instance().is_loaded()) {
+    bool load_success = CUSPARSEDriver::get_instance().load_cusparse();
+    if (!load_success) {
+      TI_ERROR("Failed to load cusparse library!");
+    }
+  }
+  size_t row_csr = prog->get_ndarray_data_ptr_as_int(&row_offsets);
+  size_t col_csr = prog->get_ndarray_data_ptr_as_int(&col_indices);
+  size_t values_csr = prog->get_ndarray_data_ptr_as_int(&values);
+  int nnz = values.get_nelement();
+  sm.build_csr((void *)row_csr, (void *)col_csr, (void *)values_csr, nnz);
+#endif
+}
+
+void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) {
+#if defined(TI_WITH_CUDA)
+  size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
+  size_t dY = prog->get_ndarray_data_ptr_as_int(&y);
+
+  cusparseDnVecDescr_t vecX, vecY;
+  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecX, cols_, (void *)dX,
+                                               CUDA_R_32F);
+  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, rows_, (void *)dY,
+                                               CUDA_R_32F);
+
+  cusparseHandle_t cusparse_handle;
+  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
+  float alpha = 1.0f, beta = 0.0f;
+  size_t bufferSize = 0;
+  CUSPARSEDriver::get_instance().cpSpMV_bufferSize(
+      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
+      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);
+
+  void *dBuffer = NULL;
+  if (bufferSize > 0)
+    CUDADriver::get_instance().malloc(&dBuffer, bufferSize);
+  CUSPARSEDriver::get_instance().cpSpMV(
+      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
+      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);
+
+  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecX);
+  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
+  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
+  CUDADriver::get_instance().mem_free(dBuffer);
+#endif
+}
+
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h
index 9501fc2781469..be776045bad7d 100644
--- a/taichi/program/sparse_matrix.h
+++ b/taichi/program/sparse_matrix.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "taichi/rhi/cuda/cuda_driver.h"
 #include "taichi/common/core.h"
 #include "taichi/inc/constants.h"
 #include "taichi/ir/type_utils.h"
@@ -58,7 +59,16 @@ class SparseMatrix {
   }
   virtual ~SparseMatrix() = default;
 
-  virtual void build_triplets(void *triplets_adr){};
+  virtual void build_triplets(void *triplets_adr) {
+    TI_NOT_IMPLEMENTED;
+  };
+
+  virtual void build_csr(void *csr_ptr,
+                         void *csr_indices_ptr,
+                         void *csr_values_ptr,
+                         int nnz) {
+    TI_NOT_IMPLEMENTED;
+  };
 
   inline const int num_rows() const {
     return rows_;
@@ -189,14 +199,40 @@ class EigenSparseMatrix : public SparseMatrix {
   EigenMatrix matrix_;
 };
 
+class CuSparseMatrix : public SparseMatrix {
+ public:
+  explicit CuSparseMatrix(int rows, int cols, DataType dt)
+      : SparseMatrix(rows, cols, dt) {
+  }
+
+  virtual ~CuSparseMatrix();
+  void build_csr(void *csr_ptr,
+                 void *csr_indices_ptr,
+                 void *csr_values_ptr,
+                 int nnz) override;
+
+  void spmv(Program *prog, const Ndarray &x, Ndarray &y);
+
+ private:
+  cusparseSpMatDescr_t matrix_;
+};
+
 std::unique_ptr<SparseMatrix> make_sparse_matrix(
     int rows,
     int cols,
     DataType dt,
     const std::string &storage_format);
+std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
+                                                    int cols,
+                                                    DataType dt);
 void make_sparse_matrix_from_ndarray(Program *prog,
                                      SparseMatrix &sm,
                                      const Ndarray &ndarray);
+void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
+                                              SparseMatrix &sm,
+                                              const Ndarray &row_offsets,
+                                              const Ndarray &col_indices,
+                                              const Ndarray &values);
 
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 39a2149db656f..e2aaf88f8b4f4 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -380,23 +380,37 @@ void export_lang(py::module &m) {
           [](Program *program, int n, int m, uint64 max_num_entries,
              DataType dtype, const std::string &storage_format) {
             TI_ERROR_IF(!arch_is_cpu(program->config.arch),
-                        "SparseMatrix only supports CPU for now.");
+                        "SparseMatrix Builder only supports CPU for now.");
             return SparseMatrixBuilder(n, m, max_num_entries, dtype,
                                        storage_format);
           })
      .def("create_sparse_matrix",
           [](Program *program, int n, int m, DataType dtype,
             std::string storage_format) {
-            TI_ERROR_IF(!arch_is_cpu(program->config.arch),
-                        "SparseMatrix only supports CPU for now.");
-            return make_sparse_matrix(n, m, dtype, storage_format);
+            TI_ERROR_IF(!arch_is_cpu(program->config.arch) &&
+                            !arch_is_cuda(program->config.arch),
+                        "SparseMatrix only supports CPU and CUDA for now.");
+            if (arch_is_cpu(program->config.arch))
+              return make_sparse_matrix(n, m, dtype, storage_format);
+            else
+              return make_cu_sparse_matrix(n, m, dtype);
           })
      .def("make_sparse_matrix_from_ndarray",
           [](Program *program, SparseMatrix &sm, const Ndarray &ndarray) {
-            TI_ERROR_IF(!arch_is_cpu(program->config.arch),
-                        "SparseMatrix only supports CPU for now.");
+            TI_ERROR_IF(!arch_is_cpu(program->config.arch) &&
+                            !arch_is_cuda(program->config.arch),
+                        "SparseMatrix only supports CPU and CUDA for now.");
             return make_sparse_matrix_from_ndarray(program, sm, ndarray);
           })
+     .def("make_sparse_matrix_from_ndarray_cusparse",
+          [](Program *program, CuSparseMatrix &sm, const Ndarray &row_csr,
+             const Ndarray &col_csr, const Ndarray &val_csr) {
+            TI_ERROR_IF(
+                !arch_is_cuda(program->config.arch),
+                "SparseMatrix based on GPU only supports CUDA for now.");
+            return make_sparse_matrix_from_ndarray_cusparse(
+                program, sm, row_csr, col_csr, val_csr);
+          })
      .def("no_activate",
           [](Program *program, SNode *snode) {
             // TODO(#2193): Also apply to @ti.func?
@@ -1171,6 +1185,9 @@ void export_lang(py::module &m) {
   MAKE_SPARSE_MATRIX(64, ColMajor, d);
   MAKE_SPARSE_MATRIX(64, RowMajor, d);
 
+  py::class_<CuSparseMatrix, SparseMatrix>(m, "CuSparseMatrix")
+      .def("spmv", &CuSparseMatrix::spmv);
+
   py::class_<SparseSolver>(m, "SparseSolver")
       .def("compute", &SparseSolver::compute)
       .def("analyze_pattern", &SparseSolver::analyze_pattern)
diff --git a/taichi/rhi/arch.cpp b/taichi/rhi/arch.cpp
index a6eff634c84e5..bca52aefe49bd 100644
--- a/taichi/rhi/arch.cpp
+++ b/taichi/rhi/arch.cpp
@@ -43,6 +43,10 @@ bool arch_is_cpu(Arch arch) {
   }
 }
 
+bool arch_is_cuda(Arch arch) {
+  return arch == Arch::cuda;
+}
+
 bool arch_uses_llvm(Arch arch) {
   return (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::cuda ||
           arch == Arch::wasm);
diff --git a/taichi/rhi/arch.h b/taichi/rhi/arch.h
index 2d7cffde8950f..47e74ef3acbb0 100644
--- a/taichi/rhi/arch.h
+++ b/taichi/rhi/arch.h
@@ -18,6 +18,8 @@ Arch arch_from_name(const std::string &arch);
 
 bool arch_is_cpu(Arch arch);
 
+bool arch_is_cuda(Arch arch);
+
 bool arch_uses_llvm(Arch arch);
 
 bool arch_is_gpu(Arch arch);
diff --git a/taichi/rhi/cuda/cuda_driver.cpp b/taichi/rhi/cuda/cuda_driver.cpp
index ffa7653dab5ee..f882b75cd5a62 100644
--- a/taichi/rhi/cuda/cuda_driver.cpp
+++ b/taichi/rhi/cuda/cuda_driver.cpp
@@ -15,30 +15,39 @@ std::string get_cuda_error_message(uint32 err) {
   return fmt::format("CUDA Error {}: {}", err_name_ptr, err_string_ptr);
 }
 
-bool CUDADriver::detected() {
-  return !disabled_by_env_ && cuda_version_valid_ && loader_->loaded();
-}
-
-CUDADriver::CUDADriver() {
+CUDADriverBase::CUDADriverBase() {
   disabled_by_env_ = (get_environ_config("TI_ENABLE_CUDA", 1) == 0);
   if (disabled_by_env_) {
-    TI_TRACE(
-        "CUDA driver disabled by environment variable \"TI_ENABLE_CUDA\".");
-    return;
+    TI_TRACE("CUDA driver disabled by environment variable \"TI_ENABLE_CUDA\".");
   }
+}
 
+bool CUDADriverBase::load_lib(std::string lib_linux, std::string lib_windows) {
 #if defined(TI_PLATFORM_LINUX)
-  loader_ = std::make_unique<DynamicLoader>("libcuda.so");
+  auto lib_name = lib_linux;
 #elif defined(TI_PLATFORM_WINDOWS)
-  loader_ = std::make_unique<DynamicLoader>("nvcuda.dll");
+  auto lib_name = lib_windows;
 #else
   static_assert(false, "Taichi CUDA driver supports only Windows and Linux.");
 #endif
 
+  loader_ = std::make_unique<DynamicLoader>(lib_name);
   if (!loader_->loaded()) {
-    TI_WARN("CUDA driver not found.");
-    return;
+    TI_WARN("{} lib not found.", lib_name);
+    return false;
+  } else {
+    TI_TRACE("{} loaded!", lib_name);
+    return true;
   }
+}
+
+bool CUDADriver::detected() {
+  return !disabled_by_env_ && cuda_version_valid_ && loader_->loaded();
+}
+
+CUDADriver::CUDADriver() {
+  if (!load_lib("libcuda.so", "nvcuda.dll"))
+    return;
 
   loader_->load_function("cuGetErrorName", get_error_name);
   loader_->load_function("cuGetErrorString", get_error_string);
@@ -79,4 +88,36 @@ CUDADriver &CUDADriver::get_instance() {
   return get_instance_without_context();
 }
 
+CUSPARSEDriver::CUSPARSEDriver() {
+}
+
+CUSPARSEDriver &CUSPARSEDriver::get_instance() {
+  static CUSPARSEDriver *instance = new CUSPARSEDriver();
+  return *instance;
+}
+
+bool CUSPARSEDriver::load_cusparse() {
+  cusparse_loaded_ = load_lib("libcusparse.so", "cusparse64_11.dll");
+
+  if (!cusparse_loaded_) {
+    return false;
+  }
+#define PER_CUSPARSE_FUNCTION(name, symbol_name, ...) \
+  name.set(loader_->load_function(#symbol_name));     \
+  name.set_lock(&lock_);                              \
+  name.set_names(#name, #symbol_name);
+#include "taichi/rhi/cuda/cusparse_functions.inc.h"
+#undef PER_CUSPARSE_FUNCTION
+  return cusparse_loaded_;
+}
+
+CUSOLVERDriver::CUSOLVERDriver() {
+  load_lib("libcusolver.so", "cusolver.dll");
+}
+
+CUSOLVERDriver &CUSOLVERDriver::get_instance() {
+  static CUSOLVERDriver *instance = new CUSOLVERDriver();
+  return *instance;
+}
+
 TLANG_NAMESPACE_END
diff --git a/taichi/rhi/cuda/cuda_driver.h b/taichi/rhi/cuda/cuda_driver.h
index 35bd0f2105d2b..25491bbb44ee0 100644
--- a/taichi/rhi/cuda/cuda_driver.h
+++ b/taichi/rhi/cuda/cuda_driver.h
@@ -95,7 +95,20 @@ class CUDADriverFunction {
   std::mutex *driver_lock_{nullptr};
 };
 
-class CUDADriver {
+class CUDADriverBase {
+ public:
+  ~CUDADriverBase() = default;
+
+ protected:
+  std::unique_ptr<DynamicLoader> loader_;
+  CUDADriverBase();
+
+  bool load_lib(std::string lib_linux, std::string lib_windows);
+
+  bool disabled_by_env_{false};
+};
+
+class CUDADriver : protected CUDADriverBase {
  public:
 #define PER_CUDA_FUNCTION(name, symbol_name, ...) \
   CUDADriverFunction<__VA_ARGS__> name;
@@ -110,8 +123,6 @@ class CUDADriver {
 
   bool detected();
 
-  ~CUDADriver() = default;
-
   static CUDADriver &get_instance();
 
   static CUDADriver &get_instance_without_context();
@@ -119,12 +130,39 @@ class CUDADriver {
  private:
   CUDADriver();
 
-  std::unique_ptr<DynamicLoader> loader_;
-
   std::mutex lock_;
 
-  bool disabled_by_env_{false};
   bool cuda_version_valid_{false};
 };
 
+class CUSPARSEDriver : protected CUDADriverBase {
+ public:
+  static CUSPARSEDriver &get_instance();
+
+#define PER_CUSPARSE_FUNCTION(name, symbol_name, ...) \
+  CUDADriverFunction<__VA_ARGS__> name;
+#include "taichi/rhi/cuda/cusparse_functions.inc.h"
+#undef PER_CUSPARSE_FUNCTION
+
+  bool load_cusparse();
+
+  inline bool is_loaded() {
+    return cusparse_loaded_;
+  }
+
+ private:
+  CUSPARSEDriver();
+  std::mutex lock_;
+  bool cusparse_loaded_{false};
+};
+
+class CUSOLVERDriver : protected CUDADriverBase {
+ public:
+  // TODO: Add cusolver function APIs
+  static CUSOLVERDriver &get_instance();
+
+ private:
+  CUSOLVERDriver();
+};
+
 TLANG_NAMESPACE_END
diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h
index 2652335b4e7c7..3e4d36ec1c526 100644
--- a/taichi/rhi/cuda/cuda_types.h
+++ b/taichi/rhi/cuda/cuda_types.h
@@ -3,6 +3,7 @@
 #if defined(TI_WITH_CUDA_TOOLKIT)
 
 #include <cuda.h>
+#include <cusparse.h>
 
 #else
 
@@ -434,3 +435,68 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
 #define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20
 
 #endif
+
+// copy from cusparse.h
+struct cusparseContext;
+typedef struct cusparseContext *cusparseHandle_t;
+
+struct cusparseDnVecDescr;
+struct cusparseSpMatDescr;
+typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;
+typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;
+typedef enum {
+  CUSPARSE_INDEX_16U = 1,  ///< 16-bit unsigned integer for matrix/vector
+                           ///< indices
+  CUSPARSE_INDEX_32I = 2,  ///< 32-bit signed integer for matrix/vector indices
+  CUSPARSE_INDEX_64I = 3   ///< 64-bit signed integer for matrix/vector indices
+} cusparseIndexType_t;
+
+typedef enum {
+  CUSPARSE_INDEX_BASE_ZERO = 0,
+  CUSPARSE_INDEX_BASE_ONE = 1
+} cusparseIndexBase_t;
+
+typedef enum cudaDataType_t {
+  CUDA_R_16F = 2,   /* real as a half */
+  CUDA_C_16F = 6,   /* complex as a pair of half numbers */
+  CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
+  CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
+  CUDA_R_32F = 0,   /* real as a float */
+  CUDA_C_32F = 4,   /* complex as a pair of float numbers */
+  CUDA_R_64F = 1,   /* real as a double */
+  CUDA_C_64F = 5,   /* complex as a pair of double numbers */
+  CUDA_R_4I = 16,   /* real as a signed 4-bit int */
+  CUDA_C_4I = 17,   /* complex as a pair of signed 4-bit int numbers */
+  CUDA_R_4U = 18,   /* real as a unsigned 4-bit int */
+  CUDA_C_4U = 19,   /* complex as a pair of unsigned 4-bit int numbers */
+  CUDA_R_8I = 3,    /* real as a signed 8-bit int */
+  CUDA_C_8I = 7,    /* complex as a pair of signed 8-bit int numbers */
+  CUDA_R_8U = 8,    /* real as a unsigned 8-bit int */
+  CUDA_C_8U = 9,    /* complex as a pair of unsigned 8-bit int numbers */
+  CUDA_R_16I = 20,  /* real as a signed 16-bit int */
+  CUDA_C_16I = 21,  /* complex as a pair of signed 16-bit int numbers */
+  CUDA_R_16U = 22,  /* real as a unsigned 16-bit int */
+  CUDA_C_16U = 23,  /* complex as a pair of unsigned 16-bit int numbers */
+  CUDA_R_32I = 10,  /* real as a signed 32-bit int */
+  CUDA_C_32I = 11,  /* complex as a pair of signed 32-bit int numbers */
+  CUDA_R_32U = 12,  /* real as a unsigned 32-bit int */
+  CUDA_C_32U = 13,  /* complex as a pair of unsigned 32-bit int numbers */
+  CUDA_R_64I = 24,  /* real as a signed 64-bit int */
+  CUDA_C_64I = 25,  /* complex as a pair of signed 64-bit int numbers */
+  CUDA_R_64U = 26,  /* real as a unsigned 64-bit int */
+  CUDA_C_64U = 27   /* complex as a pair of unsigned 64-bit int numbers */
+} cudaDataType;
+
+typedef enum {
+  CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
+  CUSPARSE_OPERATION_TRANSPOSE = 1,
+  CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
+} cusparseOperation_t;
+
+typedef enum {
+  CUSPARSE_SPMV_ALG_DEFAULT = 0,
+  CUSPARSE_SPMV_COO_ALG1 = 1,
+  CUSPARSE_SPMV_CSR_ALG1 = 2,
+  CUSPARSE_SPMV_CSR_ALG2 = 3,
+  CUSPARSE_SPMV_COO_ALG2 = 4
+} cusparseSpMVAlg_t;
diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h
new file mode 100644
index 0000000000000..37df588ebdb66
--- /dev/null
+++ b/taichi/rhi/cuda/cusparse_functions.inc.h
@@ -0,0 +1,18 @@
+// clang-format off
+
+// cusparse setup
+PER_CUSPARSE_FUNCTION(cpCreate, cusparseCreate, cusparseHandle_t *);
+PER_CUSPARSE_FUNCTION(cpDestroy, cusparseDestroy, cusparseHandle_t);
+
+// cusparse sparse matrix description
+PER_CUSPARSE_FUNCTION(cpCreateCoo, cusparseCreateCoo, cusparseSpMatDescr_t *, int, int, int, void *, void *, void *, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
+PER_CUSPARSE_FUNCTION(cpCreateCsr, cusparseCreateCsr, cusparseSpMatDescr_t *, int, int, int, void *, void *, void *, cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType);
+PER_CUSPARSE_FUNCTION(cpDestroySpMat, cusparseDestroySpMat, cusparseSpMatDescr_t);
+
+// cusparse dense vector description
+PER_CUSPARSE_FUNCTION(cpCreateDnVec, cusparseCreateDnVec, cusparseDnVecDescr_t *, int, void *, cudaDataType);
+PER_CUSPARSE_FUNCTION(cpDestroyDnVec, cusparseDestroyDnVec, cusparseDnVecDescr_t);
+
+// cusparse sparse matrix-vector multiplication
+PER_CUSPARSE_FUNCTION(cpSpMV_bufferSize, cusparseSpMV_bufferSize, cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, size_t *);
+PER_CUSPARSE_FUNCTION(cpSpMV, cusparseSpMV, cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, void *);
diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py
index d4f6b320c8484..69a0d66f5958d 100644
--- a/tests/python/test_sparse_matrix.py
+++ b/tests/python/test_sparse_matrix.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pytest
 
 import taichi as ti
@@ -374,3 +375,42 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
     for i in range(n):
         for j in range(m):
             assert C[i, j] == GT[i][j]
+
+
+@test_utils.test(arch=ti.cuda)
+def test_gpu_sparse_matrix():
+    h_row_csr = np.asarray([0, 3, 4, 7, 9], dtype=np.int32)
+    h_col_csr = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32)
+    h_value_csr = np.asarray([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
+                             dtype=np.float32)
+    h_X = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
+    h_Y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np.float32)
+
+    # Taichi ndarrays holding the CSR arrays of the sparse matrix A
+    idx_dt = ti.int32
+    val_dt = ti.f32
+    row_csr = ti.ndarray(shape=5, dtype=idx_dt)
+    col_csr = ti.ndarray(shape=9, dtype=idx_dt)
+    value_csr = ti.ndarray(shape=9, dtype=val_dt)
+    # Dense vector x
+    X = ti.ndarray(shape=4, dtype=val_dt)
+    # Result of A @ x
+    Y = ti.ndarray(shape=4, dtype=val_dt)
+
+    # Initialize the CSR arrays and vectors from the NumPy arrays
+    row_csr.from_numpy(h_row_csr)
+    col_csr.from_numpy(h_col_csr)
+    value_csr.from_numpy(h_value_csr)
+    X.from_numpy(h_X)
+    Y.fill(0.0)
+
+    # Define the CSR matrix A
+    A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
+
+    # Build the CSR matrix A from the Taichi ndarrays
+    A.build_csr_cusparse(value_csr, col_csr, row_csr)
+
+    # Compute Y = A @ X
+    A.spmv(X, Y)
+    for i in range(4):
+        assert Y[i] == h_Y[i]