diff --git a/misc/test_gpu_sm_op.py b/misc/test_gpu_sm_op.py new file mode 100644 index 0000000000000..2de61af1d9a48 --- /dev/null +++ b/misc/test_gpu_sm_op.py @@ -0,0 +1,84 @@ +import numpy as np +import scipy +from numpy.random import default_rng +from scipy import stats +from scipy.sparse import random + +import taichi as ti + +ti.init(arch=ti.cuda) + +idx_dt = ti.i32 +val_dt = ti.f32 + +seed = 2 +np.random.seed(seed) + +rng = default_rng(seed) +rvs = stats.poisson(3, loc=1).rvs +N = 5 +np_dtype = np.float32 +rows = N +cols = N - 1 + +S1 = random(rows, cols, density=0.5, random_state=rng, + data_rvs=rvs).astype(np_dtype) +S2 = random(rows, cols, density=0.5, random_state=rng, + data_rvs=rvs).astype(np_dtype) + +nnz_A = len(S1.data) +nnz_B = len(S2.data) + +coo_S1 = scipy.sparse.coo_matrix(S1) +coo_S2 = scipy.sparse.coo_matrix(S2) + +row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt) +col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt) +value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt) +row_coo_A.from_numpy(coo_S1.row) +col_coo_A.from_numpy(coo_S1.col) +value_coo_A.from_numpy(coo_S1.data) + +row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt) +col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt) +value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt) +row_coo_B.from_numpy(coo_S2.row) +col_coo_B.from_numpy(coo_S2.col) +value_coo_B.from_numpy(coo_S2.data) + +A = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32) +B = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32) +A.build_coo(row_coo_A, col_coo_A, value_coo_A) +B.build_coo(row_coo_B, col_coo_B, value_coo_B) + +print('>>>> A:') +print(A) +print('>>>> B:') +print(B) + +print('>>>> C = A + B:') +C = A + B +print(C) +print('>>>> verify:') +S3 = S1 + S2 +print(S3.A) +print('>>>> C - A:') +D = C - A +print(D) +print('>>>> verify:') +print((S3 - S1).A) +print('>>>> A * 2.5:') +E = A * 2.5 +print(E) +print('>>>> verify:') +print((2.5 * S1).A) +print('>>>> A.T:') +F = A.transpose() +print(F) +print('>>>> verify:') +print(S1.T.A) +print('>>>> A @ B.T:') +G = A @ B.transpose() +print(G) +print('>>>> verify:') +print((S1 @ S2.T).A) diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index f142766111c64..af11862dafe06 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -33,6 +33,28 @@ struct key_hash { return h1 ^ h2; } }; + +template +void print_triplet_from_csr(int64_t n_rows, + int n_cols, + T *row, + T1 *col, + T2 *value, + std::ostringstream &ostr) { + using Triplets = Eigen::Triplet; + std::vector trips; + for (int64_t i = 1; i <= n_rows; ++i) { + auto n_i = row[i] - row[i - 1]; + for (auto j = 0; j < n_i; ++j) { + trips.push_back({i - 1, col[row[i - 1] + j], value[row[i - 1] + j]}); + } + } + Eigen::SparseMatrix m(n_rows, n_cols); + m.setFromTriplets(trips.begin(), trips.end()); + Eigen::IOFormat clean_fmt(4, 0, ", ", "\n", "[", "]"); + ostr << Eigen::MatrixXf(m.cast()).format(clean_fmt); +} + } // namespace namespace taichi { @@ -169,6 +191,14 @@ std::unique_ptr make_cu_sparse_matrix(int rows, std::make_unique(rows, cols, dt)); } +std::unique_ptr make_cu_sparse_matrix(cusparseSpMatDescr_t mat, + int rows, + int cols, + DataType dt) { + return std::unique_ptr( + std::make_unique(mat, rows, cols, dt)); +} + template void build_ndarray_template(SparseMatrix &sm, intptr_t data_ptr, @@ -204,11 +234,11 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, int nnz) { #if defined(TI_WITH_CUDA) // Step 1: Sort coo first - cusparseHandle_t cusparse_handle = NULL; + cusparseHandle_t 
cusparse_handle = nullptr; CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); cusparseSpVecDescr_t vec_permutation; cusparseDnVecDescr_t vec_values; - void *d_permutation = NULL, *d_values_sorted = NULL; + void *d_permutation = nullptr, *d_values_sorted = nullptr; CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int)); CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float)); CUSPARSEDriver::get_instance().cpCreateSpVec( @@ -220,7 +250,7 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt( cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr, &bufferSize); - void *dbuffer = NULL; + void *dbuffer = nullptr; if (bufferSize > 0) CUDADriver::get_instance().malloc(&dbuffer, bufferSize); // Setup permutation vector to identity @@ -234,7 +264,7 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, CUDADriver::get_instance().memcpy_device_to_device( coo_values_ptr, d_values_sorted, nnz * sizeof(float)); // Step 2: coo to csr - void *csr_row_offset_ptr = NULL; + void *csr_row_offset_ptr = nullptr; CUDADriver::get_instance().malloc(&csr_row_offset_ptr, sizeof(int) * (rows_ + 1)); CUSPARSEDriver::get_instance().cpCoo2Csr( @@ -276,6 +306,285 @@ void make_sparse_matrix_from_ndarray_cusparse(Program *prog, #endif } +// Reference::https://docs.nvidia.com/cuda/cusparse/index.html#csrgeam2 +std::unique_ptr CuSparseMatrix::addition( + const CuSparseMatrix &other, + const float alpha, + const float beta) const { +#if defined(TI_WITH_CUDA) + // Get information of this matrix: A + size_t nrows_A = 0, ncols_A = 0, nnz_A = 0; + void *drow_offsets_A = nullptr, *dcol_indices_A = nullptr, + *dvalues_A = nullptr; + cusparseIndexType_t csrRowOffsetsType_A, csrColIndType_A; + cusparseIndexBase_t idxBase_A; + cudaDataType valueType_A; + TI_ASSERT(matrix_ != nullptr); + + CUSPARSEDriver::get_instance().cpCsrGet( + matrix_, &nrows_A, &ncols_A, &nnz_A, &drow_offsets_A, &dcol_indices_A, + &dvalues_A, &csrRowOffsetsType_A, &csrColIndType_A, &idxBase_A, + &valueType_A); + // Get information of other matrix: B + size_t nrows_B = 0, ncols_B = 0, nnz_B = 0; + void *drow_offsets_B = nullptr, *dcol_indices_B = nullptr, + *dvalues_B = nullptr; + cusparseIndexType_t csrRowOffsetsType_B, csrColIndType_B; + cusparseIndexBase_t idxBase_B; + cudaDataType valueType_B; + CUSPARSEDriver::get_instance().cpCsrGet( + other.matrix_, &nrows_B, &ncols_B, &nnz_B, &drow_offsets_B, + &dcol_indices_B, &dvalues_B, &csrRowOffsetsType_B, &csrColIndType_B, + &idxBase_B, &valueType_B); + + // Create sparse matrix: C + int *drow_offsets_C = nullptr; + int *dcol_indices_C = nullptr; + float *dvalues_C = nullptr; + cusparseMatDescr_t descrA = nullptr, descrB = nullptr, descrC = nullptr; + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrA); + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrB); + CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrC); + CUSPARSEDriver::get_instance().cpSetMatType(descrA, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatType(descrB, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatType(descrC, + CUSPARSE_MATRIX_TYPE_GENERAL); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrC, + CUSPARSE_INDEX_BASE_ZERO); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrA, + CUSPARSE_INDEX_BASE_ZERO); + CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrB, + CUSPARSE_INDEX_BASE_ZERO); + + // Start to do addition + cusparseHandle_t 
cusparse_handle; + CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); + // alpha, nnzTotalDevHostPtr points to host memory + size_t BufferSizeInBytes; + char *buffer = nullptr; + int nnzC; + int *nnzTotalDevHostPtr = &nnzC; + CUSPARSEDriver::get_instance().cpSetPointerMode(cusparse_handle, + CUSPARSE_POINTER_MODE_HOST); + CUDADriver::get_instance().malloc((void **)(&drow_offsets_C), + sizeof(int) * (nrows_A + 1)); + // Prepare buffer + CUSPARSEDriver::get_instance().cpScsrgeam2_bufferSizeExt( + cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A, + dvalues_A, drow_offsets_A, dcol_indices_A, (void *)&beta, descrB, nnz_B, + dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C, + drow_offsets_C, dcol_indices_C, &BufferSizeInBytes); + + if (BufferSizeInBytes > 0) + CUDADriver::get_instance().malloc((void **)(&buffer), BufferSizeInBytes); + + // Determine drow_offsets_C and the total number of nonzero elements. + CUSPARSEDriver::get_instance().cpXcsrgeam2Nnz( + cusparse_handle, nrows_A, ncols_A, descrA, nnz_A, drow_offsets_A, + dcol_indices_A, descrB, nnz_B, drow_offsets_B, dcol_indices_B, descrC, + drow_offsets_C, nnzTotalDevHostPtr, buffer); + + int baseC; + if (nullptr != nnzTotalDevHostPtr) { + nnzC = *nnzTotalDevHostPtr; + } else { + CUDADriver::get_instance().memcpy_device_to_host( + (void *)(&nnzC), (void *)(drow_offsets_C + nrows_A), sizeof(int)); + CUDADriver::get_instance().memcpy_device_to_host( + (void *)(&baseC), (void *)(drow_offsets_C), sizeof(int)); + nnzC -= baseC; + } + + CUDADriver::get_instance().malloc((void **)&dcol_indices_C, + sizeof(int) * nnzC); + CUDADriver::get_instance().malloc((void **)&dvalues_C, sizeof(float) * nnzC); + + CUSPARSEDriver::get_instance().cpScsrgeam2( + cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A, + dvalues_A, drow_offsets_A, dcol_indices_A, (void *)(&beta), descrB, nnz_B, + dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C, + drow_offsets_C, dcol_indices_C, buffer); + + cusparseSpMatDescr_t matrix_C; + CUSPARSEDriver::get_instance().cpCreateCsr( + &matrix_C, rows_, cols_, nnzC, drow_offsets_C, dcol_indices_C, dvalues_C, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); + + CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle); + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA); + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrB); + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrC); + CUDADriver::get_instance().mem_free(buffer); + return make_cu_sparse_matrix(matrix_C, rows_, cols_, PrimitiveType::f32); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + +std::unique_ptr CuSparseMatrix::matmul( + const CuSparseMatrix &other) const { +#if defined(TI_WITH_CUDA) + return gemm(other, 1.0f, 1.0f); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + +// Reference: +// https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/spgemm +std::unique_ptr CuSparseMatrix::gemm(const CuSparseMatrix &other, + const float alpha, + const float beta) const { +#if defined(TI_WITH_CUDA) + cusparseHandle_t handle; + CUSPARSEDriver::get_instance().cpCreate(&handle); + cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE; + + size_t nrows_A = rows_; + size_t ncols_B = other.cols_; + auto mat_A = matrix_; + auto mat_B = other.matrix_; + + // 1. 
create resulting matrix `C` + cusparseSpMatDescr_t mat_C; + CUSPARSEDriver::get_instance().cpCreateCsr( + &mat_C, nrows_A, ncols_B, 0, nullptr, nullptr, nullptr, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); + + // 2. create gemm descr + cusparseSpGEMMDescr_t spgemm_desc; + CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc); + + // 3. ask buffer_size1 bytes for external memory + void *d_buffer1; + size_t buffer_size1 = 0; + CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation( + handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C, + CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1, nullptr); + CUDADriver::get_instance().malloc((void **)&d_buffer1, buffer_size1); + // 4. inspect the matrices A and B to understand the memory requirement for + // the next step + CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation( + handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C, + CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1, + d_buffer1); + + // 5. ask buffer_size2 bytes for external memory + size_t buffer_size2 = 0; + CUSPARSEDriver::get_instance().cpSpGEMM_compute( + handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr); + void *d_buffer2; + CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2); + + // 6. compute the intermediate product of A * B + CUSPARSEDriver::get_instance().cpSpGEMM_compute( + handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, d_buffer2); + + // 7. get info of matrix C + size_t nrows_C, cols_C, nnz_C; + CUSPARSEDriver::get_instance().cpGetSize(mat_C, &nrows_C, &cols_C, &nnz_C); + + // 8. allocate matric C + int *d_csr_row_ptr_C, *d_csr_col_ind_C; + float *d_values_C; + CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_C, + (nrows_A + 1) * sizeof(int)); + CUDADriver::get_instance().malloc((void **)&d_csr_col_ind_C, + nnz_C * sizeof(int)); + CUDADriver::get_instance().malloc((void **)&d_values_C, + nnz_C * sizeof(float)); + + // 9. update matrix C with new pointers + CUSPARSEDriver::get_instance().cpCsrSetPointers(mat_C, d_csr_row_ptr_C, + d_csr_col_ind_C, d_values_C); + + // 10. copy the final products of C. + CUSPARSEDriver::get_instance().cpSpGEMM_copy( + handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemm_desc); + + CUDADriver::get_instance().mem_free(d_buffer1); + CUDADriver::get_instance().mem_free(d_buffer2); + CUSPARSEDriver::get_instance().cpDestroy(handle); + CUSPARSEDriver::get_instance().cpDestroySpGEMM(spgemm_desc); + + return make_cu_sparse_matrix(mat_C, nrows_A, ncols_B, PrimitiveType::f32); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + +// Convert CSR to CSC format using routine `Csr2cscEx2` +// to implement transpose. 
+// Reference +// https://stackoverflow.com/questions/57368010/how-to-transpose-a-sparse-matrix-in-cusparse +std::unique_ptr<SparseMatrix> CuSparseMatrix::transpose() const { +#if defined(TI_WITH_CUDA) + cusparseHandle_t handle; + CUSPARSEDriver::get_instance().cpCreate(&handle); + size_t nrows_A, ncols_A, nnz; + void *d_csr_val = nullptr, *d_csr_val_AT = nullptr; + int *d_csr_row_ptr = nullptr, *d_csr_col_ind = nullptr; + int *d_csr_row_ptr_AT = nullptr, *d_csr_col_ptr_AT = nullptr; + cusparseIndexType_t csr_row_ptr_type, csr_col_ptr_type; + cusparseIndexBase_t idx_base_type; + cudaDataType value_type; + size_t buffer_size; + + // 1. get pointers of A + CUSPARSEDriver::get_instance().cpCsrGet( + matrix_, &nrows_A, &ncols_A, &nnz, (void **)&d_csr_row_ptr, + (void **)&d_csr_col_ind, (void **)&d_csr_val, &csr_row_ptr_type, + &csr_col_ptr_type, &idx_base_type, &value_type); + + // 2. ask buffer size for Csr2cscEx2 + CUSPARSEDriver::get_instance().cpCsr2cscEx2_bufferSize( + handle, nrows_A, ncols_A, nnz, (void *)&d_csr_val, (int *)&d_csr_row_ptr, + (int *)&d_csr_col_ind, (void *)&d_csr_val_AT, (int *)&d_csr_row_ptr_AT, + (int *)&d_csr_col_ptr_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_size); + void *buffer = nullptr; + CUDADriver::get_instance().malloc((void **)&buffer, buffer_size); + + CUDADriver::get_instance().malloc((void **)&d_csr_val_AT, + nnz * sizeof(float)); + CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_AT, + (ncols_A + 1) * sizeof(int)); + CUDADriver::get_instance().malloc((void **)&d_csr_col_ptr_AT, + nnz * sizeof(int)); + + // 3. execute Csr2cscEx2 + CUSPARSEDriver::get_instance().cpCsr2cscEx2( + handle, nrows_A, ncols_A, nnz, d_csr_val, d_csr_row_ptr, d_csr_col_ind, + d_csr_val_AT, d_csr_row_ptr_AT, d_csr_col_ptr_AT, CUDA_R_32F, + CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, + buffer); + + // 4. create AT. 
+ cusparseSpMatDescr_t mat_AT; + CUSPARSEDriver::get_instance().cpCreateCsr( + &mat_AT, ncols_A, nrows_A, nnz, (void *)d_csr_row_ptr_AT, + (void *)d_csr_col_ptr_AT, (void *)d_csr_val_AT, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUDADriver::get_instance().mem_free(buffer); + CUSPARSEDriver::get_instance().cpDestroy(handle); + return make_cu_sparse_matrix(mat_AT, ncols_A, nrows_A, PrimitiveType::f32); +#else + TI_NOT_IMPLEMENTED; + return std::unique_ptr(); +#endif +} + void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) { #if defined(TI_WITH_CUDA) size_t dX = prog->get_ndarray_data_ptr_as_int(&x); @@ -295,7 +604,7 @@ void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) { cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX, &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize); - void *dBuffer = NULL; + void *dBuffer = nullptr; if (bufferSize > 0) CUDADriver::get_instance().malloc(&dBuffer, bufferSize); CUSPARSEDriver::get_instance().cpSpMV( @@ -309,5 +618,34 @@ void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, Ndarray &y) { #endif } +const std::string CuSparseMatrix::to_string() const { + std::ostringstream ostr; +#ifdef TI_WITH_CUDA + size_t rows, cols, nnz; + float *dR; + int *dC, *dV; + cusparseIndexType_t row_type, column_type; + cusparseIndexBase_t idx_base; + cudaDataType value_type; + CUSPARSEDriver::get_instance().cpCsrGet( + matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV, + &row_type, &column_type, &idx_base, &value_type); + + auto *hR = new int[rows + 1]; + auto *hC = new int[nnz]; + auto *hV = new float[nnz]; + + CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR, + (rows + 1) * sizeof(int)); + CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC, + (nnz) * sizeof(int)); + CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV, + (nnz) * sizeof(float)); + + print_triplet_from_csr(rows, cols, hR, hC, hV, ostr); +#endif + return ostr.str(); +} + } // namespace lang } // namespace taichi diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index 53d262919cb06..2065703bdb252 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -126,6 +126,7 @@ class EigenSparseMatrix : public SparseMatrix { } ~EigenSparseMatrix() override = default; + void build_triplets(void *triplets_adr) override; const std::string to_string() const override; @@ -211,19 +212,64 @@ class CuSparseMatrix : public SparseMatrix { } #endif } + explicit CuSparseMatrix(cusparseSpMatDescr_t A, + int rows, + int cols, + DataType dt) + : SparseMatrix(rows, cols, dt), matrix_(A) { + } + CuSparseMatrix(const CuSparseMatrix &sm) + : SparseMatrix(sm.rows_, sm.cols_, sm.dtype_), matrix_(sm.matrix_) { + } virtual ~CuSparseMatrix(); + + // TODO: Overload +=, -= and *= + friend std::unique_ptr operator+(const CuSparseMatrix &lhs, + const CuSparseMatrix &rhs) { + auto m = lhs.addition(rhs, 1.0, 1.0); + return m; + }; + + friend std::unique_ptr operator-(const CuSparseMatrix &lhs, + const CuSparseMatrix &rhs) { + return lhs.addition(rhs, 1.0, -1.0); + }; + + friend std::unique_ptr operator*(const CuSparseMatrix &sm, + float scale) { + return sm.addition(sm, scale, 0.0); + } + + friend std::unique_ptr operator*(float scale, + const CuSparseMatrix &sm) { + return sm.addition(sm, scale, 0.0); + } + + std::unique_ptr addition(const CuSparseMatrix &other, + const float alpha, + const float 
beta) const; + + std::unique_ptr matmul(const CuSparseMatrix &other) const; + + std::unique_ptr gemm(const CuSparseMatrix &other, + const float alpha, + const float beta) const; + + std::unique_ptr transpose() const; + void build_csr_from_coo(void *coo_row_ptr, void *coo_col_ptr, void *coo_values_ptr, int nnz) override; + void spmv(Program *prog, const Ndarray &x, Ndarray &y); const void *get_matrix() const override { return &matrix_; }; - void print_info(); + const std::string to_string() const override; private: cusparseSpMatDescr_t matrix_; @@ -237,6 +283,10 @@ std::unique_ptr make_sparse_matrix( std::unique_ptr make_cu_sparse_matrix(int rows, int cols, DataType dt); +std::unique_ptr make_cu_sparse_matrix(cusparseSpMatDescr_t mat, + int rows, + int cols, + DataType dt); void make_sparse_matrix_from_ndarray(Program *prog, SparseMatrix &sm, diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 92a3067f0f0bc..3795ba380aebb 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -1221,7 +1221,15 @@ void export_lang(py::module &m) { py::class_(m, "CuSparseMatrix") .def(py::init()) - .def("spmv", &CuSparseMatrix::spmv); + .def(py::init()) + .def("spmv", &CuSparseMatrix::spmv) + .def(py::self + py::self) + .def(py::self - py::self) + .def(py::self * float32()) + .def(float32() * py::self) + .def("matmul", &CuSparseMatrix::matmul) + .def("transpose", &CuSparseMatrix::transpose) + .def("to_string", &CuSparseMatrix::to_string); py::class_(m, "SparseSolver") .def("compute", &SparseSolver::compute) diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h index 88bb33951e3e7..d44abd7124992 100644 --- a/taichi/rhi/cuda/cuda_types.h +++ b/taichi/rhi/cuda/cuda_types.h @@ -447,6 +447,10 @@ struct cusparseSpMatDescr; typedef struct cusparseSpVecDescr *cusparseSpVecDescr_t; typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t; typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t; + +struct cusparseSpGEMMDescr; +typedef struct cusparseSpGEMMDescr *cusparseSpGEMMDescr_t; + typedef enum { CUSPARSE_INDEX_16U = 1, ///< 16-bit unsigned integer for matrix/vector ///< indices @@ -504,6 +508,27 @@ typedef enum { CUSPARSE_SPMV_COO_ALG2 = 4 } cusparseSpMVAlg_t; +typedef enum { + CUSPARSE_SPGEMM_DEFAULT = 0, + CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC = 1, + CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2 +} cusparseSpGEMMAlg_t; + +typedef enum { + CUSPARSE_POINTER_MODE_HOST = 0, + CUSPARSE_POINTER_MODE_DEVICE = 1 +} cusparsePointerMode_t; + +typedef enum { + CUSPARSE_ACTION_SYMBOLIC = 0, + CUSPARSE_ACTION_NUMERIC = 1 +} cusparseAction_t; + +typedef enum { + CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc + CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc +} cusparseCsr2CscAlg_t; + typedef enum { CUSPARSE_MATRIX_TYPE_GENERAL = 0, CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h index 1a70e36de7883..53e7881e036b7 100644 --- a/taichi/rhi/cuda/cusparse_functions.inc.h +++ b/taichi/rhi/cuda/cusparse_functions.inc.h @@ -10,6 +10,7 @@ PER_CUSPARSE_FUNCTION(cpCreateCsr, cusparseCreateCsr, cusparseSpMatDescr_t*, int PER_CUSPARSE_FUNCTION(cpCoo2Csr, cusparseXcoo2csr, cusparseHandle_t ,const void*, int, int,void*, cusparseIndexBase_t ); PER_CUSPARSE_FUNCTION(cpCsrGet, cusparseCsrGet, cusparseSpMatDescr_t ,size_t *,size_t *,size_t *,void**,void**,void**,cusparseIndexType_t* ,cusparseIndexType_t* ,cusparseIndexBase_t* ,cudaDataType*); 
PER_CUSPARSE_FUNCTION(cpCreateMatDescr, cusparseCreateMatDescr, cusparseMatDescr_t*); +PER_CUSPARSE_FUNCTION(cpDestroyMatDescr, cusparseDestroyMatDescr, cusparseMatDescr_t); PER_CUSPARSE_FUNCTION(cpSetMatType, cusparseSetMatType, cusparseMatDescr_t, cusparseMatrixType_t); PER_CUSPARSE_FUNCTION(cpSetMatIndexBase, cusparseSetMatIndexBase, cusparseMatDescr_t, cusparseIndexBase_t); PER_CUSPARSE_FUNCTION(cpDestroySpMat, cusparseDestroySpMat, cusparseSpMatDescr_t); @@ -19,6 +20,8 @@ PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermuta PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*); PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*); PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t); +PER_CUSPARSE_FUNCTION(cpSetPointerMode, cusparseSetPointerMode, cusparseHandle_t, cusparsePointerMode_t); +PER_CUSPARSE_FUNCTION(cpCsrSetPointers, cusparseCsrSetPointers, cusparseSpMatDescr_t,void*, void*, void*); // cusparse dense vector description PER_CUSPARSE_FUNCTION(cpCreateDnVec, cusparseCreateDnVec, cusparseDnVecDescr_t*, int, void*, cudaDataType); @@ -27,3 +30,19 @@ PER_CUSPARSE_FUNCTION(cpDestroyDnVec, cusparseDestroyDnVec, cusparseDnVecDescr_t // cusparse sparse matrix-vector multiplication PER_CUSPARSE_FUNCTION(cpSpMV_bufferSize, cusparseSpMV_bufferSize, cusparseHandle_t, cusparseOperation_t, const void*,cusparseSpMatDescr_t, cusparseDnVecDescr_t,const void*, cusparseDnVecDescr_t,cudaDataType, cusparseSpMVAlg_t, size_t*); PER_CUSPARSE_FUNCTION(cpSpMV, cusparseSpMV, cusparseHandle_t, cusparseOperation_t, const void*,cusparseSpMatDescr_t, cusparseDnVecDescr_t,const void*, cusparseDnVecDescr_t,cudaDataType, cusparseSpMVAlg_t, void*); + + +// cusparse sparse matrix-matrix operation +PER_CUSPARSE_FUNCTION(cpScsrgeam2_bufferSizeExt, cusparseScsrgeam2_bufferSizeExt, cusparseHandle_t ,int,int,void*,const cusparseMatDescr_t ,int, void*, void*, void*,void*,const cusparseMatDescr_t ,int, void*,void*,void*,const cusparseMatDescr_t ,void*,void*,void*,void*); +PER_CUSPARSE_FUNCTION(cpGetSize, cusparseSpMatGetSize, cusparseSpMatDescr_t, size_t* , size_t* , size_t*); +PER_CUSPARSE_FUNCTION(cpXcsrgeam2Nnz, cusparseXcsrgeam2Nnz, cusparseHandle_t,int,int,const cusparseMatDescr_t ,int,void*,void*,const cusparseMatDescr_t ,int,void*,void*,const cusparseMatDescr_t ,void*,void*,void*); +PER_CUSPARSE_FUNCTION(cpScsrgeam2, cusparseScsrgeam2, cusparseHandle_t,int,int,void*,const cusparseMatDescr_t ,int, void*,void*,void*,void*,const cusparseMatDescr_t ,int,void*,void*,void*,const cusparseMatDescr_t ,void*,void*,void*,void*); +PER_CUSPARSE_FUNCTION(cpSpGEMM_workEstimation, cusparseSpGEMM_workEstimation, cusparseHandle_t,cusparseOperation_t,cusparseOperation_t,const void*,cusparseSpMatDescr_t,cusparseSpMatDescr_t,const void*,cusparseSpMatDescr_t,cudaDataType,cusparseSpGEMMAlg_t,cusparseSpGEMMDescr_t,size_t*,void*); +PER_CUSPARSE_FUNCTION(cpSpGEMM_compute, cusparseSpGEMM_compute, cusparseHandle_t,cusparseOperation_t,cusparseOperation_t,const void*,cusparseSpMatDescr_t,cusparseSpMatDescr_t,const void*,cusparseSpMatDescr_t,cudaDataType,cusparseSpGEMMAlg_t,cusparseSpGEMMDescr_t,size_t*,void*); +PER_CUSPARSE_FUNCTION(cpSpGEMM_copy, cusparseSpGEMM_copy, cusparseHandle_t,cusparseOperation_t,cusparseOperation_t,const void*,cusparseSpMatDescr_t,cusparseSpMatDescr_t,const 
void*,cusparseSpMatDescr_t,cudaDataType,cusparseSpGEMMAlg_t,cusparseSpGEMMDescr_t); +PER_CUSPARSE_FUNCTION(cpCreateSpGEMM, cusparseSpGEMM_createDescr, cusparseSpGEMMDescr_t*); +PER_CUSPARSE_FUNCTION(cpDestroySpGEMM, cusparseSpGEMM_destroyDescr, cusparseSpGEMMDescr_t); + +// cusparse sparse matrix conversions +PER_CUSPARSE_FUNCTION(cpCsr2cscEx2_bufferSize, cusparseCsr2cscEx2_bufferSize, cusparseHandle_t, int, int, int, const void*, const int*, const int*, void*, int*, int*, cudaDataType, cusparseAction_t, cusparseIndexBase_t, cusparseCsr2CscAlg_t, size_t*); +PER_CUSPARSE_FUNCTION(cpCsr2cscEx2, cusparseCsr2cscEx2, cusparseHandle_t, int, int, int, const void*, const int*, const int*, void*, int*, int*, cudaDataType, cusparseAction_t, cusparseIndexBase_t, cusparseCsr2CscAlg_t, void*);
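Reviewer note (not part of the patch): for anyone without a CUDA device, below is a minimal CPU-side sketch of the semantics the new kernels are expected to match: csrgeam2 computes C = alpha*A + beta*B, the SpGEMM path computes A @ B, Csr2cscEx2 realizes transpose as a CSR-to-CSC conversion, and to_string() rebuilds (row, col, value) triplets from the raw CSR arrays the same way print_triplet_from_csr does. The helper name csr_to_triplets is illustrative only; everything here uses plain NumPy/SciPy.

import numpy as np
import scipy.sparse as sp

def csr_to_triplets(indptr, indices, data):
    # Mirror of print_triplet_from_csr: walk the row-offset array and
    # emit one (row, col, value) triplet per stored entry.
    rows, cols, vals = [], [], []
    for i in range(1, len(indptr)):
        for k in range(indptr[i - 1], indptr[i]):
            rows.append(i - 1)
            cols.append(indices[k])
            vals.append(data[k])
    return rows, cols, vals

rng = np.random.default_rng(0)
A = sp.random(5, 4, density=0.5, random_state=rng, format='csr', dtype=np.float32)
B = sp.random(5, 4, density=0.5, random_state=rng, format='csr', dtype=np.float32)
alpha, beta = 1.0, -1.0

C = alpha * A + beta * B   # what the cpScsrgeam2 path produces (here: A - B)
P = A @ B.T                # what the SpGEMM path (gemm/matmul) produces
AT = A.T.tocsr()           # transpose, i.e. reinterpret CSR of A as CSC of A^T

r, c, v = csr_to_triplets(A.indptr, A.indices, A.data)
A_rebuilt = sp.coo_matrix((v, (r, c)), shape=A.shape).toarray()
assert np.allclose(A_rebuilt, A.toarray())
assert np.allclose(C.toarray(), A.toarray() - B.toarray())
assert np.allclose(P.toarray(), A.toarray() @ B.toarray().T)
assert np.allclose(AT.toarray(), A.toarray().T)

This is the same set of identities the checks in misc/test_gpu_sm_op.py exercise against SciPy, just without Taichi in the loop.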