From 2953a018aaec284892897098ae30fc954f3d2939 Mon Sep 17 00:00:00 2001
From: pengyu <6712304+FantasyVR@users.noreply.github.com>
Date: Mon, 14 Nov 2022 12:10:43 +0800
Subject: [PATCH] [lang] Build sparse matrix using ndarray (#6563)

Issue: #2906

### Brief Summary
Replace the array storing triplets in sparse matrix builder with
ndarray. It unifies the sparse matrix builder creation on the CPU and
GPU.

Co-authored-by: Yi Xu <xy_xuyi@foxmail.com>
---
 python/taichi/lang/kernel_impl.py             |  2 +-
 python/taichi/linalg/sparse_matrix.py         |  4 ++
 taichi/program/sparse_matrix.cpp              | 49 +++++++++----------
 taichi/program/sparse_matrix.h                | 11 +++--
 taichi/python/export_lang.cpp                 |  8 +--
 .../llvm/runtime_module/internal_functions.h  | 22 ++++-----
 tests/python/test_sparse_linear_solver.py     | 39 ++++-----------
 7 files changed, 58 insertions(+), 77 deletions(-)
diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py
index e16fefecd6f62..f8644c27b946b 100644
--- a/python/taichi/lang/kernel_impl.py
+++ b/python/taichi/lang/kernel_impl.py
@@ -680,7 +680,7 @@ def func__(*args):
                 elif isinstance(needed, sparse_matrix_builder):
                     # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument
                     launch_ctx.set_arg_uint(actual_argument_slot,
-                                            v._get_addr())
+                                            v._get_ndarray_addr())
                 elif isinstance(needed,
                                 ndarray_type.NdarrayType) and isinstance(
                                     v, taichi.lang._ndarray.Ndarray):
diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py
index 034d3805954a1..6224a9822b5b0 100644
--- a/python/taichi/linalg/sparse_matrix.py
+++ b/python/taichi/linalg/sparse_matrix.py
@@ -260,6 +260,10 @@ def _get_addr(self):
         """Get the address of the sparse matrix"""
         return self.ptr.get_addr()
 
+    def _get_ndarray_addr(self):
+        """Get the address of the ndarray"""
+        return self.ptr.get_ndarray_data_ptr()
+
     def print_triplets(self):
         """Print the triplets stored in the builder"""
         self.ptr.print_triplets()
diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp
index bf096ac57528e..cfe157a935550 100644
--- a/taichi/program/sparse_matrix.cpp
+++ b/taichi/program/sparse_matrix.cpp
@@ -79,53 +79,48 @@ SparseMatrixBuilder::SparseMatrixBuilder(int rows,
                                          int cols,
                                          int max_num_triplets,
                                          DataType dtype,
-                                         const std::string &storage_format)
+                                         const std::string &storage_format,
+                                         Program *prog)
     : rows_(rows),
       cols_(cols),
       max_num_triplets_(max_num_triplets),
       dtype_(dtype),
-      storage_format_(storage_format) {
+      storage_format_(storage_format),
+      prog_(prog) {
   auto element_size = data_type_size(dtype);
   TI_ASSERT((element_size == 4 || element_size == 8));
-  data_base_ptr_ =
-      std::make_unique<uchar[]>(max_num_triplets_ * 3 * element_size);
+  ndarray_data_base_ptr_ = std::make_unique<Ndarray>(
+      prog_, dtype_, std::vector<int>{3 * (int)max_num_triplets_ + 1});
 }
 
-template <typename T, typename G>
-void SparseMatrixBuilder::print_template() {
+void SparseMatrixBuilder::print_triplets() {
+  num_triplets_ = ndarray_data_base_ptr_->read_int(std::vector<int>{0});
   fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_,
              num_triplets_, max_num_triplets_);
-  T *data = reinterpret_cast<T *>(data_base_ptr_.get());
-  for (int64 i = 0; i < num_triplets_; i++) {
-    fmt::print("({}, {}) val={}\n", ((G *)data)[i * 3], ((G *)data)[i * 3 + 1],
-               taichi_union_cast<T>(data[i * 3 + 2]));
+  for (int i = 0; i < num_triplets_; i++) {
+    auto idx = 3 * i + 1;
+    auto row = ndarray_data_base_ptr_->read_int(std::vector<int>{idx});
+    auto col = ndarray_data_base_ptr_->read_int(std::vector<int>{idx + 1});
+    auto val = ndarray_data_base_ptr_->read_float(std::vector<int>{idx + 2});
+    fmt::print("[{}, {}] = {}\n", row, col, val);
   }
-  fmt::print("\n");
 }
 
-void SparseMatrixBuilder::print_triplets() {
-  auto element_size = data_type_size(dtype_);
-  switch (element_size) {
-    case 4:
-      print_template<float32, int32>();
-      break;
-    case 8:
-      print_template<float64, int64>();
-      break;
-    default:
-      TI_ERROR("Unsupported sparse matrix data type!");
-      break;
-  }
+intptr_t SparseMatrixBuilder::get_ndarray_data_ptr() const {
+  return prog_->get_ndarray_data_ptr_as_int(ndarray_data_base_ptr_.get());
 }
 
 template <typename T, typename G>
 void SparseMatrixBuilder::build_template(std::unique_ptr<SparseMatrix> &m) {
   using V = Eigen::Triplet<T>;
   std::vector<V> triplets;
-  T *data = reinterpret_cast<T *>(data_base_ptr_.get());
+  auto ptr = get_ndarray_data_ptr();
+  G *data = reinterpret_cast<G *>(ptr);
+  num_triplets_ = data[0];
+  data += 1;
   for (int i = 0; i < num_triplets_; i++) {
-    triplets.push_back(V(((G *)data)[i * 3], ((G *)data)[i * 3 + 1],
-                         taichi_union_cast<T>(data[i * 3 + 2])));
+    triplets.push_back(
+        V(data[i * 3], data[i * 3 + 1], taichi_union_cast<T>(data[i * 3 + 2])));
   }
   m->build_triplets(static_cast<void *>(&triplets));
   clear();
diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h
index 781f694f3a68f..4b976d6f176da 100644
--- a/taichi/program/sparse_matrix.h
+++ b/taichi/program/sparse_matrix.h
@@ -19,30 +19,31 @@ class SparseMatrixBuilder {
                       int cols,
                       int max_num_triplets,
                       DataType dtype,
-                      const std::string &storage_format);
+                      const std::string &storage_format,
+                      Program *prog);
 
   void print_triplets();
 
+  intptr_t get_ndarray_data_ptr() const;
+
   std::unique_ptr<SparseMatrix> build();
 
   void clear();
 
  private:
-  template <typename T, typename G>
-  void print_template();
-
   template <typename T, typename G>
   void build_template(std::unique_ptr<SparseMatrix> &);
 
  private:
   uint64 num_triplets_{0};
-  std::unique_ptr<uchar[]> data_base_ptr_{nullptr};
+  std::unique_ptr<Ndarray> ndarray_data_base_ptr_{nullptr};
   int rows_{0};
   int cols_{0};
   uint64 max_num_triplets_{0};
   bool built_{false};
   DataType dtype_{PrimitiveType::f32};
   std::string storage_format_{"col_major"};
+  Program *prog_{nullptr};
 };
 
 class SparseMatrix {
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 774894b1f78a9..7887ee4f78414 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -398,10 +398,11 @@ void export_lang(py::module &m) {
       .def("create_sparse_matrix_builder",
            [](Program *program, int n, int m, uint64 max_num_entries,
               DataType dtype, const std::string &storage_format) {
-             TI_ERROR_IF(!arch_is_cpu(program->this_thread_config().arch),
-                         "SparseMatrix Builder only supports CPU for now.");
+             TI_ERROR_IF(!arch_is_cpu(program->this_thread_config().arch) &&
+                             !arch_is_cuda(program->this_thread_config().arch),
+                         "SparseMatrix only supports CPU and CUDA for now.");
              return SparseMatrixBuilder(n, m, max_num_entries, dtype,
-                                        storage_format);
+                                        storage_format, program);
            })
       .def("create_sparse_matrix",
            [](Program *program, int n, int m, DataType dtype,
@@ -1204,6 +1205,7 @@ void export_lang(py::module &m) {
   // Sparse Matrix
   py::class_<SparseMatrixBuilder>(m, "SparseMatrixBuilder")
       .def("print_triplets", &SparseMatrixBuilder::print_triplets)
+      .def("get_ndarray_data_ptr", &SparseMatrixBuilder::get_ndarray_data_ptr)
       .def("build", &SparseMatrixBuilder::build)
       .def("get_addr", [](SparseMatrixBuilder *mat) { return uint64(mat); });
 
diff --git a/taichi/runtime/llvm/runtime_module/internal_functions.h b/taichi/runtime/llvm/runtime_module/internal_functions.h
index 89c23fd998877..ad66fa7cba3e1 100644
--- a/taichi/runtime/llvm/runtime_module/internal_functions.h
+++ b/taichi/runtime/llvm/runtime_module/internal_functions.h
@@ -9,15 +9,15 @@
     }                                                        \
   } while (0)
 
-#define ATOMIC_INSERT(T)                                             \
-  do {                                                               \
-    auto base_ptr = (int64 *)base_ptr_;                              \
-    int64 *num_triplets = base_ptr;                                  \
-    auto data_base_ptr = *(T **)(base_ptr + 1);                      \
-    auto triplet_id = atomic_add_i64(num_triplets, 1);               \
-    data_base_ptr[triplet_id * 3] = i;                               \
-    data_base_ptr[triplet_id * 3 + 1] = j;                           \
-    data_base_ptr[triplet_id * 3 + 2] = taichi_union_cast<T>(value); \
+#define ATOMIC_INSERT(T)                                                  \
+  do {                                                                    \
+    auto base_ptr = reinterpret_cast<int##T *>(base_ptr_);                \
+    int##T *num_triplets = base_ptr;                                      \
+    auto data_base_ptr = base_ptr + 1;                                    \
+    auto triplet_id = atomic_add_i##T(num_triplets, 1);                   \
+    data_base_ptr[triplet_id * 3] = i;                                    \
+    data_base_ptr[triplet_id * 3 + 1] = j;                                \
+    data_base_ptr[triplet_id * 3 + 2] = taichi_union_cast<int##T>(value); \
   } while (0);
 
 i32 do_nothing(RuntimeContext *context) {
@@ -36,7 +36,7 @@ i32 insert_triplet_f32(RuntimeContext *context,
                        int i,
                        int j,
                        float value) {
-  ATOMIC_INSERT(int32);
+  ATOMIC_INSERT(32);
   return 0;
 }
 
@@ -45,7 +45,7 @@ i32 insert_triplet_f64(RuntimeContext *context,
                        int i,
                        int j,
                        float64 value) {
-  ATOMIC_INSERT(int64);
+  ATOMIC_INSERT(64);
   return 0;
 }
 
diff --git a/tests/python/test_sparse_linear_solver.py b/tests/python/test_sparse_linear_solver.py
index f31e7bd7df988..cd48f0581b3af 100644
--- a/tests/python/test_sparse_linear_solver.py
+++ b/tests/python/test_sparse_linear_solver.py
@@ -4,40 +4,16 @@
 import taichi as ti
 from tests import test_utils
 
-"""
-The symmetric positive definite matrix is created in matlab using the following script:
-    A = diag([1,2,3,4]);
-    OrthM = [1 0 1 0; -1 -2 0 1; 0 1 -1 0; 0, 1, 0 1];
-    U = orth(OrthM);
-    Aarray = U * A * U';
-    b = [1,2,3,4]';
-    res = inv(A) * b;
-""" # yapf: disable
-
-Aarray = np.array([[
-    2.73999501130921, 0.518002544441220, 0.745119303009342, 0.0508907745638859
-], [0.518002544441220, 1.45111665837647, 0.757997555750432, 0.290885785873098],
-                   [
-                       0.745119303009342, 0.757997555750432, 2.96711176987733,
-                       -0.518002544441220
-                   ],
-                   [
-                       0.0508907745638859, 0.290885785873098,
-                       -0.518002544441220, 2.84177656043698
-                   ]])
-
-res = np.array([
-    -0.0754984396447588, 0.469972700892492, 1.18527357933586, 1.57686870529319
-])
-
 
 @pytest.mark.parametrize("dtype", [ti.f32])
 @pytest.mark.parametrize("solver_type", ["LLT", "LDLT", "LU"])
 @pytest.mark.parametrize("ordering", ["AMD", "COLAMD"])
 @test_utils.test(arch=ti.cpu)
 def test_sparse_LLT_solver(dtype, solver_type, ordering):
-    n = 4
-    Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100)
+    n = 10
+    A = np.random.rand(n, n)
+    A_psd = np.dot(A, A.transpose())
+    Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=300)
     b = ti.field(ti.f32, shape=n)
 
     @ti.kernel
@@ -48,16 +24,19 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
         for i in range(n):
             b[i] = i + 1
 
-    fill(Abuilder, Aarray, b)
+    fill(Abuilder, A_psd, b)
     A = Abuilder.build()
+    print(A)
     solver = ti.linalg.SparseSolver(dtype=dtype,
                                     solver_type=solver_type,
                                     ordering=ordering)
     solver.analyze_pattern(A)
     solver.factorize(A)
     x = solver.solve(b)
+
+    res = np.linalg.solve(A_psd, b.to_numpy())
     for i in range(n):
-        assert x[i] == test_utils.approx(res[i])
+        assert x[i] == test_utils.approx(res[i], rel=1.0)
 
 
 @test_utils.test(arch=ti.cuda)