From 8630f4d25e80d64ea72ce1031d43883228203b29 Mon Sep 17 00:00:00 2001
From: Yi Xu <xy_xuyi@foxmail.com>
Date: Mon, 27 Jun 2022 17:33:11 +0800
Subject: [PATCH] [Lang] [type] Refine SNode with quant 2/n: Enable struct for
 on bit_array with bit_vectorize off (#5253)

* [Lang] [type] Refine SNode with quant 2/n: Enable struct for on bit_array with bit_vectorize off

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 taichi/codegen/llvm/codegen_llvm.cpp | 70 ++++++++++++----------------
 taichi/ir/frontend_ir.h              |  4 +-
 taichi/ir/ir_builder.h               |  6 +--
 taichi/ir/snode.cpp                  |  3 +-
 taichi/ir/statements.cpp             |  1 +
 taichi/ir/statements.h               |  1 +
 taichi/transforms/offload.cpp        |  7 +--
 tests/python/test_bit_array.py       | 34 ++++++++++++++
 8 files changed, 75 insertions(+), 51 deletions(-)
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 5776aca5a66cc..8fc36feaec3b6 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -241,6 +241,9 @@ std::unique_ptr<RuntimeObject> CodeGenLLVM::emit_struct_meta_object(
     meta =
         std::make_unique<RuntimeObject>("BitmaskedMeta", this, builder.get());
     emit_struct_meta_base("Bitmasked", meta->ptr, snode);
+  } else if (snode->type == SNodeType::bit_array) {
+    meta = std::make_unique<RuntimeObject>("DenseMeta", this, builder.get());
+    emit_struct_meta_base("Dense", meta->ptr, snode);
   } else {
     TI_P(snode_type_name(snode->type));
     TI_NOT_IMPLEMENTED;
@@ -1725,16 +1728,16 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
   llvm::Function *body = nullptr;
   auto leaf_block = stmt->snode;
 
-  // When looping over bit_arrays, we always vectorize and generate struct for
-  // on their parent node (usually "dense") instead of itself for higher
-  // performance. Also, note that the loop must be bit_vectorized for
-  // bit_arrays, and their parent must be "dense".
-  if (leaf_block->type == SNodeType::bit_array) {
-    if (leaf_block->parent->type == SNodeType::dense) {
+  // For a bit-vectorized loop over a bit array, we generate struct for on its
+  // parent node (must be "dense") instead of itself for higher performance.
+  if (stmt->bit_vectorize != 1) {
+    if (leaf_block->type == SNodeType::bit_array &&
+        leaf_block->parent->type == SNodeType::dense) {
       leaf_block = leaf_block->parent;
     } else {
       TI_ERROR(
-          "Struct-for looping through bit array but its parent is not dense")
+          "A bit-vectorized struct-for must loop over a bit array with a dense "
+          "parent");
     }
   }
 
@@ -1866,20 +1869,14 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
     create_call(refine, {parent_coordinates, new_coordinates,
                          builder->CreateLoad(loop_index)});
 
-    // One more refine step is needed for bit_arrays to make final coordinates
-    // non-consecutive, since each thread will process multiple
-    // coordinates via vectorization
-    if (stmt->snode->type == SNodeType::bit_array && stmt->snode->parent) {
-      if (stmt->snode->parent->type == SNodeType::dense) {
-        refine =
-            get_runtime_function(stmt->snode->refine_coordinates_func_name());
-
-        create_call(refine,
-                    {new_coordinates, new_coordinates, tlctx->get_constant(0)});
-      } else {
-        TI_ERROR(
-            "Struct-for looping through bit array but its parent is not dense");
-      }
+    // For a bit-vectorized loop over a bit array, one more refine step is
+    // needed to make final coordinates non-consecutive, since each thread will
+    // process multiple coordinates via vectorization
+    if (stmt->bit_vectorize != 1) {
+      refine =
+          get_runtime_function(stmt->snode->refine_coordinates_func_name());
+      create_call(refine,
+                  {new_coordinates, new_coordinates, tlctx->get_constant(0)});
     }
 
     current_coordinates = new_coordinates;
@@ -1889,37 +1886,28 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
     //  - if leaf block is bitmasked, make sure we only loop over active
     //    voxels
     auto exec_cond = tlctx->get_constant(true);
-    auto snode = stmt->snode;
-    if (snode->type == SNodeType::bit_array && snode->parent) {
-      if (snode->parent->type == SNodeType::dense) {
-        snode = snode->parent;
-      } else {
-        TI_ERROR(
-            "Struct-for looping through bit array but its parent is not dense");
-      }
-    }
-
     auto coord_object = RuntimeObject(kLLVMPhysicalCoordinatesName, this,
                                       builder.get(), new_coordinates);
     if (!prog->config.packed) {
-      for (int i = 0; i < snode->num_active_indices; i++) {
-        auto j = snode->physical_index_position[i];
+      for (int i = 0; i < leaf_block->num_active_indices; i++) {
+        auto j = leaf_block->physical_index_position[i];
         if (!bit::is_power_of_two(
-                snode->extractors[j].num_elements_from_root)) {
+                leaf_block->extractors[j].num_elements_from_root)) {
           auto coord = coord_object.get("val", tlctx->get_constant(j));
           exec_cond = builder->CreateAnd(
-              exec_cond, builder->CreateICmp(
-                             llvm::CmpInst::ICMP_SLT, coord,
-                             tlctx->get_constant(
-                                 snode->extractors[j].num_elements_from_root)));
+              exec_cond,
+              builder->CreateICmp(
+                  llvm::CmpInst::ICMP_SLT, coord,
+                  tlctx->get_constant(
+                      leaf_block->extractors[j].num_elements_from_root)));
         }
       }
     }
 
-    if (snode->type == SNodeType::bitmasked ||
-        snode->type == SNodeType::pointer) {
+    if (leaf_block->type == SNodeType::bitmasked ||
+        leaf_block->type == SNodeType::pointer) {
       // test whether the current voxel is active or not
-      auto is_active = call(snode, element.get("element"), "is_active",
+      auto is_active = call(leaf_block, element.get("element"), "is_active",
                             {builder->CreateLoad(loop_index)});
       is_active =
           builder->CreateTrunc(is_active, llvm::Type::getInt1Ty(*llvm_context));
diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
index 5acb9393f093a..e5fc3027156b8 100644
--- a/taichi/ir/frontend_ir.h
+++ b/taichi/ir/frontend_ir.h
@@ -14,7 +14,7 @@
 TLANG_NAMESPACE_BEGIN
 
 struct ForLoopConfig {
-  int bit_vectorize{0};
+  int bit_vectorize{1};
   int num_cpu_threads{0};
   bool strictly_serialized{false};
   MemoryAccessOptions mem_access_opt;
@@ -803,7 +803,7 @@ class ASTBuilder {
     }
 
     void reset() {
-      config.bit_vectorize = -1;
+      config.bit_vectorize = 1;
       config.num_cpu_threads = 0;
       config.uniform = false;
       config.mem_access_opt.clear();
diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h
index 48fd37a078b9d..28a62611c9b37 100644
--- a/taichi/ir/ir_builder.h
+++ b/taichi/ir/ir_builder.h
@@ -103,17 +103,17 @@ class IRBuilder {
   // Control flows.
   RangeForStmt *create_range_for(Stmt *begin,
                                  Stmt *end,
-                                 int bit_vectorize = -1,
+                                 int bit_vectorize = 1,
                                  int num_cpu_threads = 0,
                                  int block_dim = 0,
                                  bool strictly_serialized = false);
   StructForStmt *create_struct_for(SNode *snode,
-                                   int bit_vectorize = -1,
+                                   int bit_vectorize = 1,
                                    int num_cpu_threads = 0,
                                    int block_dim = 0);
   MeshForStmt *create_mesh_for(mesh::Mesh *mesh,
                                mesh::MeshElementType element_type,
-                               int bit_vectorize = -1,
+                               int bit_vectorize = 1,
                                int num_cpu_threads = 0,
                                int block_dim = 0);
   WhileStmt *create_while_true();
diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp
index 06c45e67a8e74..30f9878dc7b03 100644
--- a/taichi/ir/snode.cpp
+++ b/taichi/ir/snode.cpp
@@ -243,8 +243,7 @@ std::string SNode::get_node_type_name() const {
 
 std::string SNode::get_node_type_name_hinted() const {
   std::string suffix;
-  if (type == SNodeType::place || type == SNodeType::bit_struct ||
-      type == SNodeType::bit_array)
+  if (type == SNodeType::place || type == SNodeType::bit_struct)
     suffix = fmt::format("<{}>", dt->to_string());
   if (is_bit_level)
     suffix += "<bit>";
diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp
index 85298ee206b65..2c1511d2d90bf 100644
--- a/taichi/ir/statements.cpp
+++ b/taichi/ir/statements.cpp
@@ -392,6 +392,7 @@ std::unique_ptr<Stmt> OffloadedStmt::clone() const {
   new_stmt->grid_dim = grid_dim;
   new_stmt->block_dim = block_dim;
   new_stmt->reversed = reversed;
+  new_stmt->bit_vectorize = bit_vectorize;
   new_stmt->num_cpu_threads = num_cpu_threads;
   new_stmt->index_offsets = index_offsets;
 
diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
index 08215d21172ee..68cfa719d916b 100644
--- a/taichi/ir/statements.h
+++ b/taichi/ir/statements.h
@@ -1172,6 +1172,7 @@ class OffloadedStmt : public Stmt {
   int grid_dim{1};
   int block_dim{1};
   bool reversed{false};
+  int bit_vectorize{1};
   int num_cpu_threads{1};
   Stmt *end_stmt{nullptr};
   std::string range_hint = "";
diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp
index 135e0c7053e16..abc6935ed8419 100644
--- a/taichi/transforms/offload.cpp
+++ b/taichi/transforms/offload.cpp
@@ -189,9 +189,9 @@ class Offloader {
     if (!demotable) {
       for (int i = 1; i < path.size(); i++) {
         auto snode_child = path[i];
-        if ((snode_child->type == SNodeType::bit_array ||
-             snode_child->type == SNodeType::bit_struct) &&
-            i == path.size() - 1) {
+        if (snode_child->type == SNodeType::bit_array &&
+            for_stmt->bit_vectorize != 1) {
+          TI_ASSERT(i == path.size() - 1);
           continue;
         }
         auto offloaded_clear_list = Stmt::make_typed<OffloadedStmt>(
@@ -248,6 +248,7 @@ class Offloader {
     }
 
     offloaded_struct_for->snode = for_stmt->snode;
+    offloaded_struct_for->bit_vectorize = for_stmt->bit_vectorize;
     offloaded_struct_for->num_cpu_threads =
         std::min(for_stmt->num_cpu_threads, config.cpu_max_num_threads);
     offloaded_struct_for->mem_access_opt = mem_access_opt;
diff --git a/tests/python/test_bit_array.py b/tests/python/test_bit_array.py
index ce49935e2d3a7..de28898124b0a 100644
--- a/tests/python/test_bit_array.py
+++ b/tests/python/test_bit_array.py
@@ -67,3 +67,37 @@ def verify_val():
 
     set_val()
     verify_val()
+
+
+@test_utils.test(require=ti.extension.quant, debug=True)
+def test_bit_array_struct_for():  # struct-for on a bit_array field (no bit_vectorize set)
+    block_size = 16  # indices covered by one pointer cell
+    N = 64  # total number of indices in x
+    cell = ti.root.pointer(ti.i, N // block_size)  # N // block_size == 4 cells
+    qi7 = ti.types.quant.int(7)
+
+    x = ti.field(dtype=qi7)
+    cell.dense(ti.i, block_size // 4).bit_array(ti.i, 4, num_bits=32).place(x)
+
+    @ti.kernel
+    def activate():
+        for i in range(N):
+            if i // block_size % 2 == 0:  # even-numbered blocks only (0 and 2)
+                x[i] = i
+
+    @ti.kernel
+    def assign():
+        for i in x:  # loops over x itself, which is placed under the bit_array
+            x[i] -= 1
+
+    @ti.kernel
+    def verify():
+        for i in range(N):
+            if i // block_size % 2 == 0:
+                assert x[i] == i - 1  # written by activate(), then decremented
+            else:
+                assert x[i] == 0  # activate() skipped these blocks; expect 0
+
+    activate()
+    assign()
+    verify()