From 6c9ad1d7e707833c33861feba37156e91b5efbd9 Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Fri, 1 Jan 2021 22:37:13 +0800
Subject: [PATCH 1/6] test all 8 common cases

---
 tests/python/test_bit_array_vectorization.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index 2e1b7dc0cbb9f..18ef8720068e7 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -93,3 +93,11 @@ def verify(dx: ti.template(), dy: ti.template()):
     verify(0, -1)
     assign_vectorized(-1, 0)
     verify(-1, 0)
+    assign_vectorized(1, 1)
+    verify(1, 1)
+    assign_vectorized(1, -1)
+    verify(1, -1)
+    assign_vectorized(-1, -1)
+    verify(-1, -1)
+    assign_vectorized(-1, 1)
+    verify(-1, 1)
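The eight (dx, dy) pairs above exhaust the axis-aligned and diagonal neighbor offsets. The interesting axis is dy = ±1: along ti.j the bits are packed 32 to a word, so a vectorized load at a j-offset straddles two adjacent words and has to be stitched together with shifts, while dx = ±1 simply reads the corresponding word of another row. Below is a plain-Python sketch of that word arithmetic — `offset_load` is an illustrative name, not the pass's actual codegen, and LSB-first packing into 32-bit words is assumed:

```python
MASK = 0xFFFFFFFF  # model 32-bit words with unbounded Python ints

def offset_load(cur, adj, dy):
    # cur holds lanes j..j+31 of one row; adj is the neighboring word the
    # offset spills into (next word for dy > 0, previous word for dy < 0)
    if dy == 0:
        return cur
    if dy > 0:  # vacated high lanes are refilled from the next word
        return ((cur >> dy) | (adj << (32 - dy))) & MASK
    return ((cur << -dy) | (adj >> (32 + dy))) & MASK  # dy < 0

# brute-force check against a flat bit list
import random
bits = [random.randint(0, 1) for _ in range(96)]
words = [sum(b << k for k, b in enumerate(bits[w * 32:(w + 1) * 32]))
         for w in range(3)]
for dy in (-1, 0, 1):
    adj = words[2] if dy > 0 else words[0]
    expected = sum(bits[32 + k + dy] << k for k in range(32))
    assert offset_load(words[1], adj, dy) == expected
```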
From e487acd384322cde3bc91bcebf9d87633ba8bdcc Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Tue, 5 Jan 2021 11:46:18 +0800
Subject: [PATCH 2/6] transform atomic add and clean up

---
 taichi/transforms/bit_loop_vectorize.cpp  | 54 +++++++++++++++++++++++
 taichi/transforms/compile_to_offloads.cpp |  1 +
 2 files changed, 55 insertions(+)

diff --git a/taichi/transforms/bit_loop_vectorize.cpp b/taichi/transforms/bit_loop_vectorize.cpp
index 02336580cfc21..da3f523c67f61 100644
--- a/taichi/transforms/bit_loop_vectorize.cpp
+++ b/taichi/transforms/bit_loop_vectorize.cpp
@@ -16,6 +16,7 @@ class BitLoopVectorize : public IRVisitor {
   bool in_struct_for_loop;
   StructForStmt *loop_stmt;
   PrimitiveType *bit_array_physical_type;
+  std::unordered_map<Stmt*, std::vector<Stmt*>> transformed_atomics;

   BitLoopVectorize() {
     allow_undefined_visitor = true;
@@ -155,10 +156,63 @@ class BitLoopVectorize : public IRVisitor {
     bit_array_physical_type = nullptr;
   }

+  void visit(AtomicOpStmt *stmt) override {
+    DataType dt(bit_array_physical_type);
+    if (in_struct_for_loop && bit_vectorize != 1 && stmt->op_type == AtomicOpType::add) {
+      auto it = transformed_atomics.find(stmt->dest);
+      // process a transformed atomic stmt
+      if (it != transformed_atomics.end()) {
+        auto& buffer_vec = it->second;
+        transform_atomic_add(buffer_vec, stmt, dt);
+      } else {
+        // alloc three buffers a, b, c
+        auto alloc_a = std::make_unique<AllocaStmt>(dt);
+        auto alloc_b = std::make_unique<AllocaStmt>(dt);
+        auto alloc_c = std::make_unique<AllocaStmt>(dt);
+        std::vector<Stmt*> buffer_vec{alloc_a.get(), alloc_b.get(), alloc_c.get()};
+        transformed_atomics[stmt->dest] = buffer_vec;
+        // modify IR
+        stmt->insert_before_me(std::move(alloc_a));
+        stmt->insert_before_me(std::move(alloc_b));
+        stmt->insert_before_me(std::move(alloc_c));
+        transform_atomic_add(buffer_vec, stmt, dt);
+      }
+    }
+  }
+
   static void run(IRNode *node) {
     BitLoopVectorize inst;
     node->accept(&inst);
   }
+
+ private:
+  void transform_atomic_add(const std::vector<Stmt*>& buffer_vec, AtomicOpStmt *stmt, DataType& dt) {
+    // To transform an atomic add on a vectorized subarray of a bit array,
+    // we use a local adder with three buffers (*a*, *b*, *c*) of the same physical type
+    // as the original bit array.
+    // Each bit in *a* represents the highest bit of the result, while *b* represents
+    // the second bit and *c* the lowest bit.
+    // To add *d* to the subarray, we use bit_xor and bit_and to compute the sum and the carry.
+    Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
+    auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
+    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_c.get(), stmt->val);
+    auto sum_c = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
+    auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
+    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_b.get(), carry_c.get());
+    auto sum_b = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
+    // for a, we do not need to compute its carry
+    auto sum_a = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
+    // modify IR
+    stmt->insert_before_me(std::move(load_c));
+    stmt->insert_before_me(std::move(carry_c));
+    stmt->insert_before_me(std::move(sum_c));
+    stmt->insert_before_me(std::move(load_b));
+    stmt->insert_before_me(std::move(carry_b));
+    stmt->insert_before_me(std::move(sum_b));
+    stmt->insert_before_me(std::move(sum_a));
+    // TODO: we need to do non-trivial replacement of the original atomic add stmt
+    // as it is only used in logical ifs
+  }
 };

 namespace irpass {
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 9511c8cee1df7..a08f920e852da 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -75,6 +75,7 @@ void compile_to_offloads(IRNode *ir,
   // create a separate CompileConfig flag for the new pass
   if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) {
     irpass::bit_loop_vectorize(ir);
+    irpass::type_check(ir);
     print("Bit Loop Vectorized");
     irpass::analysis::verify(ir);
   }
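For one 32-lane word, the transform above behaves like a per-lane 3-bit ripple-carry counter: each atomic add of a word of 0/1 values updates the low-bit plane c by xor, propagates a carry into b, and xors the final carry into a. The following plain-Python model mirrors the statement sequence transform_atomic_add emits (bit_and for each carry, bit_xor for each sum); `adder_add` is an illustrative helper, not code from this PR:

```python
def adder_add(a, b, c, d):
    # a, b, c: bit planes of a per-lane counter (a = highest, c = lowest);
    # d: one 0/1 increment per lane, 32 lanes per word
    carry_c = c & d       # carry out of the lowest bit (BinaryOpStmt bit_and)
    c ^= d                # sum of the lowest bit       (AtomicOpStmt bit_xor)
    carry_b = b & carry_c
    b ^= carry_c
    a ^= carry_b          # for a, no carry is computed -- it is dropped
    return a, b, c

# accumulate eight random 0/1 words and check every lane's count
import random
increments = [random.getrandbits(32) for _ in range(8)]
a = b = c = 0
for d in increments:
    a, b, c = adder_add(a, b, c, d)
for lane in range(32):
    count = sum((d >> lane) & 1 for d in increments)
    got = ((a >> lane) & 1) * 4 + ((b >> lane) & 1) * 2 + ((c >> lane) & 1)
    # the 3-bit counter wraps at 8, which is harmless for the == 2 / == 3
    # tests performed later in this series
    assert got == count % 8
```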
From d7840f394a19ff51fac13a86b48b61c1754c9335 Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Tue, 5 Jan 2021 20:59:20 +0800
Subject: [PATCH 3/6] format

---
 taichi/transforms/bit_loop_vectorize.cpp | 45 ++++++++++++++----------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/taichi/transforms/bit_loop_vectorize.cpp b/taichi/transforms/bit_loop_vectorize.cpp
index da3f523c67f61..7c10ff3514e59 100644
--- a/taichi/transforms/bit_loop_vectorize.cpp
+++ b/taichi/transforms/bit_loop_vectorize.cpp
@@ -16,7 +16,7 @@ class BitLoopVectorize : public IRVisitor {
   bool in_struct_for_loop;
   StructForStmt *loop_stmt;
   PrimitiveType *bit_array_physical_type;
-  std::unordered_map<Stmt*, std::vector<Stmt*>> transformed_atomics;
+  std::unordered_map<Stmt *, std::vector<Stmt *>> transformed_atomics;

   BitLoopVectorize() {
     allow_undefined_visitor = true;
@@ -158,18 +158,20 @@ class BitLoopVectorize : public IRVisitor {

   void visit(AtomicOpStmt *stmt) override {
     DataType dt(bit_array_physical_type);
-    if (in_struct_for_loop && bit_vectorize != 1 && stmt->op_type == AtomicOpType::add) {
+    if (in_struct_for_loop && bit_vectorize != 1 &&
+        stmt->op_type == AtomicOpType::add) {
       auto it = transformed_atomics.find(stmt->dest);
       // process a transformed atomic stmt
       if (it != transformed_atomics.end()) {
-        auto& buffer_vec = it->second;
+        auto &buffer_vec = it->second;
         transform_atomic_add(buffer_vec, stmt, dt);
       } else {
         // alloc three buffers a, b, c
         auto alloc_a = std::make_unique<AllocaStmt>(dt);
         auto alloc_b = std::make_unique<AllocaStmt>(dt);
         auto alloc_c = std::make_unique<AllocaStmt>(dt);
-        std::vector<Stmt*> buffer_vec{alloc_a.get(), alloc_b.get(), alloc_c.get()};
+        std::vector<Stmt *> buffer_vec{alloc_a.get(), alloc_b.get(),
+                                       alloc_c.get()};
         transformed_atomics[stmt->dest] = buffer_vec;
         // modify IR
         stmt->insert_before_me(std::move(alloc_a));
@@ -186,22 +188,29 @@ class BitLoopVectorize : public IRVisitor {
   }

  private:
-  void transform_atomic_add(const std::vector<Stmt*>& buffer_vec, AtomicOpStmt *stmt, DataType& dt) {
+  void transform_atomic_add(const std::vector<Stmt *> &buffer_vec,
+                            AtomicOpStmt *stmt,
+                            DataType &dt) {
     // To transform an atomic add on a vectorized subarray of a bit array,
-    // we use a local adder with three buffers (*a*, *b*, *c*) of the same physical type
-    // as the original bit array.
-    // Each bit in *a* represents the highest bit of the result, while *b* represents
-    // the second bit and *c* the lowest bit.
-    // To add *d* to the subarray, we use bit_xor and bit_and to compute the sum and the carry.
+    // we use a local adder with three buffers (*a*, *b*, *c*) of the same
+    // physical type as the original bit array. Each bit in *a* represents
+    // the highest bit of the result, while *b* represents the second bit and
+    // *c* the lowest bit. To add *d* to the subarray, we use bit_xor and
+    // bit_and to compute the sum and the carry.
     Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
     auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
-    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_c.get(), stmt->val);
-    auto sum_c = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
+    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and,
+                                                  load_c.get(), stmt->val);
+    auto sum_c =
+        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
     auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
-    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_b.get(), carry_c.get());
-    auto sum_b = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
+    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and,
+                                                  load_b.get(), carry_c.get());
+    auto sum_b =
+        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
     // for a, we do not need to compute its carry
-    auto sum_a = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
+    auto sum_a =
+        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
     // modify IR
     stmt->insert_before_me(std::move(load_c));
     stmt->insert_before_me(std::move(carry_c));
     stmt->insert_before_me(std::move(sum_c));
     stmt->insert_before_me(std::move(load_b));
     stmt->insert_before_me(std::move(carry_b));
     stmt->insert_before_me(std::move(sum_b));
     stmt->insert_before_me(std::move(sum_a));
-    // TODO: we need to do non-trivial replacement of the original atomic add stmt
-    // as it is only used in logical ifs
+    // TODO: we need to do non-trivial replacement of the original atomic add
+    // stmt as it is only used in logical ifs
   }
 };
From 9daee57a171000c38b9840c6d8a6605769f12eba Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Wed, 6 Jan 2021 03:28:03 +0800
Subject: [PATCH 4/6] transform boolean expr and make test pass

---
 taichi/ir/statements.h                       |  13 ++-
 taichi/transforms/bit_loop_vectorize.cpp     | 107 ++++++++++++++++++-
 tests/python/test_bit_array_vectorization.py |  74 +++++++++++++
 3 files changed, 188 insertions(+), 6 deletions(-)

diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
index dc8f77e7c93c5..3666219b4de7b 100644
--- a/taichi/ir/statements.h
+++ b/taichi/ir/statements.h
@@ -141,9 +141,16 @@ class BinaryOpStmt : public Stmt {
  public:
   BinaryOpType op_type;
   Stmt *lhs, *rhs;
+  bool is_bit_vectorized;

-  BinaryOpStmt(BinaryOpType op_type, Stmt *lhs, Stmt *rhs)
-      : op_type(op_type), lhs(lhs), rhs(rhs) {
+  BinaryOpStmt(BinaryOpType op_type,
+               Stmt *lhs,
+               Stmt *rhs,
+               bool is_bit_vectorized = false)
+      : op_type(op_type),
+        lhs(lhs),
+        rhs(rhs),
+        is_bit_vectorized(is_bit_vectorized) {
     TI_ASSERT(!lhs->is<AllocaStmt>());
     TI_ASSERT(!rhs->is<AllocaStmt>());
     TI_STMT_REG_FIELDS;
@@ -153,7 +160,7 @@ class BinaryOpStmt : public Stmt {
     return false;
   }

-  TI_STMT_DEF_FIELDS(ret_type, op_type, lhs, rhs);
+  TI_STMT_DEF_FIELDS(ret_type, op_type, lhs, rhs, is_bit_vectorized);
   TI_DEFINE_ACCEPT_AND_CLONE
 };

diff --git a/taichi/transforms/bit_loop_vectorize.cpp b/taichi/transforms/bit_loop_vectorize.cpp
index 7c10ff3514e59..266970e6f4fe0 100644
--- a/taichi/transforms/bit_loop_vectorize.cpp
+++ b/taichi/transforms/bit_loop_vectorize.cpp
@@ -156,6 +156,90 @@ class BitLoopVectorize : public IRVisitor {
     bit_array_physical_type = nullptr;
   }

+  void visit(BinaryOpStmt *stmt) override {
+    // vectorize cmp_eq and bit_and between
+    // vectorized data (local adder/array elems) and a constant
+    if (in_struct_for_loop && bit_vectorize != 1) {
+      if (stmt->op_type == BinaryOpType::bit_and) {
+        // if the rhs is a bit-vectorized stmt and the lhs is a const 1
+        // (usually generated by a boolean expr), we simply replace
+        // the stmt with its rhs
+        int lhs_val = get_constant_value(stmt->lhs);
+        if (lhs_val == 1) {
+          if (auto rhs = stmt->rhs->cast<BinaryOpStmt>();
+              rhs && rhs->is_bit_vectorized) {
+            stmt->replace_with(stmt->rhs);
+          }
+        }
+      } else if (stmt->op_type == BinaryOpType::cmp_eq) {
+        if (auto lhs = stmt->lhs->cast<GlobalLoadStmt>()) {
+          // case 0: lhs is a vectorized global load from the bit array
+          if (auto ptr = lhs->ptr->cast<GlobalPtrStmt>();
+              ptr && ptr->is_bit_vectorized) {
+            int32 rhs_val = get_constant_value(stmt->rhs);
+            // TODO: we limit 1 for now, 0 should be easy to implement by a
+            // bit_not on the original bit pattern
+            TI_ASSERT(rhs_val == 1);
+            // cmp_eq with 1 yields the bit pattern itself
+
+            // to pass CFG analysis and mark the stmt vectorized
+            // create a dummy lhs + 0 here
+            auto zero = std::make_unique<ConstStmt>(TypedConstant(0));
+            auto add = std::make_unique<BinaryOpStmt>(BinaryOpType::add,
+                                                      stmt->lhs, zero.get());
+            add->is_bit_vectorized = true;
+            // modify IR
+            auto zero_p = zero.get();
+            stmt->insert_before_me(std::move(zero));
+            stmt->replace_with(add.get());
+            zero_p->insert_after_me(std::move(add));
+          }
+        } else if (auto lhs = stmt->lhs->cast<LocalLoadStmt>()) {
+          // case 1: lhs is a local load from a local adder structure
+          auto it = transformed_atomics.find(lhs->ptr[0].var);
+          if (it != transformed_atomics.end()) {
+            int32 rhs_val = get_constant_value(stmt->rhs);
+            // TODO: we limit 2 and 3 for now, the other cases should be
+            // implemented in a similar fashion
+            TI_ASSERT(rhs_val == 2 || rhs_val == 3);
+            // 010 and 011 respectively
+            auto &buffer_vec = it->second;
+            Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
+            // load all three buffers
+            auto load_a = std::make_unique<LocalLoadStmt>(LocalAddress(a, 0));
+            auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
+            auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
+            // compute not_a first
+            auto not_a = std::make_unique<UnaryOpStmt>(UnaryOpType::bit_not,
+                                                       load_a.get());
+            // b should always be itself so do nothing
+            // compute not_c
+            auto not_c = std::make_unique<UnaryOpStmt>(UnaryOpType::bit_not,
+                                                       load_c.get());
+            // bit_and all three patterns
+            auto and_a_b = std::make_unique<BinaryOpStmt>(
+                BinaryOpType::bit_and, not_a.get(), load_b.get());
+            auto and_b_c = std::make_unique<BinaryOpStmt>(
+                BinaryOpType::bit_and, and_a_b.get(),
+                rhs_val == 2 ? (Stmt *)(not_c.get()) : (Stmt *)(load_c.get()));
+            // mark the last stmt as vectorized
+            and_b_c->is_bit_vectorized = true;
+            // modify IR
+            auto and_a_b_p = and_a_b.get();
+            stmt->insert_before_me(std::move(load_a));
+            stmt->insert_before_me(std::move(load_b));
+            stmt->insert_before_me(std::move(load_c));
+            stmt->insert_before_me(std::move(not_a));
+            stmt->insert_before_me(std::move(not_c));
+            stmt->insert_before_me(std::move(and_a_b));
+            stmt->replace_with(and_b_c.get());
+            and_a_b_p->insert_after_me(std::move(and_b_c));
+          }
+        }
+      }
+    }
+  }
+
   void visit(AtomicOpStmt *stmt) override {
     DataType dt(bit_array_physical_type);
     if (in_struct_for_loop && bit_vectorize != 1 &&
@@ -219,8 +303,24 @@ class BitLoopVectorize : public IRVisitor {
     stmt->insert_before_me(std::move(carry_b));
     stmt->insert_before_me(std::move(sum_b));
     stmt->insert_before_me(std::move(sum_a));
-    // TODO: we need to do non-trivial replacement of the original atomic add
-    // stmt as it is only used in logical ifs
+    // there is no need to replace the stmt here as we
+    // will replace it manually later
   }
+
+  int32 get_constant_value(Stmt *stmt) {
+    int32 val = -1;
+    // the stmt could be a cast stmt
+    if (auto cast_stmt = stmt->cast<UnaryOpStmt>();
+        cast_stmt && cast_stmt->is_cast() &&
+        cast_stmt->op_type == UnaryOpType::cast_value) {
+      stmt = cast_stmt->operand;
+    }
+    if (auto constant_stmt = stmt->cast<ConstStmt>();
+        constant_stmt &&
+        constant_stmt->val[0].dt->is_primitive(PrimitiveTypeID::i32)) {
+      val = constant_stmt->val[0].val_i32;
+    }
+    return val;
+  }
 };

@@ -228,7 +328,8 @@ namespace irpass {

 void bit_loop_vectorize(IRNode *root) {
   TI_AUTO_PROF;
-  return BitLoopVectorize::run(root);
+  BitLoopVectorize::run(root);
+  die(root);
 }

 }  // namespace irpass
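Both cmp_eq cases reduce an equality test against a constant to whole-word bitwise arithmetic. In case 0, comparing a loaded 1-bit value with 1 yields the loaded bit pattern itself, so the pass only wraps it in a dummy `lhs + 0` marked is_bit_vectorized to survive CFG analysis. In case 1, a lane's count lives in the (a, b, c) bit planes, so count == 2 (binary 010) becomes ~a & b & ~c and count == 3 (binary 011) becomes ~a & b & c; the bit_and branch then drops the const-1 mask that boolean lowering wraps around such predicates. A quick exhaustive check of the two patterns in plain Python (not Taichi code):

```python
def eq2(a, b, c):
    return ~a & b & ~c  # count == 2: bit pattern 010

def eq3(a, b, c):
    return ~a & b & c   # count == 3: bit pattern 011

for count in range(8):  # every value a 3-bit lane counter can hold
    a, b, c = (count >> 2) & 1, (count >> 1) & 1, count & 1
    assert (eq2(a, b, c) & 1) == (count == 2)
    assert (eq3(a, b, c) & 1) == (count == 3)
```

The new test below drives both cases at once, since the Game of Life update is exactly `(n == 3) or (n == 2 and cell == 1)`.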
diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index 18ef8720068e7..e39ebec20883c 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -101,3 +101,77 @@ def verify(dx: ti.template(), dy: ti.template()):
     verify(-1, -1)
     assign_vectorized(-1, 1)
     verify(-1, 1)
+
+
+@ti.test(require=ti.extension.quant)
+def test_evolve():
+    ci1 = ti.type_factory.custom_int(1, False)
+
+    x = ti.field(dtype=ci1)
+    y = ti.field(dtype=ci1)
+    z = ti.field(dtype=ci1)
+
+    N = 4096
+    n_blocks = 4
+    bits = 32
+    boundary_offset = 1024
+    assert boundary_offset >= N // n_blocks
+
+    block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(x)
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(y)
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(z)
+
+    @ti.kernel
+    def init():
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                               (boundary_offset, N - boundary_offset)):
+            x[i, j] = ti.random(dtype=ti.i32) % 2
+
+    @ti.kernel
+    def evolve_vectorized(x: ti.template(), y: ti.template()):
+        ti.bit_vectorize(32)
+        for i, j in x:
+            num_active_neighbors = 0
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
+                                                      and x[i, j] == 1)
+
+    @ti.kernel
+    def evolve_naive(x: ti.template(), y: ti.template()):
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                               (boundary_offset, N - boundary_offset)):
+            num_active_neighbors = 0
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
+                                                      and x[i, j] == 1)
+
+    @ti.kernel
+    def verify():
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                               (boundary_offset, N - boundary_offset)):
+            assert y[i, j] == z[i, j]
+
+    init()
+    evolve_naive(x, z)
+    evolve_vectorized(x, y)
+    verify()
From 9dcad62837da1448df960e4869177bb10d342745 Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Wed, 6 Jan 2021 08:43:29 +0800
Subject: [PATCH 5/6] refine test case

---
 tests/python/test_bit_array_vectorization.py | 38 +++++++++-----------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index e39ebec20883c..28a30983f0a6c 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -103,7 +103,7 @@ def verify(dx: ti.template(), dy: ti.template()):
     verify(-1, 1)


-@ti.test(require=ti.extension.quant)
+@ti.test(require=ti.extension.quant, debug=True)
 def test_evolve():
     ci1 = ti.type_factory.custom_int(1, False)

@@ -131,20 +131,25 @@ def init():
                                (boundary_offset, N - boundary_offset)):
             x[i, j] = ti.random(dtype=ti.i32) % 2

+    @ti.func
+    def compute_active_neighbors(x: ti.template(), i: ti.template(),
+                                 j: ti.template()):
+        num_active_neighbors = 0
+        num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+        num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+        num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+        return num_active_neighbors
+
     @ti.kernel
     def evolve_vectorized(x: ti.template(), y: ti.template()):
         ti.bit_vectorize(32)
         for i, j in x:
-            num_active_neighbors = 0
-            num_active_neighbors = 0
-            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            num_active_neighbors = compute_active_neighbors(x, i, j)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)

@@ -151,16 +156,7 @@ def evolve_vectorized(x: ti.template(), y: ti.template()):
     @ti.kernel
     def evolve_naive(x: ti.template(), y: ti.template()):
         for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                                (boundary_offset, N - boundary_offset)):
-            num_active_neighbors = 0
-            num_active_neighbors = 0
-            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            num_active_neighbors = compute_active_neighbors(x, i, j)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)
From 6d5a002ceac0379440e031d825697f6b7a56be4e Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Wed, 6 Jan 2021 21:48:43 +0800
Subject: [PATCH 6/6] inline function

---
 tests/python/test_bit_array_vectorization.py | 34 +++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index 28a30983f0a6c..df663269df016 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -131,25 +131,19 @@ def init():
                                (boundary_offset, N - boundary_offset)):
             x[i, j] = ti.random(dtype=ti.i32) % 2

-    @ti.func
-    def compute_active_neighbors(x: ti.template(), i: ti.template(),
-                                 j: ti.template()):
-        num_active_neighbors = 0
-        num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
-        num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
-        num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
-        return num_active_neighbors
-
     @ti.kernel
     def evolve_vectorized(x: ti.template(), y: ti.template()):
         ti.bit_vectorize(32)
         for i, j in x:
-            num_active_neighbors = compute_active_neighbors(x, i, j)
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)

@@ -156,7 +150,15 @@ def evolve_vectorized(x: ti.template(), y: ti.template()):
     @ti.kernel
     def evolve_naive(x: ti.template(), y: ti.template()):
         for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                                (boundary_offset, N - boundary_offset)):
-            num_active_neighbors = compute_active_neighbors(x, i, j)
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)
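Taken together, the series turns the Game of Life kernel into pure word-level bit operations, 32 cells at a time. An end-to-end plain-Python model of what one vectorized step computes — `life_step_word` is an illustrative function under the same assumptions as the earlier sketches, not part of this PR:

```python
MASK = 0xFFFFFFFF

def life_step_word(neighbor_words, cur):
    # neighbor_words: eight 32-bit words, pre-shifted so lane k of each word
    # holds one neighbor of cell k (the shifted loads tested in PATCH 1)
    a = b = c = 0
    for d in neighbor_words:  # three-buffer adder from PATCH 2
        carry_c = c & d
        c ^= d
        carry_b = b & carry_c
        b ^= carry_c
        a ^= carry_b
    eq3 = ~a & b & c          # count == 3 (PATCH 4)
    eq2 = ~a & b & ~c         # count == 2
    return (eq3 | (eq2 & cur)) & MASK  # (n == 3) or (n == 2 and alive)

# lane 0 sees exactly two live neighbors:
neighbors = [1, 1, 0, 0, 0, 0, 0, 0]
assert (life_step_word(neighbors, cur=1) & 1) == 1  # live cell survives
assert (life_step_word(neighbors, cur=0) & 1) == 0  # dead cell stays dead
```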