[type] Local adder structure #2136

Merged
merged 6 commits on Jan 6, 2021

Changes from 4 commits
13 changes: 10 additions & 3 deletions taichi/ir/statements.h
@@ -141,9 +141,16 @@ class BinaryOpStmt : public Stmt {
 public:
  BinaryOpType op_type;
  Stmt *lhs, *rhs;
  bool is_bit_vectorized;

-  BinaryOpStmt(BinaryOpType op_type, Stmt *lhs, Stmt *rhs)
-      : op_type(op_type), lhs(lhs), rhs(rhs) {
+  BinaryOpStmt(BinaryOpType op_type,
+               Stmt *lhs,
+               Stmt *rhs,
+               bool is_bit_vectorized = false)
+      : op_type(op_type),
+        lhs(lhs),
+        rhs(rhs),
+        is_bit_vectorized(is_bit_vectorized) {
Member:

This is a little too much intrusion into the existing system. It seems to me that is_bit_vectorized is only used in the bit_loop_vectorize pass - maybe you can use an std::unordered_map<Stmt *, bool> member variable in class BitLoopVectorize, instead of adding a new field in class BinaryOpStmt? (Just like llvm_val in LLVM codegens.)
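
A minimal sketch of that alternative, kept entirely inside the pass (the member and helper names here are illustrative, not part of this PR):

```cpp
#include <unordered_map>

// Hypothetical pass-scope storage replacing the new BinaryOpStmt field.
class BitLoopVectorize : public IRVisitor {
 public:
  std::unordered_map<Stmt *, bool> binary_is_bit_vectorized;

  void mark_bit_vectorized(BinaryOpStmt *stmt) {
    binary_is_bit_vectorized[stmt] = true;
  }

  bool is_bit_vectorized(BinaryOpStmt *stmt) const {
    auto it = binary_is_bit_vectorized.find(stmt);
    // Statements absent from the map count as not vectorized.
    return it != binary_is_bit_vectorized.end() && it->second;
  }
};
```

Lookups default to false for untagged statements, so no per-statement field or TI_STMT_DEF_FIELDS change would be needed.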

Collaborator Author:

is_bit_vectorized is used only in the bit_loop_vectorize pass when tagged on BinaryOpStmt, and that part should be replaced with some pass-scope data structure, just as you suggested. But GlobalPtrStmt and GetChStmt need the tag through later passes, including lower_access and type_check.

Member:

Right, that's what I meant: we can use a pass-scope data structure just for BinaryOpStmt::is_bit_vectorized. Given we are rushing for the deadline, it's fine that we don't do it now.

    TI_ASSERT(!lhs->is<AllocaStmt>());
    TI_ASSERT(!rhs->is<AllocaStmt>());
    TI_STMT_REG_FIELDS;
@@ -153,7 +160,7 @@ class BinaryOpStmt : public Stmt {
    return false;
  }

-  TI_STMT_DEF_FIELDS(ret_type, op_type, lhs, rhs);
+  TI_STMT_DEF_FIELDS(ret_type, op_type, lhs, rhs, is_bit_vectorized);
  TI_DEFINE_ACCEPT_AND_CLONE
};

166 changes: 165 additions & 1 deletion taichi/transforms/bit_loop_vectorize.cpp
@@ -16,6 +16,7 @@ class BitLoopVectorize : public IRVisitor {
  bool in_struct_for_loop;
  StructForStmt *loop_stmt;
  PrimitiveType *bit_array_physical_type;
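  // Maps an atomic add's destination to its three local adder buffers
  // {a, b, c}; filled lazily in visit(AtomicOpStmt).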
  std::unordered_map<Stmt *, std::vector<Stmt *>> transformed_atomics;

  BitLoopVectorize() {
    allow_undefined_visitor = true;
@@ -155,17 +156,180 @@
    bit_array_physical_type = nullptr;
  }

  void visit(BinaryOpStmt *stmt) override {
    // vectorize cmp_eq and bit_and between
    // vectorized data (local adder/array elems) and constants
    if (in_struct_for_loop && bit_vectorize != 1) {
      if (stmt->op_type == BinaryOpType::bit_and) {
        // if the rhs is a bit-vectorized stmt and the lhs is a const 1
        // (usually generated by a boolean expr), we simply replace
        // the stmt with its rhs
        int lhs_val = get_constant_value(stmt->lhs);
        if (lhs_val == 1) {
          if (auto rhs = stmt->rhs->cast<BinaryOpStmt>();
              rhs && rhs->is_bit_vectorized) {
            stmt->replace_with(stmt->rhs);
          }
        }
      } else if (stmt->op_type == BinaryOpType::cmp_eq) {
        if (auto lhs = stmt->lhs->cast<GlobalLoadStmt>()) {
          // case 0: lhs is a vectorized global load from the bit array
          if (auto ptr = lhs->ptr->cast<GlobalPtrStmt>();
              ptr && ptr->is_bit_vectorized) {
            int32 rhs_val = get_constant_value(stmt->rhs);
            // TODO: we limit to 1 for now; 0 should be easy to implement
            // by a bit_not on the original bit pattern
            TI_ASSERT(rhs_val == 1);
            // cmp_eq with 1 yields the bit pattern itself

            // to pass CFG analysis and mark the stmt vectorized,
            // create a dummy lhs + 0 here
            auto zero = std::make_unique<ConstStmt>(TypedConstant(0));
            auto add = std::make_unique<BinaryOpStmt>(BinaryOpType::add,
                                                      stmt->lhs, zero.get());
            add->is_bit_vectorized = true;
            // modify IR
            auto zero_p = zero.get();
            stmt->insert_before_me(std::move(zero));
            stmt->replace_with(add.get());
            zero_p->insert_after_me(std::move(add));
          }
        } else if (auto lhs = stmt->lhs->cast<LocalLoadStmt>()) {
          // case 1: lhs is a local load from a local adder structure
          auto it = transformed_atomics.find(lhs->ptr[0].var);
          if (it != transformed_atomics.end()) {
            int32 rhs_val = get_constant_value(stmt->rhs);
            // TODO: we limit to 2 and 3 for now; the other cases should be
            // implemented in a similar fashion
            TI_ASSERT(rhs_val == 2 || rhs_val == 3);
            // 010 and 011 respectively
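            // Illustration: with (a, b, c) holding the count's bits from
            // highest to lowest, count == 2 (010) is ~a & b & ~c, and
            // count == 3 (011) is ~a & b & c, matching the statements below.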
            auto &buffer_vec = it->second;
            Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
            // load all three buffers
            auto load_a = std::make_unique<LocalLoadStmt>(LocalAddress(a, 0));
            auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
            auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
            // compute not_a first
            auto not_a = std::make_unique<UnaryOpStmt>(UnaryOpType::bit_not,
                                                       load_a.get());
            // b should always be itself so do nothing
            // compute not_c
            auto not_c = std::make_unique<UnaryOpStmt>(UnaryOpType::bit_not,
                                                       load_c.get());
            // bit_and all three patterns
            auto and_a_b = std::make_unique<BinaryOpStmt>(
                BinaryOpType::bit_and, not_a.get(), load_b.get());
            auto and_b_c = std::make_unique<BinaryOpStmt>(
                BinaryOpType::bit_and, and_a_b.get(),
                rhs_val == 2 ? (Stmt *)(not_c.get()) : (Stmt *)(load_c.get()));
            // mark the last stmt as vectorized
            and_b_c->is_bit_vectorized = true;
            // modify IR
            auto and_a_b_p = and_a_b.get();
            stmt->insert_before_me(std::move(load_a));
Member:

A more elegant way to do this:

VecStatement statements;

Use VecStatement::push_back<...> to create the statements and stmt->insert_before_me(vec_stmt) to insert. (You don't need the DelayedIRModifier part.)
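
For concreteness, a sketch of how the first few statements above might be built that way, assuming VecStatement::push_back<T>(...) returns a pointer to the newly created statement and that insert_before_me can take the whole VecStatement, as suggested (not part of this PR):

```cpp
// Hypothetical refactor of the load/not/and sequence above.
VecStatement statements;
// push_back<T> forwards the constructor arguments and returns a pointer
// we can feed into later statements.
auto *load_a = statements.push_back<LocalLoadStmt>(LocalAddress(a, 0));
auto *load_b = statements.push_back<LocalLoadStmt>(LocalAddress(b, 0));
auto *not_a =
    statements.push_back<UnaryOpStmt>(UnaryOpType::bit_not, load_a);
auto *and_a_b = statements.push_back<BinaryOpStmt>(BinaryOpType::bit_and,
                                                   not_a, load_b);
// A single insertion then replaces the chain of insert_before_me calls.
stmt->insert_before_me(std::move(statements));
```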

Collaborator Author:

I find that to do it the elegant way we may still need DelayedIRModifier, and there are other places that require changes as well (e.g., see the visitor for GlobalLoadStmt). I therefore think we should do it in a separate PR later and refactor all of this together. This also applies to the changes for BinaryOpStmt::is_bit_vectorized.

            stmt->insert_before_me(std::move(load_b));
            stmt->insert_before_me(std::move(load_c));
            stmt->insert_before_me(std::move(not_a));
            stmt->insert_before_me(std::move(not_c));
            stmt->insert_before_me(std::move(and_a_b));
            stmt->replace_with(and_b_c.get());
            and_a_b_p->insert_after_me(std::move(and_b_c));
          }
        }
      }
    }
  }

  void visit(AtomicOpStmt *stmt) override {
    DataType dt(bit_array_physical_type);
    if (in_struct_for_loop && bit_vectorize != 1 &&
        stmt->op_type == AtomicOpType::add) {
      auto it = transformed_atomics.find(stmt->dest);
      // process a transformed atomic stmt
      if (it != transformed_atomics.end()) {
        auto &buffer_vec = it->second;
        transform_atomic_add(buffer_vec, stmt, dt);
      } else {
        // alloc three buffers a, b, c
        auto alloc_a = std::make_unique<AllocaStmt>(dt);
        auto alloc_b = std::make_unique<AllocaStmt>(dt);
        auto alloc_c = std::make_unique<AllocaStmt>(dt);
        std::vector<Stmt *> buffer_vec{alloc_a.get(), alloc_b.get(),
                                       alloc_c.get()};
        transformed_atomics[stmt->dest] = buffer_vec;
        // modify IR
        stmt->insert_before_me(std::move(alloc_a));
        stmt->insert_before_me(std::move(alloc_b));
        stmt->insert_before_me(std::move(alloc_c));
        transform_atomic_add(buffer_vec, stmt, dt);
      }
    }
  }

  static void run(IRNode *node) {
    BitLoopVectorize inst;
    node->accept(&inst);
  }

 private:
  void transform_atomic_add(const std::vector<Stmt *> &buffer_vec,
                            AtomicOpStmt *stmt,
                            DataType &dt) {
    // To transform an atomic add on a vectorized subarray of a bit array,
    // we use a local adder with three buffers (*a*, *b*, *c*) of the same
    // physical type as the original bit array. Each bit in *a* holds the
    // highest bit of the result, *b* the second bit, and *c* the lowest bit.
    // To add *d* to the subarray, we use bit_xor and bit_and to compute the
    // sum and the carry.
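    // Illustration (mirroring the statements built below), per lane:
    //   carry_c = c & d;        c ^= d;        // lowest bit and its carry
    //   carry_b = b & carry_c;  b ^= carry_c;  // second bit and its carry
    //   a ^= carry_b;                          // highest bit; carry dropped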
    Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
    auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and,
                                                  load_c.get(), stmt->val);
    auto sum_c =
        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
    auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and,
                                                  load_b.get(), carry_c.get());
    auto sum_b =
        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
    // for a, we do not need to compute its carry
    auto sum_a =
        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
    // modify IR
    stmt->insert_before_me(std::move(load_c));
    stmt->insert_before_me(std::move(carry_c));
    stmt->insert_before_me(std::move(sum_c));
    stmt->insert_before_me(std::move(load_b));
    stmt->insert_before_me(std::move(carry_b));
    stmt->insert_before_me(std::move(sum_b));
    stmt->insert_before_me(std::move(sum_a));
Member, on lines +299 to +305:
Ditto.

    // there is no need to replace the stmt here as we
    // will replace it manually later
  }

  int32 get_constant_value(Stmt *stmt) {
    int32 val = -1;
    // the stmt could be a cast stmt
    if (auto cast_stmt = stmt->cast<UnaryOpStmt>();
        cast_stmt && cast_stmt->is_cast() &&
        cast_stmt->op_type == UnaryOpType::cast_value) {
      stmt = cast_stmt->operand;
    }
    if (auto constant_stmt = stmt->cast<ConstStmt>();
        constant_stmt &&
        constant_stmt->val[0].dt->is_primitive(PrimitiveTypeID::i32)) {
      val = constant_stmt->val[0].val_i32;
    }
    return val;
  }
};

namespace irpass {

void bit_loop_vectorize(IRNode *root) {
  TI_AUTO_PROF;
-  return BitLoopVectorize::run(root);
+  BitLoopVectorize::run(root);
+  die(root);
}

} // namespace irpass
1 change: 1 addition & 0 deletions taichi/transforms/compile_to_offloads.cpp
@@ -75,6 +75,7 @@ void compile_to_offloads(IRNode *ir,
  // create a separate CompileConfig flag for the new pass
  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) {
    irpass::bit_loop_vectorize(ir);
    irpass::type_check(ir);
    print("Bit Loop Vectorized");
    irpass::analysis::verify(ir);
  }
82 changes: 82 additions & 0 deletions tests/python/test_bit_array_vectorization.py
@@ -93,3 +93,85 @@ def verify(dx: ti.template(), dy: ti.template()):
    verify(0, -1)
    assign_vectorized(-1, 0)
    verify(-1, 0)
    assign_vectorized(1, 1)
    verify(1, 1)
    assign_vectorized(1, -1)
    verify(1, -1)
    assign_vectorized(-1, -1)
    verify(-1, -1)
    assign_vectorized(-1, 1)
    verify(-1, 1)


@ti.test(require=ti.extension.quant)
def test_evolve():
    ci1 = ti.type_factory.custom_int(1, False)

    x = ti.field(dtype=ci1)
    y = ti.field(dtype=ci1)
    z = ti.field(dtype=ci1)

    N = 4096
    n_blocks = 4
    bits = 32
    boundary_offset = 1024
    assert boundary_offset >= N // n_blocks

    block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
        ti.j, bits, num_bits=bits).place(x)
    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
        ti.j, bits, num_bits=bits).place(y)
    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
        ti.j, bits, num_bits=bits).place(z)

    @ti.kernel
    def init():
        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                               (boundary_offset, N - boundary_offset)):
            x[i, j] = ti.random(dtype=ti.i32) % 2

    @ti.kernel
    def evolve_vectorized(x: ti.template(), y: ti.template()):
        ti.bit_vectorize(32)
        for i, j in x:
            num_active_neighbors = 0
            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
            y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                      and x[i, j] == 1)

    @ti.kernel
    def evolve_naive(x: ti.template(), y: ti.template()):
        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                               (boundary_offset, N - boundary_offset)):
            num_active_neighbors = 0
            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
            y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                      and x[i, j] == 1)

    @ti.kernel
    def verify():
        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                               (boundary_offset, N - boundary_offset)):
            assert y[i, j] == z[i, j]

    init()
    evolve_naive(x, z)
    evolve_vectorized(x, y)
    verify()