taichi-dev · TH3CHARLie · Dec 26, 2020 · Dec 25, 2020 · Dec 25, 2020 · Dec 25, 2020
diff --git a/misc/test_bit_array_vectorization.py b/misc/test_bit_array_vectorization.py
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
@@ -40,8 +40,6 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
     vectorize = 1;
-    // TODO: temporally setting to 1
-    bit_vectorize = 1;
     parallelize = 1;
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {
@@ -79,8 +77,6 @@ FrontendForStmt::FrontendForStmt(const Expr &loop_var,
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
     vectorize = 1;
-    // TODO: temporally setting to 1
-    bit_vectorize = 1;
     parallelize = 1;
   } else {
     if (parallelize == 0)

diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
@@ -65,15 +65,19 @@ void compile_to_offloads(IRNode *ir,
     print("Loop Vectorized");
     irpass::analysis::verify(ir);
 
-    // TODO: create a separate CompileConfig flag for the new pass
-    irpass::bit_loop_vectorize(ir);
-    print("Bit Loop Vectorized");
-    irpass::analysis::verify(ir);
-
     irpass::vector_split(ir, config.max_vector_width, config.serial_schedule);
     print("Loop Split");
     irpass::analysis::verify(ir);
   }
+
+  // TODO: the bit loop vectorization pass is currently always run on CPU and
+  //       CUDA backends; create a separate CompileConfig flag for the new pass
+  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) {
+    irpass::bit_loop_vectorize(ir);
+    print("Bit Loop Vectorized");
+    irpass::analysis::verify(ir);
+  }
+
   irpass::full_simplify(ir, false);
   print("Simplified I");
   irpass::analysis::verify(ir);

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
@@ -0,0 +1,46 @@
+import taichi as ti
+import numpy as np
+
+
+@ti.test(require=ti.extension.quant, debug=True, cfg_optimization=False)
+def test_vectorized_struct_for():
+    ci1 = ti.type_factory_.get_custom_int_type(1, False)
+    # x is the source field and y the copy target; both use the ci1 type.
+    x = ti.field(dtype=ci1)
+    y = ti.field(dtype=ci1)
+
+    N = 4096
+    n_blocks = 4
+    bits = 32
+    boundary_offset = 1024
+    # x and y get identical layouts: pointer -> dense -> _bit_array along j.
+    block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(x)
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(y)
+
+    @ti.kernel
+    def init():  # write random i32 values mod 2 into the interior of x
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                            (boundary_offset, N - boundary_offset)):
+            x[i, j] = ti.random(dtype=ti.i32) % 2
+
+    # Copy x into y with a struct-for over x, after ti.bit_vectorize(32).
+    @ti.kernel
+    def assign_vectorized():
+        ti.bit_vectorize(32)
+        for i, j in x:
+            y[i, j] = x[i, j]
+
+    # Assert y == x over the same index ranges that init() writes.
+    @ti.kernel
+    def verify():
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                            (boundary_offset, N - boundary_offset)):
+            assert y[i, j] == x[i, j]
+
+    # Fill x, copy it into y with the bit-vectorized loop, then compare.
+    init()
+    assign_vectorized()
+    verify()