From 6c9ad1d7e707833c33861feba37156e91b5efbd9 Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Fri, 1 Jan 2021 22:37:13 +0800
Subject: [PATCH 1/6] test all 8 common cases

---
 tests/python/test_bit_array_vectorization.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index 2e1b7dc0cbb9f..18ef8720068e7 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -93,3 +93,11 @@ def verify(dx: ti.template(), dy: ti.template()):
     verify(0, -1)
     assign_vectorized(-1, 0)
     verify(-1, 0)
+    assign_vectorized(1, 1)
+    verify(1, 1)
+    assign_vectorized(1, -1)
+    verify(1, -1)
+    assign_vectorized(-1, -1)
+    verify(-1, -1)
+    assign_vectorized(-1, 1)
+    verify(-1, 1)
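The eight (dx, dy) pairs above exhaust the axis-aligned and diagonal neighbor offsets. The interesting axis is dy = ±1: along ti.j the bits are packed 32 to a word, so a vectorized load at a j-offset straddles two adjacent words and has to be stitched together with shifts, while dx = ±1 simply reads the corresponding word of another row. Below is a plain-Python sketch of that word arithmetic — `offset_load` is an illustrative name, not the pass's actual codegen, and LSB-first packing into 32-bit words is assumed:

```python
MASK = 0xFFFFFFFF  # model 32-bit words with unbounded Python ints

def offset_load(cur, adj, dy):
    # cur holds lanes j..j+31 of one row; adj is the neighboring word the
    # offset spills into (next word for dy > 0, previous word for dy < 0)
    if dy == 0:
        return cur
    if dy > 0:  # vacated high lanes are refilled from the next word
        return ((cur >> dy) | (adj << (32 - dy))) & MASK
    return ((cur << -dy) | (adj >> (32 + dy))) & MASK  # dy < 0

# brute-force check against a flat bit list
import random
bits = [random.randint(0, 1) for _ in range(96)]
words = [sum(b << k for k, b in enumerate(bits[w * 32:(w + 1) * 32]))
         for w in range(3)]
for dy in (-1, 0, 1):
    adj = words[2] if dy > 0 else words[0]
    expected = sum(bits[32 + k + dy] << k for k in range(32))
    assert offset_load(words[1], adj, dy) == expected
```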
From e487acd384322cde3bc91bcebf9d87633ba8bdcc Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Tue, 5 Jan 2021 11:46:18 +0800
Subject: [PATCH 2/6] transform atomic add and clean up

---
 taichi/transforms/bit_loop_vectorize.cpp  | 54 +++++++++++++++++++++++
 taichi/transforms/compile_to_offloads.cpp |  1 +
 2 files changed, 55 insertions(+)

diff --git a/taichi/transforms/bit_loop_vectorize.cpp b/taichi/transforms/bit_loop_vectorize.cpp
index 02336580cfc21..da3f523c67f61 100644
--- a/taichi/transforms/bit_loop_vectorize.cpp
+++ b/taichi/transforms/bit_loop_vectorize.cpp
@@ -16,6 +16,7 @@ class BitLoopVectorize : public IRVisitor {
   bool in_struct_for_loop;
   StructForStmt *loop_stmt;
   PrimitiveType *bit_array_physical_type;
+  std::unordered_map<Stmt*, std::vector<Stmt*>> transformed_atomics;

   BitLoopVectorize() {
     allow_undefined_visitor = true;
@@ -155,10 +156,63 @@ class BitLoopVectorize : public IRVisitor {
     bit_array_physical_type = nullptr;
   }

+  void visit(AtomicOpStmt *stmt) override {
+    DataType dt(bit_array_physical_type);
+    if (in_struct_for_loop && bit_vectorize != 1 && stmt->op_type == AtomicOpType::add) {
+      auto it = transformed_atomics.find(stmt->dest);
+      // process a transformed atomic stmt
+      if (it != transformed_atomics.end()) {
+        auto& buffer_vec = it->second;
+        transform_atomic_add(buffer_vec, stmt, dt);
+      } else {
+        // alloc three buffers a, b, c
+        auto alloc_a = std::make_unique<AllocaStmt>(dt);
+        auto alloc_b = std::make_unique<AllocaStmt>(dt);
+        auto alloc_c = std::make_unique<AllocaStmt>(dt);
+        std::vector<Stmt*> buffer_vec{alloc_a.get(), alloc_b.get(), alloc_c.get()};
+        transformed_atomics[stmt->dest] = buffer_vec;
+        // modify IR
+        stmt->insert_before_me(std::move(alloc_a));
+        stmt->insert_before_me(std::move(alloc_b));
+        stmt->insert_before_me(std::move(alloc_c));
+        transform_atomic_add(buffer_vec, stmt, dt);
+      }
+    }
+  }
+
   static void run(IRNode *node) {
     BitLoopVectorize inst;
     node->accept(&inst);
   }
+
+ private:
+  void transform_atomic_add(const std::vector<Stmt*>& buffer_vec, AtomicOpStmt *stmt, DataType& dt) {
+    // To transform an atomic add on a vectorized subarray of a bit array,
+    // we use a local adder with three buffers (*a*, *b*, *c*) of the same physical type
+    // as the original bit array.
+    // Each bit in *a* represents the highest bit of the result, while *b* represents
+    // the second bit and *c* the lowest bit.
+    // To add *d* to the subarray, we use bit_xor and bit_and to compute the sum and the carry.
+    Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
+    auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
+    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_c.get(), stmt->val);
+    auto sum_c = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
+    auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
+    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_b.get(), carry_c.get());
+    auto sum_b = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
+    // for a, we do not need to compute its carry
+    auto sum_a = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
+    // modify IR
+    stmt->insert_before_me(std::move(load_c));
+    stmt->insert_before_me(std::move(carry_c));
+    stmt->insert_before_me(std::move(sum_c));
+    stmt->insert_before_me(std::move(load_b));
+    stmt->insert_before_me(std::move(carry_b));
+    stmt->insert_before_me(std::move(sum_b));
+    stmt->insert_before_me(std::move(sum_a));
+    // TODO: we need to do non-trivial replacement of the original atomic add stmt
+    // as it is only used in logical ifs
+  }
 };

 namespace irpass {
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 9511c8cee1df7..a08f920e852da 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -75,6 +75,7 @@ void compile_to_offloads(IRNode *ir,
   // create a separate CompileConfig flag for the new pass
   if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) {
     irpass::bit_loop_vectorize(ir);
+    irpass::type_check(ir);
     print("Bit Loop Vectorized");
     irpass::analysis::verify(ir);
   }
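For one 32-lane word, the transform above behaves like a per-lane 3-bit ripple-carry counter: each atomic add of a word of 0/1 values updates the low-bit plane c by xor, propagates a carry into b, and xors the final carry into a. The following plain-Python model mirrors the statement sequence transform_atomic_add emits (bit_and for each carry, bit_xor for each sum); `adder_add` is an illustrative helper, not code from this PR:

```python
def adder_add(a, b, c, d):
    # a, b, c: bit planes of a per-lane counter (a = highest, c = lowest);
    # d: one 0/1 increment per lane, 32 lanes per word
    carry_c = c & d       # carry out of the lowest bit (BinaryOpStmt bit_and)
    c ^= d                # sum of the lowest bit       (AtomicOpStmt bit_xor)
    carry_b = b & carry_c
    b ^= carry_c
    a ^= carry_b          # for a, no carry is computed -- it is dropped
    return a, b, c

# accumulate eight random 0/1 words and check every lane's count
import random
increments = [random.getrandbits(32) for _ in range(8)]
a = b = c = 0
for d in increments:
    a, b, c = adder_add(a, b, c, d)
for lane in range(32):
    count = sum((d >> lane) & 1 for d in increments)
    got = ((a >> lane) & 1) * 4 + ((b >> lane) & 1) * 2 + ((c >> lane) & 1)
    # the 3-bit counter wraps at 8, which is harmless for the == 2 / == 3
    # tests performed later in this series
    assert got == count % 8
```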
From d7840f394a19ff51fac13a86b48b61c1754c9335 Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Tue, 5 Jan 2021 20:59:20 +0800
Subject: [PATCH 3/6] format

---
 taichi/transforms/bit_loop_vectorize.cpp | 45 ++++++++++++++----------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/taichi/transforms/bit_loop_vectorize.cpp b/taichi/transforms/bit_loop_vectorize.cpp
index da3f523c67f61..7c10ff3514e59 100644
--- a/taichi/transforms/bit_loop_vectorize.cpp
+++ b/taichi/transforms/bit_loop_vectorize.cpp
@@ -16,7 +16,7 @@ class BitLoopVectorize : public IRVisitor {
   bool in_struct_for_loop;
   StructForStmt *loop_stmt;
   PrimitiveType *bit_array_physical_type;
-  std::unordered_map<Stmt*, std::vector<Stmt*>> transformed_atomics;
+  std::unordered_map<Stmt *, std::vector<Stmt *>> transformed_atomics;

   BitLoopVectorize() {
     allow_undefined_visitor = true;
@@ -158,18 +158,20 @@ class BitLoopVectorize : public IRVisitor {

   void visit(AtomicOpStmt *stmt) override {
     DataType dt(bit_array_physical_type);
-    if (in_struct_for_loop && bit_vectorize != 1 && stmt->op_type == AtomicOpType::add) {
+    if (in_struct_for_loop && bit_vectorize != 1 &&
+        stmt->op_type == AtomicOpType::add) {
       auto it = transformed_atomics.find(stmt->dest);
       // process a transformed atomic stmt
       if (it != transformed_atomics.end()) {
-        auto& buffer_vec = it->second;
+        auto &buffer_vec = it->second;
         transform_atomic_add(buffer_vec, stmt, dt);
       } else {
         // alloc three buffers a, b, c
         auto alloc_a = std::make_unique<AllocaStmt>(dt);
         auto alloc_b = std::make_unique<AllocaStmt>(dt);
         auto alloc_c = std::make_unique<AllocaStmt>(dt);
-        std::vector<Stmt*> buffer_vec{alloc_a.get(), alloc_b.get(), alloc_c.get()};
+        std::vector<Stmt *> buffer_vec{alloc_a.get(), alloc_b.get(),
+                                       alloc_c.get()};
         transformed_atomics[stmt->dest] = buffer_vec;
         // modify IR
         stmt->insert_before_me(std::move(alloc_a));
@@ -186,22 +188,29 @@ class BitLoopVectorize : public IRVisitor {
   }

  private:
-  void transform_atomic_add(const std::vector<Stmt*>& buffer_vec, AtomicOpStmt *stmt, DataType& dt) {
+  void transform_atomic_add(const std::vector<Stmt *> &buffer_vec,
+                            AtomicOpStmt *stmt,
+                            DataType &dt) {
     // To transform an atomic add on a vectorized subarray of a bit array,
-    // we use a local adder with three buffers (*a*, *b*, *c*) of the same physical type
-    // as the original bit array.
-    // Each bit in *a* represents the highest bit of the result, while *b* represents
-    // the second bit and *c* the lowest bit.
-    // To add *d* to the subarray, we use bit_xor and bit_and to compute the sum and the carry.
+    // we use a local adder with three buffers (*a*, *b*, *c*) of the same
+    // physical type as the original bit array. Each bit in *a* represents
+    // the highest bit of the result, while *b* represents the second bit and
+    // *c* the lowest bit. To add *d* to the subarray, we use bit_xor and
+    // bit_and to compute the sum and the carry.
     Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
     auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
-    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_c.get(), stmt->val);
-    auto sum_c = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
+    auto carry_c = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and,
+                                                  load_c.get(), stmt->val);
+    auto sum_c =
+        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, c, stmt->val);
     auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
-    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and, load_b.get(), carry_c.get());
-    auto sum_b = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
+    auto carry_b = std::make_unique<BinaryOpStmt>(BinaryOpType::bit_and,
+                                                  load_b.get(), carry_c.get());
+    auto sum_b =
+        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, b, carry_c.get());
     // for a, we do not need to compute its carry
-    auto sum_a = std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
+    auto sum_a =
+        std::make_unique<AtomicOpStmt>(AtomicOpType::bit_xor, a, carry_b.get());
     // modify IR
     stmt->insert_before_me(std::move(load_c));
     stmt->insert_before_me(std::move(carry_c));
     stmt->insert_before_me(std::move(sum_c));
     stmt->insert_before_me(std::move(load_b));
     stmt->insert_before_me(std::move(carry_b));
     stmt->insert_before_me(std::move(sum_b));
     stmt->insert_before_me(std::move(sum_a));
-    // TODO: we need to do non-trivial replacement of the original atomic add stmt
-    // as it is only used in logical ifs
+    // TODO: we need to do non-trivial replacement of the original atomic add
+    // stmt as it is only used in logical ifs
   }
 };
From 9daee57a171000c38b9840c6d8a6605769f12eba Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Wed, 6 Jan 2021 03:28:03 +0800
Subject: [PATCH 4/6] transform boolean expr and make test pass

---
 taichi/ir/statements.h                       |  13 ++-
 taichi/transforms/bit_loop_vectorize.cpp     | 107 ++++++++++++++++++-
 tests/python/test_bit_array_vectorization.py |  74 +++++++++++++
 3 files changed, 188 insertions(+), 6 deletions(-)

diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
index dc8f77e7c93c5..3666219b4de7b 100644
--- a/taichi/ir/statements.h
+++ b/taichi/ir/statements.h
@@ -141,9 +141,16 @@ class BinaryOpStmt : public Stmt {
  public:
   BinaryOpType op_type;
   Stmt *lhs, *rhs;
+  bool is_bit_vectorized;

-  BinaryOpStmt(BinaryOpType op_type, Stmt *lhs, Stmt *rhs)
-      : op_type(op_type), lhs(lhs), rhs(rhs) {
+  BinaryOpStmt(BinaryOpType op_type,
+               Stmt *lhs,
+               Stmt *rhs,
+               bool is_bit_vectorized = false)
+      : op_type(op_type),
+        lhs(lhs),
+        rhs(rhs),
+        is_bit_vectorized(is_bit_vectorized) {
     TI_ASSERT(!lhs->is<AllocaStmt>());
     TI_ASSERT(!rhs->is<AllocaStmt>());
     TI_STMT_REG_FIELDS;
@@ -153,7 +160,7 @@ class BinaryOpStmt : public Stmt {
     return false;
   }

-  TI_STMT_DEF_FIELDS(ret_type, op_type, lhs, rhs);
+  TI_STMT_DEF_FIELDS(ret_type, op_type, lhs, rhs, is_bit_vectorized);
   TI_DEFINE_ACCEPT_AND_CLONE
 };

diff --git a/taichi/transforms/bit_loop_vectorize.cpp b/taichi/transforms/bit_loop_vectorize.cpp
index 7c10ff3514e59..266970e6f4fe0 100644
--- a/taichi/transforms/bit_loop_vectorize.cpp
+++ b/taichi/transforms/bit_loop_vectorize.cpp
@@ -156,6 +156,90 @@ class BitLoopVectorize : public IRVisitor {
     bit_array_physical_type = nullptr;
   }

+  void visit(BinaryOpStmt *stmt) override {
+    // vectorize cmp_eq and bit_and between
+    // vectorized data (local adder/array elems) and a constant
+    if (in_struct_for_loop && bit_vectorize != 1) {
+      if (stmt->op_type == BinaryOpType::bit_and) {
+        // if the rhs is a bit-vectorized stmt and the lhs is a const 1
+        // (usually generated by a boolean expr), we simply replace
+        // the stmt with its rhs
+        int lhs_val = get_constant_value(stmt->lhs);
+        if (lhs_val == 1) {
+          if (auto rhs = stmt->rhs->cast<BinaryOpStmt>();
+              rhs && rhs->is_bit_vectorized) {
+            stmt->replace_with(stmt->rhs);
+          }
+        }
+      } else if (stmt->op_type == BinaryOpType::cmp_eq) {
+        if (auto lhs = stmt->lhs->cast<GlobalLoadStmt>()) {
+          // case 0: lhs is a vectorized global load from the bit array
+          if (auto ptr = lhs->ptr->cast<GlobalPtrStmt>();
+              ptr && ptr->is_bit_vectorized) {
+            int32 rhs_val = get_constant_value(stmt->rhs);
+            // TODO: we limit 1 for now, 0 should be easy to implement by a
+            // bit_not on the original bit pattern
+            TI_ASSERT(rhs_val == 1);
+            // cmp_eq with 1 yields the bit pattern itself
+
+            // to pass CFG analysis and mark the stmt vectorized
+            // create a dummy lhs + 0 here
+            auto zero = std::make_unique<ConstStmt>(TypedConstant(0));
+            auto add = std::make_unique<BinaryOpStmt>(BinaryOpType::add,
+                                                      stmt->lhs, zero.get());
+            add->is_bit_vectorized = true;
+            // modify IR
+            auto zero_p = zero.get();
+            stmt->insert_before_me(std::move(zero));
+            stmt->replace_with(add.get());
+            zero_p->insert_after_me(std::move(add));
+          }
+        } else if (auto lhs = stmt->lhs->cast<LocalLoadStmt>()) {
+          // case 1: lhs is a local load from a local adder structure
+          auto it = transformed_atomics.find(lhs->ptr[0].var);
+          if (it != transformed_atomics.end()) {
+            int32 rhs_val = get_constant_value(stmt->rhs);
+            // TODO: we limit 2 and 3 for now, the other cases should be
+            // implemented in a similar fashion
+            TI_ASSERT(rhs_val == 2 || rhs_val == 3);
+            // 010 and 011 respectively
+            auto &buffer_vec = it->second;
+            Stmt *a = buffer_vec[0], *b = buffer_vec[1], *c = buffer_vec[2];
+            // load all three buffers
+            auto load_a = std::make_unique<LocalLoadStmt>(LocalAddress(a, 0));
+            auto load_b = std::make_unique<LocalLoadStmt>(LocalAddress(b, 0));
+            auto load_c = std::make_unique<LocalLoadStmt>(LocalAddress(c, 0));
+            // compute not_a first
+            auto not_a = std::make_unique<UnaryOpStmt>(UnaryOpType::bit_not,
+                                                       load_a.get());
+            // b should always be itself so do nothing
+            // compute not_c
+            auto not_c = std::make_unique<UnaryOpStmt>(UnaryOpType::bit_not,
+                                                       load_c.get());
+            // bit_and all three patterns
+            auto and_a_b = std::make_unique<BinaryOpStmt>(
+                BinaryOpType::bit_and, not_a.get(), load_b.get());
+            auto and_b_c = std::make_unique<BinaryOpStmt>(
+                BinaryOpType::bit_and, and_a_b.get(),
+                rhs_val == 2 ? (Stmt *)(not_c.get()) : (Stmt *)(load_c.get()));
+            // mark the last stmt as vectorized
+            and_b_c->is_bit_vectorized = true;
+            // modify IR
+            auto and_a_b_p = and_a_b.get();
+            stmt->insert_before_me(std::move(load_a));
+            stmt->insert_before_me(std::move(load_b));
+            stmt->insert_before_me(std::move(load_c));
+            stmt->insert_before_me(std::move(not_a));
+            stmt->insert_before_me(std::move(not_c));
+            stmt->insert_before_me(std::move(and_a_b));
+            stmt->replace_with(and_b_c.get());
+            and_a_b_p->insert_after_me(std::move(and_b_c));
+          }
+        }
+      }
+    }
+  }
+
   void visit(AtomicOpStmt *stmt) override {
     DataType dt(bit_array_physical_type);
     if (in_struct_for_loop && bit_vectorize != 1 &&
@@ -219,8 +303,24 @@ class BitLoopVectorize : public IRVisitor {
     stmt->insert_before_me(std::move(carry_b));
     stmt->insert_before_me(std::move(sum_b));
     stmt->insert_before_me(std::move(sum_a));
-    // TODO: we need to do non-trivial replacement of the original atomic add
-    // stmt as it is only used in logical ifs
+    // there is no need to replace the stmt here as we
+    // will replace it manually later
   }
+
+  int32 get_constant_value(Stmt *stmt) {
+    int32 val = -1;
+    // the stmt could be a cast stmt
+    if (auto cast_stmt = stmt->cast<UnaryOpStmt>();
+        cast_stmt && cast_stmt->is_cast() &&
+        cast_stmt->op_type == UnaryOpType::cast_value) {
+      stmt = cast_stmt->operand;
+    }
+    if (auto constant_stmt = stmt->cast<ConstStmt>();
+        constant_stmt &&
+        constant_stmt->val[0].dt->is_primitive(PrimitiveTypeID::i32)) {
+      val = constant_stmt->val[0].val_i32;
+    }
+    return val;
+  }
 };

@@ -228,7 +328,8 @@ namespace irpass {

 void bit_loop_vectorize(IRNode *root) {
   TI_AUTO_PROF;
-  return BitLoopVectorize::run(root);
+  BitLoopVectorize::run(root);
+  die(root);
 }

 }  // namespace irpass
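Both cmp_eq cases reduce an equality test against a constant to whole-word bitwise arithmetic. In case 0, comparing a loaded 1-bit value with 1 yields the loaded bit pattern itself, so the pass only wraps it in a dummy `lhs + 0` marked is_bit_vectorized to survive CFG analysis. In case 1, a lane's count lives in the (a, b, c) bit planes, so count == 2 (binary 010) becomes ~a & b & ~c and count == 3 (binary 011) becomes ~a & b & c; the bit_and branch then drops the const-1 mask that boolean lowering wraps around such predicates. A quick exhaustive check of the two patterns in plain Python (not Taichi code):

```python
def eq2(a, b, c):
    return ~a & b & ~c  # count == 2: bit pattern 010

def eq3(a, b, c):
    return ~a & b & c   # count == 3: bit pattern 011

for count in range(8):  # every value a 3-bit lane counter can hold
    a, b, c = (count >> 2) & 1, (count >> 1) & 1, count & 1
    assert (eq2(a, b, c) & 1) == (count == 2)
    assert (eq3(a, b, c) & 1) == (count == 3)
```

The new test below drives both cases at once, since the Game of Life update is exactly `(n == 3) or (n == 2 and cell == 1)`.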
diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index 18ef8720068e7..e39ebec20883c 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -101,3 +101,77 @@ def verify(dx: ti.template(), dy: ti.template()):
     verify(-1, -1)
     assign_vectorized(-1, 1)
     verify(-1, 1)
+
+
+@ti.test(require=ti.extension.quant)
+def test_evolve():
+    ci1 = ti.type_factory.custom_int(1, False)
+
+    x = ti.field(dtype=ci1)
+    y = ti.field(dtype=ci1)
+    z = ti.field(dtype=ci1)
+
+    N = 4096
+    n_blocks = 4
+    bits = 32
+    boundary_offset = 1024
+    assert boundary_offset >= N // n_blocks
+
+    block = ti.root.pointer(ti.ij, (n_blocks, n_blocks))
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(x)
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(y)
+    block.dense(ti.ij, (N // n_blocks, N // (bits * n_blocks)))._bit_array(
+        ti.j, bits, num_bits=bits).place(z)
+
+    @ti.kernel
+    def init():
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                               (boundary_offset, N - boundary_offset)):
+            x[i, j] = ti.random(dtype=ti.i32) % 2
+
+    @ti.kernel
+    def evolve_vectorized(x: ti.template(), y: ti.template()):
+        ti.bit_vectorize(32)
+        for i, j in x:
+            num_active_neighbors = 0
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
+                                                      and x[i, j] == 1)
+
+    @ti.kernel
+    def evolve_naive(x: ti.template(), y: ti.template()):
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                               (boundary_offset, N - boundary_offset)):
+            num_active_neighbors = 0
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
+                                                      and x[i, j] == 1)
+
+    @ti.kernel
+    def verify():
+        for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
+                               (boundary_offset, N - boundary_offset)):
+            assert y[i, j] == z[i, j]
+
+    init()
+    evolve_naive(x, z)
+    evolve_vectorized(x, y)
+    verify()
From 9dcad62837da1448df960e4869177bb10d342745 Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Wed, 6 Jan 2021 08:43:29 +0800
Subject: [PATCH 5/6] refine test case

---
 tests/python/test_bit_array_vectorization.py | 38 +++++++++-----------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index e39ebec20883c..28a30983f0a6c 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -103,7 +103,7 @@ def verify(dx: ti.template(), dy: ti.template()):
     verify(-1, 1)


-@ti.test(require=ti.extension.quant)
+@ti.test(require=ti.extension.quant, debug=True)
 def test_evolve():
     ci1 = ti.type_factory.custom_int(1, False)

@@ -131,20 +131,25 @@ def init():
                                (boundary_offset, N - boundary_offset)):
             x[i, j] = ti.random(dtype=ti.i32) % 2

+    @ti.func
+    def compute_active_neighbors(x: ti.template(), i: ti.template(),
+                                 j: ti.template()):
+        num_active_neighbors = 0
+        num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+        num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+        num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+        num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+        return num_active_neighbors
+
     @ti.kernel
     def evolve_vectorized(x: ti.template(), y: ti.template()):
         ti.bit_vectorize(32)
         for i, j in x:
-            num_active_neighbors = 0
-            num_active_neighbors = 0
-            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            num_active_neighbors = compute_active_neighbors(x, i, j)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)

@@ -151,16 +156,7 @@ def evolve_vectorized(x: ti.template(), y: ti.template()):
     @ti.kernel
     def evolve_naive(x: ti.template(), y: ti.template()):
         for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                                (boundary_offset, N - boundary_offset)):
-            num_active_neighbors = 0
-            num_active_neighbors = 0
-            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
-            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
+            num_active_neighbors = compute_active_neighbors(x, i, j)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)
From 6d5a002ceac0379440e031d825697f6b7a56be4e Mon Sep 17 00:00:00 2001
From: Xuanda Yang
Date: Wed, 6 Jan 2021 21:48:43 +0800
Subject: [PATCH 6/6] inline function

---
 tests/python/test_bit_array_vectorization.py | 34 +++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py
index 28a30983f0a6c..df663269df016 100644
--- a/tests/python/test_bit_array_vectorization.py
+++ b/tests/python/test_bit_array_vectorization.py
@@ -131,25 +131,19 @@ def init():
                                (boundary_offset, N - boundary_offset)):
             x[i, j] = ti.random(dtype=ti.i32) % 2

-    @ti.func
-    def compute_active_neighbors(x: ti.template(), i: ti.template(),
-                                 j: ti.template()):
-        num_active_neighbors = 0
-        num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
-        num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
-        num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
-        num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
-        return num_active_neighbors
-
     @ti.kernel
     def evolve_vectorized(x: ti.template(), y: ti.template()):
         ti.bit_vectorize(32)
         for i, j in x:
-            num_active_neighbors = compute_active_neighbors(x, i, j)
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)

@@ -156,7 +150,15 @@ def evolve_vectorized(x: ti.template(), y: ti.template()):
     @ti.kernel
     def evolve_naive(x: ti.template(), y: ti.template()):
         for i, j in ti.ndrange((boundary_offset, N - boundary_offset),
                                (boundary_offset, N - boundary_offset)):
-            num_active_neighbors = compute_active_neighbors(x, i, j)
+            num_active_neighbors = 0
+            num_active_neighbors += ti.cast(x[i - 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i - 1, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i, j + 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j - 1], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j], ti.u32)
+            num_active_neighbors += ti.cast(x[i + 1, j + 1], ti.u32)
             y[i, j] = (num_active_neighbors == 3) or (num_active_neighbors == 2
                                                       and x[i, j] == 1)
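Taken together, the series turns the Game of Life kernel into pure word-level bit operations, 32 cells at a time. An end-to-end plain-Python model of what one vectorized step computes — `life_step_word` is an illustrative function under the same assumptions as the earlier sketches, not part of this PR:

```python
MASK = 0xFFFFFFFF

def life_step_word(neighbor_words, cur):
    # neighbor_words: eight 32-bit words, pre-shifted so lane k of each word
    # holds one neighbor of cell k (the shifted loads tested in PATCH 1)
    a = b = c = 0
    for d in neighbor_words:  # three-buffer adder from PATCH 2
        carry_c = c & d
        c ^= d
        carry_b = b & carry_c
        b ^= carry_c
        a ^= carry_b
    eq3 = ~a & b & c          # count == 3 (PATCH 4)
    eq2 = ~a & b & ~c         # count == 2
    return (eq3 | (eq2 & cur)) & MASK  # (n == 3) or (n == 2 and alive)

# lane 0 sees exactly two live neighbors:
neighbors = [1, 1, 0, 0, 0, 0, 0, 0]
assert (life_step_word(neighbors, cur=1) & 1) == 1  # live cell survives
assert (life_step_word(neighbors, cur=0) & 1) == 0  # dead cell stays dead
```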