diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp index 5776aca5a66cc..8fc36feaec3b6 100644 --- a/taichi/codegen/llvm/codegen_llvm.cpp +++ b/taichi/codegen/llvm/codegen_llvm.cpp @@ -241,6 +241,9 @@ std::unique_ptr CodeGenLLVM::emit_struct_meta_object( meta = std::make_unique("BitmaskedMeta", this, builder.get()); emit_struct_meta_base("Bitmasked", meta->ptr, snode); + } else if (snode->type == SNodeType::bit_array) { + meta = std::make_unique("DenseMeta", this, builder.get()); + emit_struct_meta_base("Dense", meta->ptr, snode); } else { TI_P(snode_type_name(snode->type)); TI_NOT_IMPLEMENTED; @@ -1725,16 +1728,16 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) { llvm::Function *body = nullptr; auto leaf_block = stmt->snode; - // When looping over bit_arrays, we always vectorize and generate struct for - // on their parent node (usually "dense") instead of itself for higher - // performance. Also, note that the loop must be bit_vectorized for - // bit_arrays, and their parent must be "dense". - if (leaf_block->type == SNodeType::bit_array) { - if (leaf_block->parent->type == SNodeType::dense) { + // For a bit-vectorized loop over a bit array, we generate struct for on its + // parent node (must be "dense") instead of itself for higher performance. + if (stmt->bit_vectorize != 1) { + if (leaf_block->type == SNodeType::bit_array && + leaf_block->parent->type == SNodeType::dense) { leaf_block = leaf_block->parent; } else { TI_ERROR( - "Struct-for looping through bit array but its parent is not dense") + "A bit-vectorized struct-for must loop over a bit array with a dense " + "parent"); } } @@ -1866,20 +1869,14 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) { create_call(refine, {parent_coordinates, new_coordinates, builder->CreateLoad(loop_index)}); - // One more refine step is needed for bit_arrays to make final coordinates - // non-consecutive, since each thread will process multiple - // coordinates via vectorization - if (stmt->snode->type == SNodeType::bit_array && stmt->snode->parent) { - if (stmt->snode->parent->type == SNodeType::dense) { - refine = - get_runtime_function(stmt->snode->refine_coordinates_func_name()); - - create_call(refine, - {new_coordinates, new_coordinates, tlctx->get_constant(0)}); - } else { - TI_ERROR( - "Struct-for looping through bit array but its parent is not dense"); - } + // For a bit-vectorized loop over a bit array, one more refine step is + // needed to make final coordinates non-consecutive, since each thread will + // process multiple coordinates via vectorization + if (stmt->bit_vectorize != 1) { + refine = + get_runtime_function(stmt->snode->refine_coordinates_func_name()); + create_call(refine, + {new_coordinates, new_coordinates, tlctx->get_constant(0)}); } current_coordinates = new_coordinates; @@ -1889,37 +1886,28 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) { // - if leaf block is bitmasked, make sure we only loop over active // voxels auto exec_cond = tlctx->get_constant(true); - auto snode = stmt->snode; - if (snode->type == SNodeType::bit_array && snode->parent) { - if (snode->parent->type == SNodeType::dense) { - snode = snode->parent; - } else { - TI_ERROR( - "Struct-for looping through bit array but its parent is not dense"); - } - } - auto coord_object = RuntimeObject(kLLVMPhysicalCoordinatesName, this, builder.get(), new_coordinates); if (!prog->config.packed) { - for (int i = 0; i < snode->num_active_indices; i++) { - auto j = snode->physical_index_position[i]; + for (int i = 0; i < leaf_block->num_active_indices; i++) { + auto j = leaf_block->physical_index_position[i]; if (!bit::is_power_of_two( - snode->extractors[j].num_elements_from_root)) { + leaf_block->extractors[j].num_elements_from_root)) { auto coord = coord_object.get("val", tlctx->get_constant(j)); exec_cond = builder->CreateAnd( - exec_cond, builder->CreateICmp( - llvm::CmpInst::ICMP_SLT, coord, - tlctx->get_constant( - snode->extractors[j].num_elements_from_root))); + exec_cond, + builder->CreateICmp( + llvm::CmpInst::ICMP_SLT, coord, + tlctx->get_constant( + leaf_block->extractors[j].num_elements_from_root))); } } } - if (snode->type == SNodeType::bitmasked || - snode->type == SNodeType::pointer) { + if (leaf_block->type == SNodeType::bitmasked || + leaf_block->type == SNodeType::pointer) { // test whether the current voxel is active or not - auto is_active = call(snode, element.get("element"), "is_active", + auto is_active = call(leaf_block, element.get("element"), "is_active", {builder->CreateLoad(loop_index)}); is_active = builder->CreateTrunc(is_active, llvm::Type::getInt1Ty(*llvm_context)); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 5acb9393f093a..e5fc3027156b8 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -14,7 +14,7 @@ TLANG_NAMESPACE_BEGIN struct ForLoopConfig { - int bit_vectorize{0}; + int bit_vectorize{1}; int num_cpu_threads{0}; bool strictly_serialized{false}; MemoryAccessOptions mem_access_opt; @@ -803,7 +803,7 @@ class ASTBuilder { } void reset() { - config.bit_vectorize = -1; + config.bit_vectorize = 1; config.num_cpu_threads = 0; config.uniform = false; config.mem_access_opt.clear(); diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h index 48fd37a078b9d..28a62611c9b37 100644 --- a/taichi/ir/ir_builder.h +++ b/taichi/ir/ir_builder.h @@ -103,17 +103,17 @@ class IRBuilder { // Control flows. RangeForStmt *create_range_for(Stmt *begin, Stmt *end, - int bit_vectorize = -1, + int bit_vectorize = 1, int num_cpu_threads = 0, int block_dim = 0, bool strictly_serialized = false); StructForStmt *create_struct_for(SNode *snode, - int bit_vectorize = -1, + int bit_vectorize = 1, int num_cpu_threads = 0, int block_dim = 0); MeshForStmt *create_mesh_for(mesh::Mesh *mesh, mesh::MeshElementType element_type, - int bit_vectorize = -1, + int bit_vectorize = 1, int num_cpu_threads = 0, int block_dim = 0); WhileStmt *create_while_true(); diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp index 06c45e67a8e74..30f9878dc7b03 100644 --- a/taichi/ir/snode.cpp +++ b/taichi/ir/snode.cpp @@ -243,8 +243,7 @@ std::string SNode::get_node_type_name() const { std::string SNode::get_node_type_name_hinted() const { std::string suffix; - if (type == SNodeType::place || type == SNodeType::bit_struct || - type == SNodeType::bit_array) + if (type == SNodeType::place || type == SNodeType::bit_struct) suffix = fmt::format("<{}>", dt->to_string()); if (is_bit_level) suffix += ""; diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp index 85298ee206b65..2c1511d2d90bf 100644 --- a/taichi/ir/statements.cpp +++ b/taichi/ir/statements.cpp @@ -392,6 +392,7 @@ std::unique_ptr OffloadedStmt::clone() const { new_stmt->grid_dim = grid_dim; new_stmt->block_dim = block_dim; new_stmt->reversed = reversed; + new_stmt->bit_vectorize = bit_vectorize; new_stmt->num_cpu_threads = num_cpu_threads; new_stmt->index_offsets = index_offsets; diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 08215d21172ee..68cfa719d916b 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -1172,6 +1172,7 @@ class OffloadedStmt : public Stmt { int grid_dim{1}; int block_dim{1}; bool reversed{false}; + int bit_vectorize{1}; int num_cpu_threads{1}; Stmt *end_stmt{nullptr}; std::string range_hint = ""; diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp index 135e0c7053e16..abc6935ed8419 100644 --- a/taichi/transforms/offload.cpp +++ b/taichi/transforms/offload.cpp @@ -189,9 +189,9 @@ class Offloader { if (!demotable) { for (int i = 1; i < path.size(); i++) { auto snode_child = path[i]; - if ((snode_child->type == SNodeType::bit_array || - snode_child->type == SNodeType::bit_struct) && - i == path.size() - 1) { + if (snode_child->type == SNodeType::bit_array && + for_stmt->bit_vectorize != 1) { + TI_ASSERT(i == path.size() - 1); continue; } auto offloaded_clear_list = Stmt::make_typed( @@ -248,6 +248,7 @@ class Offloader { } offloaded_struct_for->snode = for_stmt->snode; + offloaded_struct_for->bit_vectorize = for_stmt->bit_vectorize; offloaded_struct_for->num_cpu_threads = std::min(for_stmt->num_cpu_threads, config.cpu_max_num_threads); offloaded_struct_for->mem_access_opt = mem_access_opt; diff --git a/tests/python/test_bit_array.py b/tests/python/test_bit_array.py index ce49935e2d3a7..de28898124b0a 100644 --- a/tests/python/test_bit_array.py +++ b/tests/python/test_bit_array.py @@ -67,3 +67,37 @@ def verify_val(): set_val() verify_val() + + +@test_utils.test(require=ti.extension.quant, debug=True) +def test_bit_array_struct_for(): + block_size = 16 + N = 64 + cell = ti.root.pointer(ti.i, N // block_size) + qi7 = ti.types.quant.int(7) + + x = ti.field(dtype=qi7) + cell.dense(ti.i, block_size // 4).bit_array(ti.i, 4, num_bits=32).place(x) + + @ti.kernel + def activate(): + for i in range(N): + if i // block_size % 2 == 0: + x[i] = i + + @ti.kernel + def assign(): + for i in x: + x[i] -= 1 + + @ti.kernel + def verify(): + for i in range(N): + if i // block_size % 2 == 0: + assert x[i] == i - 1 + else: + assert x[i] == 0 + + activate() + assign() + verify()