From e2b58188ff46287d825a511f740a135f3891968b Mon Sep 17 00:00:00 2001
From: Haidong Lan
Date: Fri, 9 Dec 2022 11:22:19 +0800
Subject: [PATCH 1/2] Try to move the runtime CPU block loop into codegen

---
 taichi/codegen/cpu/codegen_cpu.cpp            | 38 +++++++++------
 taichi/codegen/llvm/codegen_llvm.cpp          | 48 +++++++++++++++++++
 taichi/codegen/llvm/codegen_llvm.h            |  2 +
 .../runtime/llvm/runtime_module/runtime.cpp   | 36 +++++++-------
 4 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index e1c17107df3a9..d510adec0f573 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -38,15 +38,25 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
     // The loop body
     llvm::Function *body;
     {
+      // auto guard = get_function_creation_guard(
+      //     {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
+      //      llvm::Type::getInt8PtrTy(*llvm_context),
+      //      tlctx->get_data_type<int>()});
+
+      // auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
+      // loop_vars_llvm[stmt].push_back(loop_var);
+      // builder->CreateStore(get_arg(2), loop_var);
+      // stmt->body->accept(this);
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
            llvm::Type::getInt8PtrTy(*llvm_context),
-           tlctx->get_data_type<int>()});
+           tlctx->get_data_type<int>(), tlctx->get_data_type<int>()});
 
-      auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
-      loop_vars_llvm[stmt].push_back(loop_var);
-      builder->CreateStore(get_arg(2), loop_var);
-      stmt->body->accept(this);
+      auto begin_var = builder->CreateAlloca(tlctx->get_data_type(PrimitiveType::i32), (unsigned)0, nullptr);
+      auto end_var = builder->CreateAlloca(tlctx->get_data_type(PrimitiveType::i32), (unsigned)0, nullptr);
+      builder->CreateStore(get_arg(2), begin_var);
+      builder->CreateStore(get_arg(3), end_var);
+      create_cpu_block_range_for(stmt, begin_var, end_var);
 
       body = guard.body;
     }
@@ -55,15 +65,15 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
     auto [begin, end] = get_range_for_bounds(stmt);
 
-    // adaptive block_dim
-    if (prog->this_thread_config().cpu_block_dim_adaptive) {
-      int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step);
-      int num_threads = stmt->num_cpu_threads;
-      int items_per_thread = std::max(1, num_items / (num_threads * 32));
-      // keep each task has at least 512 items to amortize scheduler overhead
-      // also saturate the value to 1024 for better load balancing
-      stmt->block_dim = std::min(1024, std::max(512, items_per_thread));
-    }
+    // // adaptive block_dim
+    // if (prog->this_thread_config().cpu_block_dim_adaptive) {
+    //   int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step);
+    //   int num_threads = stmt->num_cpu_threads;
+    //   int items_per_thread = std::max(1, num_items / (num_threads * 32));
+    //   // keep each task has at least 512 items to amortize scheduler overhead
+    //   // also saturate the value to 1024 for better load balancing
+    //   stmt->block_dim = std::min(1024, std::max(512, items_per_thread));
+    // }
 
     call("cpu_parallel_range_for", get_arg(0),
          tlctx->get_constant(stmt->num_cpu_threads), begin, end,
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 428661f231bcb..013d9a2dfd2d8 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -1128,6 +1128,54 @@ void TaskCodeGenLLVM::create_increment(llvm::Value *ptr, llvm::Value *value) {
   builder->CreateStore(builder->CreateAdd(original_value, value), ptr);
 }
 
+void TaskCodeGenLLVM::create_cpu_block_range_for(OffloadedStmt* stmt, llvm::Value *begin_var, llvm::Value *end_var) {
+  using namespace llvm;
+  BasicBlock *body = BasicBlock::Create(*llvm_context, "for_loop_body", func);
+  BasicBlock *loop_inc =
+      BasicBlock::Create(*llvm_context, "for_loop_inc", func);
+  BasicBlock *after_loop = BasicBlock::Create(*llvm_context, "after_for", func);
+  BasicBlock *loop_test =
+      BasicBlock::Create(*llvm_context, "for_loop_test", func);
+
+  auto loop_var_ty = tlctx->get_data_type(PrimitiveType::i32);
+  auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
+  loop_vars_llvm[stmt].push_back(loop_var);
+  builder->CreateStore(builder->CreateLoad(loop_var_ty, begin_var), loop_var);
+  
+  builder->CreateBr(loop_test);
+  {
+    // test block
+    builder->SetInsertPoint(loop_test);
+    llvm::Value *cond;
+    cond = builder->CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT,
+                               builder->CreateLoad(loop_var_ty, loop_var),
+                               builder->CreateLoad(loop_var_ty, end_var));
+    // loop_var, end_var);
+    builder->CreateCondBr(cond, body, after_loop);
+  }
+  {
+    {
+      auto lrg = make_loop_reentry_guard(this);
+      // The continue stmt should jump to the loop-increment block!
+      current_loop_reentry = loop_inc;
+      // body cfg
+      builder->SetInsertPoint(body);
+      stmt->body->accept(this);
+    }
+    if (!returned) {
+      builder->CreateBr(loop_inc);
+    } else {
+      returned = false;
+    }
+    builder->SetInsertPoint(loop_inc);
+
+    create_increment(loop_var, tlctx->get_constant(1));
+    builder->CreateBr(loop_test);
+  }
+  // next cfg
+  builder->SetInsertPoint(after_loop);
+}
+
 void TaskCodeGenLLVM::create_naive_range_for(RangeForStmt *for_stmt) {
   using namespace llvm;
   BasicBlock *body = BasicBlock::Create(*llvm_context, "for_loop_body", func);
diff --git a/taichi/codegen/llvm/codegen_llvm.h b/taichi/codegen/llvm/codegen_llvm.h
index 52c9f2694d319..1b97129e727a2 100644
--- a/taichi/codegen/llvm/codegen_llvm.h
+++ b/taichi/codegen/llvm/codegen_llvm.h
@@ -164,6 +164,8 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
   // Direct translation
   void create_naive_range_for(RangeForStmt *for_stmt);
 
+  void create_cpu_block_range_for(OffloadedStmt* stmt, llvm::Value *begin_var, llvm::Value *end_var);
+
   static std::string get_runtime_snode_name(SNode *snode);
 
   void visit(Block *stmt_list) override;
diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp
index 935675e183af2..e060f5e7e5370 100644
--- a/taichi/runtime/llvm/runtime_module/runtime.cpp
+++ b/taichi/runtime/llvm/runtime_module/runtime.cpp
@@ -40,7 +40,7 @@ using host_vsnprintf_type = int (*)(char *,
                                     const char *,
                                     std::va_list);
 using vm_allocator_type = void *(*)(void *, std::size_t, std::size_t);
-using RangeForTaskFunc = void(RuntimeContext *, const char *tls, int i);
+using RangeForTaskFunc = void(RuntimeContext *, const char *tls, int block_begin, int block_end);
 using MeshForTaskFunc = void(RuntimeContext *, const char *tls, uint32_t i);
 using parallel_for_type = void (*)(void *thread_pool,
                                    int splits,
@@ -1466,14 +1466,16 @@ void cpu_parallel_range_for_task(void *range_context,
   if (ctx.step == 1) {
     int block_start = ctx.begin + task_id * ctx.block_size;
     int block_end = std::min(block_start + ctx.block_size, ctx.end);
-    for (int i = block_start; i < block_end; i++) {
-      ctx.body(&this_thread_context, tls_ptr, i);
-    }
+    // for (int i = block_start; i < block_end; i++) {
+    //   ctx.body(&this_thread_context, tls_ptr, i);
+    // }
+    printf("@@@@@ block_start %d, block_end %d\n", block_start, block_end);
+    ctx.body(&this_thread_context, tls_ptr, block_start, block_end);
   } else if (ctx.step == -1) {
     int block_start = ctx.end - task_id * ctx.block_size;
     int block_end = std::max(ctx.begin, block_start - ctx.block_size);
     for (int i = block_start - 1; i >= block_end; i--) {
-      ctx.body(&this_thread_context, tls_ptr, i);
+      // ctx.body(&this_thread_context, tls_ptr, i);
     }
   }
   if (ctx.epilogue)
@@ -1517,17 +1519,17 @@ void gpu_parallel_range_for(RuntimeContext *context,
                             RangeForTaskFunc *func,
                             range_for_xlogue epilogue,
                             const std::size_t tls_size) {
-  int idx = thread_idx() + block_dim() * block_idx() + begin;
-  alignas(8) char tls_buffer[tls_size];
-  auto tls_ptr = &tls_buffer[0];
-  if (prologue)
-    prologue(context, tls_ptr);
-  while (idx < end) {
-    func(context, tls_ptr, idx);
-    idx += block_dim() * grid_dim();
-  }
-  if (epilogue)
-    epilogue(context, tls_ptr);
+  // int idx = thread_idx() + block_dim() * block_idx() + begin;
+  // alignas(8) char tls_buffer[tls_size];
+  // auto tls_ptr = &tls_buffer[0];
+  // if (prologue)
+  //   prologue(context, tls_ptr);
+  // while (idx < end) {
+  //   func(context, tls_ptr, idx);
+  //   idx += block_dim() * grid_dim();
+  // }
+  // if (epilogue)
+  //   epilogue(context, tls_ptr);
 }
 
 struct mesh_task_helper_context {
@@ -1556,7 +1558,7 @@ void cpu_parallel_mesh_for_task(void *range_context,
     for (int idx = block_start; idx < block_end; idx++) {
       if (ctx.prologue)
         ctx.prologue(ctx.context, tls_ptr, idx);
-      ctx.body(&this_thread_context, tls_ptr, idx);
+      // ctx.body(&this_thread_context, tls_ptr, idx);
       if (ctx.epilogue)
         ctx.epilogue(ctx.context, tls_ptr, idx);
     }

From 15e8952eda090d645f0d3a198bca0ad88dbc68f2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 9 Dec 2022 03:25:07 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 taichi/codegen/cpu/codegen_cpu.cpp             | 10 ++++++----
 taichi/codegen/llvm/codegen_llvm.cpp           |  8 +++++---
 taichi/codegen/llvm/codegen_llvm.h             |  4 +++-
 taichi/runtime/llvm/runtime_module/runtime.cpp |  5 ++++-
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index d510adec0f573..fad4ae4f0e223 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -49,11 +49,13 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
       // stmt->body->accept(this);
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           llvm::Type::getInt8PtrTy(*llvm_context),
-           tlctx->get_data_type<int>(), tlctx->get_data_type<int>()});
+           llvm::Type::getInt8PtrTy(*llvm_context), tlctx->get_data_type<int>(),
+           tlctx->get_data_type<int>()});
 
-      auto begin_var = builder->CreateAlloca(tlctx->get_data_type(PrimitiveType::i32), (unsigned)0, nullptr);
-      auto end_var = builder->CreateAlloca(tlctx->get_data_type(PrimitiveType::i32), (unsigned)0, nullptr);
+      auto begin_var = builder->CreateAlloca(
+          tlctx->get_data_type(PrimitiveType::i32), (unsigned)0, nullptr);
+      auto end_var = builder->CreateAlloca(
+          tlctx->get_data_type(PrimitiveType::i32), (unsigned)0, nullptr);
       builder->CreateStore(get_arg(2), begin_var);
       builder->CreateStore(get_arg(3), end_var);
       create_cpu_block_range_for(stmt, begin_var, end_var);
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 013d9a2dfd2d8..8f4e353f8d771 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -1128,7 +1128,9 @@ void TaskCodeGenLLVM::create_increment(llvm::Value *ptr, llvm::Value *value) {
   builder->CreateStore(builder->CreateAdd(original_value, value), ptr);
 }
 
-void TaskCodeGenLLVM::create_cpu_block_range_for(OffloadedStmt* stmt, llvm::Value *begin_var, llvm::Value *end_var) {
+void TaskCodeGenLLVM::create_cpu_block_range_for(OffloadedStmt *stmt,
+                                                 llvm::Value *begin_var,
+                                                 llvm::Value *end_var) {
   using namespace llvm;
   BasicBlock *body = BasicBlock::Create(*llvm_context, "for_loop_body", func);
   BasicBlock *loop_inc =
@@ -1141,7 +1143,7 @@ void TaskCodeGenLLVM::create_cpu_block_range_for(OffloadedStmt* stmt, llvm::Valu
   auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
   loop_vars_llvm[stmt].push_back(loop_var);
   builder->CreateStore(builder->CreateLoad(loop_var_ty, begin_var), loop_var);
-  
+
   builder->CreateBr(loop_test);
   {
     // test block
@@ -1150,7 +1152,7 @@ void TaskCodeGenLLVM::create_cpu_block_range_for(OffloadedStmt* stmt, llvm::Valu
     cond = builder->CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT,
                                builder->CreateLoad(loop_var_ty, loop_var),
                                builder->CreateLoad(loop_var_ty, end_var));
-    // loop_var, end_var);
+    //   loop_var, end_var);
     builder->CreateCondBr(cond, body, after_loop);
   }
   {
diff --git a/taichi/codegen/llvm/codegen_llvm.h b/taichi/codegen/llvm/codegen_llvm.h
index 1b97129e727a2..191a192ceaeb7 100644
--- a/taichi/codegen/llvm/codegen_llvm.h
+++ b/taichi/codegen/llvm/codegen_llvm.h
@@ -164,7 +164,9 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
   // Direct translation
   void create_naive_range_for(RangeForStmt *for_stmt);
 
-  void create_cpu_block_range_for(OffloadedStmt* stmt, llvm::Value *begin_var, llvm::Value *end_var);
+  void create_cpu_block_range_for(OffloadedStmt *stmt,
+                                  llvm::Value *begin_var,
+                                  llvm::Value *end_var);
 
   static std::string get_runtime_snode_name(SNode *snode);
 
diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp
index e060f5e7e5370..fc14244c94b2d 100644
--- a/taichi/runtime/llvm/runtime_module/runtime.cpp
+++ b/taichi/runtime/llvm/runtime_module/runtime.cpp
@@ -40,7 +40,10 @@ using host_vsnprintf_type = int (*)(char *,
                                     const char *,
                                     std::va_list);
 using vm_allocator_type = void *(*)(void *, std::size_t, std::size_t);
-using RangeForTaskFunc = void(RuntimeContext *, const char *tls, int block_begin, int block_end);
+using RangeForTaskFunc = void(RuntimeContext *,
+                              const char *tls,
+                              int block_begin,
+                              int block_end);
 using MeshForTaskFunc = void(RuntimeContext *, const char *tls, uint32_t i);
 using parallel_for_type = void (*)(void *thread_pool,
                                    int splits,
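
Note on the contract change: before this series, cpu_parallel_range_for_task invoked
the generated body once per index; after it, the scheduler hands the body a whole
[block_begin, block_end) chunk and create_cpu_block_range_for emits the loop inside
the generated kernel, so LLVM sees the trip count and can optimize across the block
instead of paying one indirect call per element. Below is a minimal standalone C++
sketch of the new step == 1 dispatch; fake_kernel_body and run_block_task are
illustrative stand-ins, not Taichi APIs (RuntimeContext is dropped for brevity).

#include <algorithm>
#include <cstdio>

// Block-wise task signature mirroring the patched RangeForTaskFunc.
using BlockTaskFunc = void(const char *tls, int block_begin, int block_end);

// Stand-in for the LLVM-emitted kernel body: after this patch the generated
// function owns the inner loop (what create_cpu_block_range_for emits).
static void fake_kernel_body(const char *tls, int block_begin, int block_end) {
  (void)tls;  // thread-local storage unused in this sketch
  for (int i = block_begin; i < block_end; i++)
    std::printf("item %d\n", i);
}

// Stand-in for cpu_parallel_range_for_task in the step == 1 case:
// compute the block bounds, then make one call per block, not per index.
static void run_block_task(BlockTaskFunc *body,
                           int task_id,
                           int begin,
                           int end,
                           int block_size) {
  int block_start = begin + task_id * block_size;
  int block_end = std::min(block_start + block_size, end);
  body(nullptr, block_start, block_end);
}

int main() {
  // 10 items in blocks of 4 -> tasks cover [0,4), [4,8), [8,10).
  for (int task_id = 0; task_id < 3; task_id++)
    run_block_task(fake_kernel_body, task_id, /*begin=*/0, /*end=*/10,
                   /*block_size=*/4);
  return 0;
}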