From a00bd0449822c292267d4a8b48ef8ca4706589d3 Mon Sep 17 00:00:00 2001 From: xumingkuan Date: Fri, 2 Apr 2021 13:59:24 +0800 Subject: [PATCH 1/4] [refactor] [ir] Rename "parallelize" to "num_cpu_threads" --- taichi/ir/frontend_ir.cpp | 16 ++++++++-------- taichi/ir/frontend_ir.h | 2 +- taichi/ir/ir.cpp | 2 +- taichi/ir/ir.h | 4 ++-- taichi/ir/ir_builder.cpp | 10 +++++----- taichi/ir/ir_builder.h | 4 ++-- taichi/ir/statements.cpp | 15 ++++++++------- taichi/ir/statements.h | 12 ++++++------ taichi/python/export_lang.cpp | 2 +- taichi/transforms/lower_ast.cpp | 4 ++-- taichi/transforms/offload.cpp | 4 ++-- 11 files changed, 38 insertions(+), 37 deletions(-) diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 3de97f000e316..04171fcf02de8 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -32,18 +32,18 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var, : global_var(global_var) { vectorize = dec.vectorize; bit_vectorize = dec.bit_vectorize; - parallelize = dec.parallelize; + num_cpu_threads = dec.num_cpu_threads; strictly_serialized = dec.strictly_serialized; block_dim = dec.block_dim; auto cfg = get_current_program().config; if (cfg.arch == Arch::cuda) { vectorize = 1; - parallelize = 1; + num_cpu_threads = 1; TI_ASSERT(block_dim <= taichi_max_gpu_block_dim); } else { // cpu - if (parallelize == 0) - parallelize = std::thread::hardware_concurrency(); + if (num_cpu_threads == 0) + num_cpu_threads = std::thread::hardware_concurrency(); } mem_access_opt = dec.mem_access_opt; dec.reset(); @@ -69,16 +69,16 @@ FrontendForStmt::FrontendForStmt(const Expr &loop_var, : begin(begin), end(end) { vectorize = dec.vectorize; bit_vectorize = dec.bit_vectorize; - parallelize = dec.parallelize; + num_cpu_threads = dec.num_cpu_threads; strictly_serialized = dec.strictly_serialized; block_dim = dec.block_dim; auto cfg = get_current_program().config; if (cfg.arch == Arch::cuda) { vectorize = 1; - parallelize = 1; + num_cpu_threads = 1; } else { - if (parallelize == 0) - parallelize = std::thread::hardware_concurrency(); + if (num_cpu_threads == 0) + num_cpu_threads = std::thread::hardware_concurrency(); } mem_access_opt = dec.mem_access_opt; dec.reset(); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index d532c8e02061e..2b410fda4c34a 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -123,7 +123,7 @@ class FrontendForStmt : public Stmt { std::vector loop_var_id; int vectorize; int bit_vectorize; - int parallelize; + int num_cpu_threads; bool strictly_serialized; MemoryAccessOptions mem_access_opt; int block_dim; diff --git a/taichi/ir/ir.cpp b/taichi/ir/ir.cpp index 82e2d8f7c89ce..586865419ead9 100644 --- a/taichi/ir/ir.cpp +++ b/taichi/ir/ir.cpp @@ -24,7 +24,7 @@ std::string snode_access_flag_name(SNodeAccessFlag type) { void DecoratorRecorder::reset() { vectorize = -1; bit_vectorize = -1; - parallelize = 0; + num_cpu_threads = 0; uniform = false; mem_access_opt.clear(); block_dim = 0; diff --git a/taichi/ir/ir.h b/taichi/ir/ir.h index 5d2c3eadfd7d5..4f75195df99d5 100644 --- a/taichi/ir/ir.h +++ b/taichi/ir/ir.h @@ -74,7 +74,7 @@ class DecoratorRecorder { public: int vectorize; int bit_vectorize; - int parallelize; + int num_cpu_threads; bool strictly_serialized; MemoryAccessOptions mem_access_opt; int block_dim; @@ -712,7 +712,7 @@ inline void BitVectorize(int v) { } inline void Parallelize(int v) { - dec.parallelize = v; + dec.num_cpu_threads = v; } inline void StrictlySerialize() { diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp index 798972c5a4eb5..25ef6188555a7 100644 --- a/taichi/ir/ir_builder.cpp +++ b/taichi/ir/ir_builder.cpp @@ -48,22 +48,22 @@ RangeForStmt *IRBuilder::create_range_for(Stmt *begin, Stmt *end, int vectorize, int bit_vectorize, - int parallelize, + int num_cpu_threads, int block_dim, bool strictly_serialized) { return insert(Stmt::make_typed( begin, end, std::make_unique(), vectorize, bit_vectorize, - parallelize, block_dim, strictly_serialized)); + num_cpu_threads, block_dim, strictly_serialized)); } StructForStmt *IRBuilder::create_struct_for(SNode *snode, int vectorize, int bit_vectorize, - int parallelize, + int num_cpu_threads, int block_dim) { return insert(Stmt::make_typed( - snode, std::make_unique(), vectorize, bit_vectorize, parallelize, - block_dim)); + snode, std::make_unique(), vectorize, bit_vectorize, + num_cpu_threads, block_dim)); } WhileStmt *IRBuilder::create_while_true() { diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h index c924eb8c3dd21..22ea0e50ca8ed 100644 --- a/taichi/ir/ir_builder.h +++ b/taichi/ir/ir_builder.h @@ -64,13 +64,13 @@ class IRBuilder { Stmt *end, int vectorize = -1, int bit_vectorize = -1, - int parallelize = 0, + int num_cpu_threads = 0, int block_dim = 0, bool strictly_serialized = false); StructForStmt *create_struct_for(SNode *snode, int vectorize = -1, int bit_vectorize = -1, - int parallelize = 0, + int num_cpu_threads = 0, int block_dim = 0); WhileStmt *create_while_true(); IfStmt *create_if(Stmt *cond); diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp index 17911b864e259..7b8083c007d88 100644 --- a/taichi/ir/statements.cpp +++ b/taichi/ir/statements.cpp @@ -224,7 +224,7 @@ RangeForStmt::RangeForStmt(Stmt *begin, std::unique_ptr &&body, int vectorize, int bit_vectorize, - int parallelize, + int num_cpu_threads, int block_dim, bool strictly_serialized) : begin(begin), @@ -232,7 +232,7 @@ RangeForStmt::RangeForStmt(Stmt *begin, body(std::move(body)), vectorize(vectorize), bit_vectorize(bit_vectorize), - parallelize(parallelize), + num_cpu_threads(num_cpu_threads), block_dim(block_dim), strictly_serialized(strictly_serialized) { reversed = false; @@ -242,7 +242,7 @@ RangeForStmt::RangeForStmt(Stmt *begin, std::unique_ptr RangeForStmt::clone() const { auto new_stmt = std::make_unique( - begin, end, body->clone(), vectorize, bit_vectorize, parallelize, + begin, end, body->clone(), vectorize, bit_vectorize, num_cpu_threads, block_dim, strictly_serialized); new_stmt->reversed = reversed; return new_stmt; @@ -252,21 +252,22 @@ StructForStmt::StructForStmt(SNode *snode, std::unique_ptr &&body, int vectorize, int bit_vectorize, - int parallelize, + int num_cpu_threads, int block_dim) : snode(snode), body(std::move(body)), vectorize(vectorize), bit_vectorize(bit_vectorize), - parallelize(parallelize), + num_cpu_threads(num_cpu_threads), block_dim(block_dim) { this->body->parent_stmt = this; TI_STMT_REG_FIELDS; } std::unique_ptr StructForStmt::clone() const { - auto new_stmt = std::make_unique( - snode, body->clone(), vectorize, bit_vectorize, parallelize, block_dim); + auto new_stmt = std::make_unique(snode, body->clone(), + vectorize, bit_vectorize, + num_cpu_threads, block_dim); new_stmt->mem_access_opt = mem_access_opt; return new_stmt; } diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 09ea4e37107fc..cd5c24717354e 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -551,7 +551,7 @@ class RangeForStmt : public Stmt { bool reversed; int vectorize; int bit_vectorize; - int parallelize; + int num_cpu_threads; int block_dim; bool strictly_serialized; @@ -560,7 +560,7 @@ class RangeForStmt : public Stmt { std::unique_ptr &&body, int vectorize, int bit_vectorize, - int parallelize, + int num_cpu_threads, int block_dim, bool strictly_serialized); @@ -579,7 +579,7 @@ class RangeForStmt : public Stmt { reversed, vectorize, bit_vectorize, - parallelize, + num_cpu_threads, block_dim, strictly_serialized); TI_DEFINE_ACCEPT @@ -595,7 +595,7 @@ class StructForStmt : public Stmt { std::vector index_offsets; int vectorize; int bit_vectorize; - int parallelize; + int num_cpu_threads; int block_dim; MemoryAccessOptions mem_access_opt; @@ -603,7 +603,7 @@ class StructForStmt : public Stmt { std::unique_ptr &&body, int vectorize, int bit_vectorize, - int parallelize, + int num_cpu_threads, int block_dim); bool is_container_statement() const override { @@ -616,7 +616,7 @@ class StructForStmt : public Stmt { index_offsets, vectorize, bit_vectorize, - parallelize, + num_cpu_threads, block_dim, mem_access_opt); TI_DEFINE_ACCEPT diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 7db44816a41fc..5d02a6319e684 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -699,7 +699,7 @@ void export_lang(py::module &m) { } }); // Schedules - m.def("parallelize", Parallelize); + m.def("num_cpu_threads", Parallelize); m.def("vectorize", Vectorize); m.def("bit_vectorize", BitVectorize); m.def("block_dim", BlockDim); diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index c6f09ecffde09..42b0183bff76a 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -209,7 +209,7 @@ class LowerAST : public IRVisitor { if (is_good_range_for) { auto &&new_for = std::make_unique( begin->stmt, end->stmt, std::move(stmt->body), stmt->vectorize, - stmt->bit_vectorize, stmt->parallelize, stmt->block_dim, + stmt->bit_vectorize, stmt->num_cpu_threads, stmt->block_dim, stmt->strictly_serialized); new_for->body->insert(std::make_unique(new_for.get(), 0), 0); @@ -293,7 +293,7 @@ class LowerAST : public IRVisitor { auto &&new_for = std::make_unique( snode, std::move(stmt->body), stmt->vectorize, stmt->bit_vectorize, - stmt->parallelize, stmt->block_dim); + stmt->num_cpu_threads, stmt->block_dim); new_for->index_offsets = offsets; VecStatement new_statements; for (int i = 0; i < (int)stmt->loop_var_id.size(); i++) { diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp index a4b3fa710ea68..970415ccbd09d 100644 --- a/taichi/transforms/offload.cpp +++ b/taichi/transforms/offload.cpp @@ -79,7 +79,7 @@ class Offloader { std::make_pair(offloaded.get(), s->end)); } offloaded->num_cpu_threads = - std::min(s->parallelize, + std::min(s->num_cpu_threads, root->get_kernel()->program.config.cpu_max_num_threads); replace_all_usages_with(s, s, offloaded.get()); for (int j = 0; j < (int)s->body->statements.size(); j++) { @@ -182,7 +182,7 @@ class Offloader { offloaded_struct_for->snode = for_stmt->snode; offloaded_struct_for->num_cpu_threads = - std::min(for_stmt->parallelize, program->config.cpu_max_num_threads); + std::min(for_stmt->num_cpu_threads, program->config.cpu_max_num_threads); offloaded_struct_for->mem_access_opt = mem_access_opt; root_block->insert(std::move(offloaded_struct_for)); From 11e21de042136ef762d173531e76bbe40644f6f6 Mon Sep 17 00:00:00 2001 From: Taichi Gardener Date: Fri, 2 Apr 2021 02:05:31 -0400 Subject: [PATCH 2/4] [skip ci] enforce code format --- taichi/python/export_lang.cpp | 52 ++++++++++++++++------------------- taichi/transforms/offload.cpp | 4 +-- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 5d02a6319e684..b076299a67765 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -95,9 +95,8 @@ void export_lang(py::module &m) { .def(py::self == py::self) .def("__hash__", &DataType::hash) .def("to_string", &DataType::to_string) - .def( - "get_ptr", [](DataType *dtype) -> Type * { return *dtype; }, - py::return_value_policy::reference) + .def("get_ptr", [](DataType *dtype) -> Type * { return *dtype; }, + py::return_value_policy::reference) .def(py::pickle( [](const DataType &dt) { // Note: this only works for primitive types, which is fine for now. @@ -195,10 +194,9 @@ void export_lang(py::module &m) { m.def("reset_default_compile_config", [&]() { default_compile_config = CompileConfig(); }); - m.def( - "default_compile_config", - [&]() -> CompileConfig & { return default_compile_config; }, - py::return_value_policy::reference); + m.def("default_compile_config", + [&]() -> CompileConfig & { return default_compile_config; }, + py::return_value_policy::reference); py::class_(m, "Program") .def(py::init<>()) @@ -215,12 +213,11 @@ void export_lang(py::module &m) { }) .def("print_memory_profiler_info", &Program::print_memory_profiler_info) .def("finalize", &Program::finalize) - .def( - "get_root", - [&](Program *program) -> SNode * { - return program->snode_root.get(); - }, - py::return_value_policy::reference) + .def("get_root", + [&](Program *program) -> SNode * { + return program->snode_root.get(); + }, + py::return_value_policy::reference) .def("get_total_compilation_time", &Program::get_total_compilation_time) .def("print_snode_tree", &Program::print_snode_tree) .def("get_snode_num_dynamically_allocated", @@ -235,10 +232,9 @@ void export_lang(py::module &m) { m.def("get_current_program", get_current_program, py::return_value_policy::reference); - m.def( - "current_compile_config", - [&]() -> CompileConfig & { return get_current_program().config; }, - py::return_value_policy::reference); + m.def("current_compile_config", + [&]() -> CompileConfig & { return get_current_program().config; }, + py::return_value_policy::reference); py::class_(m, "Index").def(py::init()); py::class_(m, "SNode") @@ -273,10 +269,9 @@ void export_lang(py::module &m) { .def("data_type", [](SNode *snode) { return snode->dt; }) .def("get_num_ch", [](SNode *snode) -> int { return (int)snode->ch.size(); }) - .def( - "get_ch", - [](SNode *snode, int i) -> SNode * { return snode->ch[i].get(); }, - py::return_value_policy::reference) + .def("get_ch", + [](SNode *snode, int i) -> SNode * { return snode->ch[i].get(); }, + py::return_value_policy::reference) .def("lazy_grad", [](SNode *snode) { make_lazy_grad(snode, @@ -375,14 +370,13 @@ void export_lang(py::module &m) { py::class_(m, "Stmt"); py::class_(m, "KernelProxy") - .def( - "define", - [](Program::KernelProxy *ker, - const std::function &func) -> Kernel & { - py::gil_scoped_release release; - return ker->def(func); - }, - py::return_value_policy::reference); + .def("define", + [](Program::KernelProxy *ker, + const std::function &func) -> Kernel & { + py::gil_scoped_release release; + return ker->def(func); + }, + py::return_value_policy::reference); m.def("insert_deactivate", [](SNode *snode, const ExprGroup &indices) { return Deactivate(snode, indices); diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp index 970415ccbd09d..e38867fc9ed96 100644 --- a/taichi/transforms/offload.cpp +++ b/taichi/transforms/offload.cpp @@ -181,8 +181,8 @@ class Offloader { } offloaded_struct_for->snode = for_stmt->snode; - offloaded_struct_for->num_cpu_threads = - std::min(for_stmt->num_cpu_threads, program->config.cpu_max_num_threads); + offloaded_struct_for->num_cpu_threads = std::min( + for_stmt->num_cpu_threads, program->config.cpu_max_num_threads); offloaded_struct_for->mem_access_opt = mem_access_opt; root_block->insert(std::move(offloaded_struct_for)); From a109a29c351653c69f0116396ef9ec05b5e8e67f Mon Sep 17 00:00:00 2001 From: xumingkuan Date: Fri, 2 Apr 2021 16:01:39 +0800 Subject: [PATCH 3/4] restore frontend API --- taichi/python/export_lang.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index b076299a67765..10b1467e4312b 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -693,7 +693,7 @@ void export_lang(py::module &m) { } }); // Schedules - m.def("num_cpu_threads", Parallelize); + m.def("parallelize", Parallelize); m.def("vectorize", Vectorize); m.def("bit_vectorize", BitVectorize); m.def("block_dim", BlockDim); From 2d41f86279a93d958c2ea50dafd647a0193b1e68 Mon Sep 17 00:00:00 2001 From: xumingkuan Date: Fri, 2 Apr 2021 16:04:08 +0800 Subject: [PATCH 4/4] code format? --- taichi/python/export_lang.cpp | 52 +++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 10b1467e4312b..7db44816a41fc 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -95,8 +95,9 @@ void export_lang(py::module &m) { .def(py::self == py::self) .def("__hash__", &DataType::hash) .def("to_string", &DataType::to_string) - .def("get_ptr", [](DataType *dtype) -> Type * { return *dtype; }, - py::return_value_policy::reference) + .def( + "get_ptr", [](DataType *dtype) -> Type * { return *dtype; }, + py::return_value_policy::reference) .def(py::pickle( [](const DataType &dt) { // Note: this only works for primitive types, which is fine for now. @@ -194,9 +195,10 @@ void export_lang(py::module &m) { m.def("reset_default_compile_config", [&]() { default_compile_config = CompileConfig(); }); - m.def("default_compile_config", - [&]() -> CompileConfig & { return default_compile_config; }, - py::return_value_policy::reference); + m.def( + "default_compile_config", + [&]() -> CompileConfig & { return default_compile_config; }, + py::return_value_policy::reference); py::class_(m, "Program") .def(py::init<>()) @@ -213,11 +215,12 @@ void export_lang(py::module &m) { }) .def("print_memory_profiler_info", &Program::print_memory_profiler_info) .def("finalize", &Program::finalize) - .def("get_root", - [&](Program *program) -> SNode * { - return program->snode_root.get(); - }, - py::return_value_policy::reference) + .def( + "get_root", + [&](Program *program) -> SNode * { + return program->snode_root.get(); + }, + py::return_value_policy::reference) .def("get_total_compilation_time", &Program::get_total_compilation_time) .def("print_snode_tree", &Program::print_snode_tree) .def("get_snode_num_dynamically_allocated", @@ -232,9 +235,10 @@ void export_lang(py::module &m) { m.def("get_current_program", get_current_program, py::return_value_policy::reference); - m.def("current_compile_config", - [&]() -> CompileConfig & { return get_current_program().config; }, - py::return_value_policy::reference); + m.def( + "current_compile_config", + [&]() -> CompileConfig & { return get_current_program().config; }, + py::return_value_policy::reference); py::class_(m, "Index").def(py::init()); py::class_(m, "SNode") @@ -269,9 +273,10 @@ void export_lang(py::module &m) { .def("data_type", [](SNode *snode) { return snode->dt; }) .def("get_num_ch", [](SNode *snode) -> int { return (int)snode->ch.size(); }) - .def("get_ch", - [](SNode *snode, int i) -> SNode * { return snode->ch[i].get(); }, - py::return_value_policy::reference) + .def( + "get_ch", + [](SNode *snode, int i) -> SNode * { return snode->ch[i].get(); }, + py::return_value_policy::reference) .def("lazy_grad", [](SNode *snode) { make_lazy_grad(snode, @@ -370,13 +375,14 @@ void export_lang(py::module &m) { py::class_(m, "Stmt"); py::class_(m, "KernelProxy") - .def("define", - [](Program::KernelProxy *ker, - const std::function &func) -> Kernel & { - py::gil_scoped_release release; - return ker->def(func); - }, - py::return_value_policy::reference); + .def( + "define", + [](Program::KernelProxy *ker, + const std::function &func) -> Kernel & { + py::gil_scoped_release release; + return ker->def(func); + }, + py::return_value_policy::reference); m.def("insert_deactivate", [](SNode *snode, const ExprGroup &indices) { return Deactivate(snode, indices);