From 810c17d8c4405b79d21302e14e9de7797c4c448b Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 10:37:36 +0800 Subject: [PATCH 1/7] [Lang] Migrate irpass::scalarize() after irpass::make_block_local() --- taichi/transforms/compile_to_offloads.cpp | 10 ++++---- taichi/transforms/make_block_local.cpp | 31 +++++++++++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index f231167e952a3..5a95b78652ff6 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -226,6 +226,11 @@ void offload_to_executable(IRNode *ir, } } + if (make_block_local) { + irpass::make_block_local(ir, config, {kernel->get_name()}); + print("Make block local"); + } + if (config.real_matrix_scalarize) { irpass::scalarize(ir); @@ -234,11 +239,6 @@ void offload_to_executable(IRNode *ir, print("Scalarized"); } - if (make_block_local) { - irpass::make_block_local(ir, config, {kernel->get_name()}); - print("Make block local"); - } - if (is_extension_supported(config.arch, Extension::mesh)) { irpass::demote_mesh_statements(ir, config, {kernel->get_name()}); print("Demote mesh statements"); diff --git a/taichi/transforms/make_block_local.cpp b/taichi/transforms/make_block_local.cpp index 314b1b418cbf5..c569567cb1a41 100644 --- a/taichi/transforms/make_block_local.cpp +++ b/taichi/transforms/make_block_local.cpp @@ -15,6 +15,37 @@ void make_block_local_offload(OffloadedStmt *offload, if (offload->task_type != OffloadedStmt::TaskType::struct_for) return; + bool is_bls_applicable = + offload->mem_access_opt.get_snodes_with_flag(SNodeAccessFlag::block_local) + .size() > 0; + if (!is_bls_applicable) { + return; + } + + /* + [TensorType TODO #2] + In general, BLS is trying to analyze and replace load/store of + loop-specific GlobalPtrStmt(..., index) with load/store of a cross-loop + BlockLocalPtrStmt. 
This requires heavy analysis upon dependencies between + index of GlobalPtrStmt and the loop index. + + In the case where GlobalPtrStmt's index is a TensorType stored in an + AllocaStmt, the analysis will fail due to the complexity of address + aliasing. Therefore we apply scalarize here to facilitate this analysis + + [Example] + $1 = loop $0 index 0 + <[Tensor (1) i32]> $3 = [$1] + ... + <[Tensor (1) i32]> $12 = alloca + <[Tensor (1) i32]> $13 : local store [$12 <- $3] + <*i32> $14 = shift ptr [$12 + $4] + $15 = local load [$14] + <*i32> $16 = global ptr [S5place], index [$15] activate=true + */ + irpass::scalarize(offload); + irpass::full_simplify(offload, config, {false, /*autodiff_enabled*/ false}); + bool debug = config.debug; auto pads = irpass::initialize_scratch_pad(offload); From f4a45c08723bdb345f0f5703939539b866d5e683 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 11:15:28 +0800 Subject: [PATCH 2/7] bug fix --- taichi/ir/transforms.h | 2 +- taichi/transforms/compile_to_offloads.cpp | 20 +++++----- taichi/transforms/make_block_local.cpp | 25 ++++++++++++- taichi/transforms/scalarize.cpp | 45 +++++++++++++++-------- 4 files changed, 64 insertions(+), 28 deletions(-) diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h index 6bd06cb6df9ab..95b4d29cc5764 100644 --- a/taichi/ir/transforms.h +++ b/taichi/ir/transforms.h @@ -30,7 +30,7 @@ namespace irpass { void re_id(IRNode *root); void flag_access(IRNode *root); void eliminate_immutable_local_vars(IRNode *root); -void scalarize(IRNode *root); +bool scalarize(IRNode *root); void vectorize_half2(IRNode *root); void lower_matrix_ptr(IRNode *root); bool die(IRNode *root); diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 5a95b78652ff6..83acbcd3aa9c3 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -232,11 +232,11 @@ void offload_to_executable(IRNode *ir, } if
(config.real_matrix_scalarize) { - irpass::scalarize(ir); - - // Remove redundant MatrixInitStmt inserted during scalarization - irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); - print("Scalarized"); + if (irpass::scalarize(ir)) { + // Remove redundant MatrixInitStmt inserted during scalarization + irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); + print("Scalarized"); + } } if (is_extension_supported(config.arch, Extension::mesh)) { @@ -356,11 +356,11 @@ void compile_function(IRNode *ir, } if (config.real_matrix_scalarize) { - irpass::scalarize(ir); - - // Remove redundant MatrixInitStmt inserted during scalarization - irpass::die(ir); - print("Scalarized"); + if (irpass::scalarize(ir)) { + // Remove redundant MatrixInitStmt inserted during scalarization + irpass::die(ir); + print("Scalarized"); + } } irpass::lower_access(ir, config, {{}, true}); diff --git a/taichi/transforms/make_block_local.cpp b/taichi/transforms/make_block_local.cpp index c569567cb1a41..f43a790ed7bd4 100644 --- a/taichi/transforms/make_block_local.cpp +++ b/taichi/transforms/make_block_local.cpp @@ -9,6 +9,24 @@ namespace taichi::lang { namespace { +std::function +make_pass_printer(bool verbose, const std::string &kernel_name, IRNode *ir) { + if (!verbose) { + return [](const std::string &) {}; + } + return [ir, kernel_name](const std::string &pass) { + TI_INFO("[{}] {}:", kernel_name, pass); + std::cout << std::flush; + irpass::re_id(ir); + irpass::print(ir); + std::cout << std::flush; + }; +} + +} // namespace + +namespace { + void make_block_local_offload(OffloadedStmt *offload, const CompileConfig &config, const std::string &kernel_name) { @@ -22,6 +40,8 @@ void make_block_local_offload(OffloadedStmt *offload, return; } + auto print = make_pass_printer(true, "asdasdasd", offload); + /* [TensorType TODO #2] In general, BLS is trying to analyze and replace load/store of @@ -43,8 +63,9 @@ void make_block_local_offload(OffloadedStmt *offload, $15 
= local load [$14] <*i32> $16 = global ptr [S5place], index [$15] activate=true */ - irpass::scalarize(offload); - irpass::full_simplify(offload, config, {false, /*autodiff_enabled*/ false}); + if (irpass::scalarize(offload)) { + irpass::full_simplify(offload, config, {false, /*autodiff_enabled*/ false}); + } bool debug = config.debug; diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp index f1526ff14130b..d4719011e4f1f 100644 --- a/taichi/transforms/scalarize.cpp +++ b/taichi/transforms/scalarize.cpp @@ -18,9 +18,6 @@ class Scalarize : public BasicStmtVisitor { DelayedIRModifier delayed_modifier_; explicit Scalarize(IRNode *node) : immediate_modifier_(node) { - node->accept(this); - - delayed_modifier_.modify_ir(); } /* @@ -841,6 +838,12 @@ class Scalarize : public BasicStmtVisitor { } } + static bool run(IRNode *node) { + Scalarize pass(node); + node->accept(&pass); + return pass.delayed_modifier_.modify_ir(); + } + private: using BasicStmtVisitor::visit; std::unordered_map> scalarized_ad_stack_map_; @@ -898,9 +901,6 @@ class ScalarizePointers : public BasicStmtVisitor { IRNode *node, const std::unordered_set &scalarizable_allocas) : immediate_modifier_(node), scalarizable_allocas_(scalarizable_allocas) { - node->accept(this); - - delayed_modifier_.modify_ir(); } /* @@ -1041,6 +1041,13 @@ class ScalarizePointers : public BasicStmtVisitor { } } + static bool run(IRNode *node, + const std::unordered_set &scalarizable_allocas) { + ScalarizePointers pass(node, scalarizable_allocas); + node->accept(&pass); + return pass.delayed_modifier_.modify_ir(); + } + private: using BasicStmtVisitor::visit; }; @@ -1086,8 +1093,6 @@ class ExtractLocalPointers : public BasicStmtVisitor { TI_ASSERT(root->is()); top_level_ = root->as(); } - root->accept(this); - delayed_modifier_.modify_ir(); } void visit(OffloadedStmt *stmt) override { @@ -1124,6 +1129,12 @@ class ExtractLocalPointers : public BasicStmtVisitor { } } + static bool run(IRNode *node) { + 
ExtractLocalPointers pass(node); + node->accept(&pass); + return pass.delayed_modifier_.modify_ir(); + } + private: using BasicStmtVisitor::visit; }; @@ -1172,22 +1183,26 @@ class MergeExternalAndMatrixPtr : public BasicStmtVisitor { } } - static void run(IRNode *node) { + static bool run(IRNode *node) { MergeExternalAndMatrixPtr pass; node->accept(&pass); - pass.modifier_.modify_ir(); + return pass.modifier_.modify_ir(); } }; namespace irpass { -void scalarize(IRNode *root) { +bool scalarize(IRNode *root) { TI_AUTO_PROF; - Scalarize scalarize_pass(root); + bool modified = false; + + modified = Scalarize::run(root); auto scalarizable_allocas = GatherScalarizableLocalPointers::run(root); - ScalarizePointers scalarize_pointers_pass(root, scalarizable_allocas); - ExtractLocalPointers extract_pointers_pass(root); - MergeExternalAndMatrixPtr::run(root); + modified = ScalarizePointers::run(root, scalarizable_allocas); + modified = ExtractLocalPointers::run(root); + modified = MergeExternalAndMatrixPtr::run(root); + + return modified; } } // namespace irpass From d7541410f2247da2225ef06c356fdc72df6e14c9 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 11:17:35 +0800 Subject: [PATCH 3/7] [Lang] Migrate irpass::scalarize() after irpass::lower_access() --- taichi/transforms/compile_to_offloads.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 83acbcd3aa9c3..48b6f086a0f78 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -231,14 +231,6 @@ void offload_to_executable(IRNode *ir, print("Make block local"); } - if (config.real_matrix_scalarize) { - if (irpass::scalarize(ir)) { - // Remove redundant MatrixInitStmt inserted during scalarization - irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); - print("Scalarized"); - } - } - if 
(is_extension_supported(config.arch, Extension::mesh)) { irpass::demote_mesh_statements(ir, config, {kernel->get_name()}); print("Demote mesh statements"); @@ -276,6 +268,14 @@ void offload_to_executable(IRNode *ir, irpass::analysis::verify(ir); } + if (config.real_matrix_scalarize) { + if (irpass::scalarize(ir)) { + // Remove redundant MatrixInitStmt inserted during scalarization + irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); + print("Scalarized"); + } + } + irpass::demote_operations(ir, config); print("Operations demoted"); @@ -363,7 +363,7 @@ void compile_function(IRNode *ir, } } - irpass::lower_access(ir, config, {{}, true}); + ipass::lower_access(ir, config, {{}, true}); print("Access lowered"); irpass::analysis::verify(ir); From 660819192b4a0e3719939d6df31bf274e2dcba26 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 11:27:36 +0800 Subject: [PATCH 4/7] bug fix --- taichi/transforms/compile_to_offloads.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 48b6f086a0f78..db8b7f75e87e4 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -363,7 +363,7 @@ void compile_function(IRNode *ir, } } - ipass::lower_access(ir, config, {{}, true}); + irpass::lower_access(ir, config, {{}, true}); print("Access lowered"); irpass::analysis::verify(ir); From 7d6a954836340a351cd41642161139d4fd166ba5 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 11:53:47 +0800 Subject: [PATCH 5/7] bug fix --- taichi/transforms/make_block_local.cpp | 20 -------------------- taichi/transforms/scalarize.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/taichi/transforms/make_block_local.cpp b/taichi/transforms/make_block_local.cpp index f43a790ed7bd4..c2e89d6b2805f 100644 --- a/taichi/transforms/make_block_local.cpp +++ 
b/taichi/transforms/make_block_local.cpp @@ -9,24 +9,6 @@ namespace taichi::lang { namespace { -std::function -make_pass_printer(bool verbose, const std::string &kernel_name, IRNode *ir) { - if (!verbose) { - return [](const std::string &) {}; - } - return [ir, kernel_name](const std::string &pass) { - TI_INFO("[{}] {}:", kernel_name, pass); - std::cout << std::flush; - irpass::re_id(ir); - irpass::print(ir); - std::cout << std::flush; - }; -} - -} // namespace - -namespace { - void make_block_local_offload(OffloadedStmt *offload, const CompileConfig &config, const std::string &kernel_name) { @@ -40,8 +22,6 @@ void make_block_local_offload(OffloadedStmt *offload, return; } - auto print = make_pass_printer(true, "asdasdasd", offload); - /* [TensorType TODO #2] In general, BLS is trying to analyze and replace load/store of diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp index d4719011e4f1f..7e84c6bd40121 100644 --- a/taichi/transforms/scalarize.cpp +++ b/taichi/transforms/scalarize.cpp @@ -1196,11 +1196,11 @@ bool scalarize(IRNode *root) { TI_AUTO_PROF; bool modified = false; - modified = Scalarize::run(root); + modified |= Scalarize::run(root); auto scalarizable_allocas = GatherScalarizableLocalPointers::run(root); - modified = ScalarizePointers::run(root, scalarizable_allocas); - modified = ExtractLocalPointers::run(root); - modified = MergeExternalAndMatrixPtr::run(root); + modified |= ScalarizePointers::run(root, scalarizable_allocas); + modified |= ExtractLocalPointers::run(root); + modified |= MergeExternalAndMatrixPtr::run(root); return modified; } From b211db53be3cfafbf332ebae9cdc95c007432d30 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 12:37:58 +0800 Subject: [PATCH 6/7] bug fix --- taichi/transforms/compile_to_offloads.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 
db8b7f75e87e4..a5e686cad9386 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -271,7 +271,8 @@ void offload_to_executable(IRNode *ir, if (config.real_matrix_scalarize) { if (irpass::scalarize(ir)) { // Remove redundant MatrixInitStmt inserted during scalarization - irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); + irpass::full_simplify(ir, config, + {lower_global_access, /*autodiff_enabled*/ false}); print("Scalarized"); } } From 99130d01066c01f1e90597f25558aefaf965bf3e Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Mon, 29 May 2023 15:22:16 +0800 Subject: [PATCH 7/7] code adjustment --- taichi/transforms/compile_to_offloads.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 6c3a6acda3030..a5e686cad9386 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -231,14 +231,6 @@ void offload_to_executable(IRNode *ir, print("Make block local"); } - if (config.real_matrix_scalarize) { - if (irpass::scalarize(ir)) { - // Remove redundant MatrixInitStmt inserted during scalarization - irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false}); - print("Scalarized"); - } - } - if (is_extension_supported(config.arch, Extension::mesh)) { irpass::demote_mesh_statements(ir, config, {kernel->get_name()}); print("Demote mesh statements");