[Lang] Add config.force_scalarize_matrix to avoid perf-regression in certain scenarios (#8509)

Issue: # ### Brief Summary copilot:summary ### Walkthrough copilot:walkthrough
taichi-dev · Apr 18, 2024 · 0da6846 · 0da6846
1 parent 52b24f3
commit 0da6846
Show file tree

Hide file tree

Showing 8 changed files with 60 additions and 16 deletions.
diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp
@@ -78,6 +78,7 @@ static std::vector<std::uint8_t> get_offline_cache_key_of_compile_config(
   serializer(config.experimental_auto_mesh_local);
   serializer(config.auto_mesh_local_default_occupacy);
   serializer(config.real_matrix_scalarize);
+  serializer(config.force_scalarize_matrix);
   serializer(config.half2_vectorization);
   serializer.finalize();
 

diff --git a/taichi/codegen/codegen_utils.h b/taichi/codegen/codegen_utils.h
@@ -5,7 +5,7 @@
 namespace taichi::lang {
 
 inline bool codegen_vector_type(const CompileConfig &config) {
-  return !config.real_matrix_scalarize;
+  return !(config.real_matrix_scalarize || config.force_scalarize_matrix);
 }
 
 // Parses a C-style printf format string specifier into its constituent parts.

diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
@@ -43,6 +43,7 @@ CompileConfig::CompileConfig() {
   make_block_local = true;
   detect_read_only = true;
   real_matrix_scalarize = true;
+  force_scalarize_matrix = false;
   half2_vectorization = false;
   make_cpu_multithreading_loop = true;
 

diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
@@ -39,6 +39,7 @@ struct CompileConfig {
   bool make_block_local;
   bool detect_read_only;
   bool real_matrix_scalarize;
+  bool force_scalarize_matrix;
   bool half2_vectorization;
   bool make_cpu_multithreading_loop;
   DataType default_fp;

diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
@@ -218,6 +218,8 @@ void export_lang(py::module &m) {
       .def_readwrite("detect_read_only", &CompileConfig::detect_read_only)
       .def_readwrite("real_matrix_scalarize",
                      &CompileConfig::real_matrix_scalarize)
+      .def_readwrite("force_scalarize_matrix",
+                     &CompileConfig::force_scalarize_matrix)
       .def_readwrite("half2_vectorization", &CompileConfig::half2_vectorization)
       .def_readwrite("make_cpu_multithreading_loop",
                      &CompileConfig::make_cpu_multithreading_loop)

diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp
@@ -82,8 +82,13 @@ class IndependentBlocksJudger : public BasicStmtVisitor {
     if (is_inside_loop_)
       return;
 
-    if (stmt->dest->is<ExternalPtrStmt>()) {
-      if (stmt->dest->as<ExternalPtrStmt>()
+    Stmt *dest = stmt->dest;
+    if (dest->is<MatrixPtrStmt>()) {
+      dest = dest->as<MatrixPtrStmt>()->origin;
+    }
+
+    if (dest->is<ExternalPtrStmt>()) {
+      if (dest->as<ExternalPtrStmt>()
               ->base_ptr->as<ArgLoadStmt>()
               ->ret_type.ptr_removed()
               ->as<StructType>()
@@ -92,8 +97,8 @@ class IndependentBlocksJudger : public BasicStmtVisitor {
         qualified_glb_operations_ = true;
       }
     } else {
-      TI_ASSERT(stmt->dest->is<GlobalPtrStmt>());
-      if (stmt->dest->as<GlobalPtrStmt>()->snode->has_adjoint()) {
+      TI_ASSERT(dest->is<GlobalPtrStmt>());
+      if (dest->as<GlobalPtrStmt>()->snode->has_adjoint()) {
         qualified_glb_operations_ = true;
       }
     }
@@ -108,15 +113,21 @@ class IndependentBlocksJudger : public BasicStmtVisitor {
     // another IndependentBlocksJudger
     if (is_inside_loop_)
       return;
-    if ((stmt->src->is<ExternalPtrStmt>() &&
-         stmt->src->as<ExternalPtrStmt>()
+
+    Stmt *src = stmt->src;
+    if (src->is<MatrixPtrStmt>()) {
+      src = src->as<MatrixPtrStmt>()->origin;
+    }
+
+    if ((src->is<ExternalPtrStmt>() &&
+         src->as<ExternalPtrStmt>()
                  ->base_ptr->as<ArgLoadStmt>()
                  ->ret_type.ptr_removed()
                  ->as<StructType>()
                  ->elements()
                  .size() > TypeFactory::GRAD_PTR_POS_IN_NDARRAY) ||
-        (stmt->src->is<GlobalPtrStmt>() &&
-         stmt->src->as<GlobalPtrStmt>()->snode->has_adjoint())) {
+        (src->is<GlobalPtrStmt>() &&
+         src->as<GlobalPtrStmt>()->snode->has_adjoint())) {
       qualified_glb_operations_ = true;
     }
   }
@@ -2425,7 +2436,13 @@ class GloablDataAccessRuleChecker : public BasicStmtVisitor {
   using BasicStmtVisitor::visit;
 
   void visit(GlobalLoadStmt *stmt) override {
-    GlobalPtrStmt *src = stmt->src->as<GlobalPtrStmt>();
+    GlobalPtrStmt *src = nullptr;
+    if (stmt->src->is<GlobalPtrStmt>()) {
+      src = stmt->src->as<GlobalPtrStmt>();
+    } else {
+      TI_ASSERT(stmt->src->is<MatrixPtrStmt>());
+      src = stmt->src->as<MatrixPtrStmt>()->origin->as<GlobalPtrStmt>();
+    }
     auto snode = src->snode;
     if (!snode->has_adjoint_checkbit()) {
       return;
@@ -2466,12 +2483,24 @@ class GloablDataAccessRuleChecker : public BasicStmtVisitor {
   }
 
   void visit(GlobalStoreStmt *stmt) override {
-    GlobalPtrStmt *dest = stmt->dest->as<GlobalPtrStmt>();
+    GlobalPtrStmt *dest = nullptr;
+    if (stmt->dest->is<GlobalPtrStmt>()) {
+      dest = stmt->dest->as<GlobalPtrStmt>();
+    } else {
+      TI_ASSERT(stmt->dest->is<MatrixPtrStmt>());
+      dest = stmt->dest->as<MatrixPtrStmt>()->origin->as<GlobalPtrStmt>();
+    }
     visit_gloabl_store_stmt_and_atomic_add(stmt, dest);
   }
 
   void visit(AtomicOpStmt *stmt) override {
-    GlobalPtrStmt *dest = stmt->dest->as<GlobalPtrStmt>();
+    GlobalPtrStmt *dest = nullptr;
+    if (stmt->dest->is<GlobalPtrStmt>()) {
+      dest = stmt->dest->as<GlobalPtrStmt>();
+    } else {
+      TI_ASSERT(stmt->dest->is<MatrixPtrStmt>());
+      dest = stmt->dest->as<MatrixPtrStmt>()->origin->as<GlobalPtrStmt>();
+    }
     visit_gloabl_store_stmt_and_atomic_add(stmt, dest);
   }
 

diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
@@ -86,6 +86,10 @@ void compile_to_offloads(IRNode *ir,
     irpass::analysis::gather_meshfor_relation_types(ir);
   }
 
+  if (config.force_scalarize_matrix) {
+    irpass::scalarize(ir, false /*half2_optimization_enabled*/);
+  }
+
   if (config.debug && autodiff_mode == AutodiffMode::kCheckAutodiffValid) {
     // Check whether the kernel obeys the autodiff limitation e.g., gloabl data
     // access rule
@@ -136,8 +140,9 @@ void compile_to_offloads(IRNode *ir,
   // TODO: This pass may be redundant as cfg_optimization() is already called
   //  in full_simplify().
   if (config.opt_level > 0 && config.cfg_optimization) {
-    irpass::cfg_optimization(ir, false, /*autodiff_enabled*/ false,
-                             !config.real_matrix_scalarize);
+    irpass::cfg_optimization(
+        ir, false, /*autodiff_enabled*/ false,
+        !config.real_matrix_scalarize && !config.force_scalarize_matrix);
     print("Optimized by CFG");
     irpass::analysis::verify(ir);
   }
@@ -371,6 +376,10 @@ void compile_function(IRNode *ir,
     func->set_ir_stage(Function::IRStage::BeforeLowerAccess);
   }
 
+  if (config.force_scalarize_matrix) {
+    irpass::scalarize(ir, false /*half2_optimization_enabled*/);
+  }
+
   if (target_stage >= Function::IRStage::OptimizedIR &&
       current_stage < Function::IRStage::OptimizedIR) {
     irpass::lower_access(ir, config, {{}, true});

diff --git a/taichi/transforms/simplify.cpp b/taichi/transforms/simplify.cpp
@@ -564,8 +564,9 @@ void full_simplify(IRNode *root,
       // Don't do this time-consuming optimization pass again if the IR is
       // not modified.
       if (config.opt_level > 0 && first_iteration && config.cfg_optimization &&
-          cfg_optimization(root, args.after_lower_access, args.autodiff_enabled,
-                           !config.real_matrix_scalarize))
+          cfg_optimization(
+              root, args.after_lower_access, args.autodiff_enabled,
+              !config.real_matrix_scalarize && !config.force_scalarize_matrix))
         modified = true;
       print("cfg_optimization");
       first_iteration = false;