From a4db06d70b124b3d96b0ff9de44c968245b5a2c5 Mon Sep 17 00:00:00 2001
From: Ye Kuang
Date: Sun, 23 Aug 2020 18:36:10 +0900
Subject: [PATCH] [async] Support constant folding in async mode

---
 python/taichi/lang/kernel.py        | 12 +++++++-----
 taichi/program/async_engine.cpp     |  2 +-
 taichi/program/kernel.cpp           |  2 +-
 taichi/program/program.cpp          | 24 ++++++++++++------------
 taichi/program/program.h            |  1 +
 taichi/transforms/constant_fold.cpp |  8 +++++++-
 tests/python/test_constant_fold.py  | 18 ++++++++++++++++++
 7 files changed, 47 insertions(+), 20 deletions(-)
 create mode 100644 tests/python/test_constant_fold.py

diff --git a/python/taichi/lang/kernel.py b/python/taichi/lang/kernel.py
index 9f2992c0acb18f..630c3fc5fa655a 100644
--- a/python/taichi/lang/kernel.py
+++ b/python/taichi/lang/kernel.py
@@ -461,16 +461,18 @@ def call_back():
 
             ret = None
             ret_dt = self.return_type
-            if ret_dt is not None:
+            has_ret = ret_dt is not None
+
+            if has_external_arrays or has_ret:
+                import taichi as ti
+                ti.sync()
+
+            if has_ret:
                 if id(ret_dt) in integer_type_ids:
                     ret = t_kernel.get_ret_int(0)
                 else:
                     ret = t_kernel.get_ret_float(0)
 
-            if has_external_arrays:
-                import taichi as ti
-                ti.sync()
-
             if callbacks:
                 for c in callbacks:
                     c()
diff --git a/taichi/program/async_engine.cpp b/taichi/program/async_engine.cpp
index 1729ccbbd7eb7a..bf549793ef1707 100644
--- a/taichi/program/async_engine.cpp
+++ b/taichi/program/async_engine.cpp
@@ -175,7 +175,7 @@ void ExecutionQueue::enqueue(KernelLaunchRecord &&ker) {
     auto config = kernel->program.config;
     auto ir = stmt;
     offload_to_executable(
-        ir, config, /*verbose=*/false,
+        ir, config, /*verbose=*/config.print_ir,
         /*lower_global_access=*/true,
         /*make_thread_local=*/true,
         /*make_block_local=*/
diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index 04ce79e1777cb3..931d813db1bb7a 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -94,7 +94,7 @@ void Kernel::lower(bool to_executable) {  // TODO: is a "Lowerer" class
 }
 
 void Kernel::operator()(LaunchContextBuilder &launch_ctx) {
-  if (!program.config.async_mode) {
+  if (!program.config.async_mode || this->is_evaluator) {
     if (!compiled) {
       compile();
     }
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index ae7dd90e111eac..7de2a8af4f9c24 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -468,16 +468,20 @@ void Program::synchronize() {
     if (config.async_mode) {
      async_engine->synchronize();
     }
-    if (config.arch == Arch::cuda) {
+    device_synchronize();
+    sync = true;
+  }
+}
+
+void Program::device_synchronize() {
+  if (config.arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-      CUDADriver::get_instance().stream_synchronize(nullptr);
+    CUDADriver::get_instance().stream_synchronize(nullptr);
 #else
-      TI_ERROR("No CUDA support");
+    TI_ERROR("No CUDA support");
 #endif
-    } else if (config.arch == Arch::metal) {
-      metal_kernel_mgr_->synchronize();
-    }
-    sync = true;
+  } else if (config.arch == Arch::metal) {
+    metal_kernel_mgr_->synchronize();
   }
 }
 
@@ -626,13 +630,9 @@ Kernel &Program::get_snode_writer(SNode *snode) {
 }
 
 uint64 Program::fetch_result_uint64(int i) {
+  // Precondition: caller must have already done a program synchronization.
   uint64 ret;
   auto arch = config.arch;
-  sync = false;
-  // Runtime calls that set result buffer don't execute sync=false, so we have
-  // to set it here otherwise synchronize() does nothing.
-  // TODO: systematically fix this.
-  synchronize();
   if (arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
     if (config.use_unified_memory) {
diff --git a/taichi/program/program.h b/taichi/program/program.h
index fe51742ee4197c..66a688cb669f49 100644
--- a/taichi/program/program.h
+++ b/taichi/program/program.h
@@ -138,6 +138,7 @@ class Program {
   void initialize_device_llvm_context();
 
   void synchronize();
+  void device_synchronize();
 
   void layout(std::function<void()> func) {
     func();
diff --git a/taichi/transforms/constant_fold.cpp b/taichi/transforms/constant_fold.cpp
index f3150c7d6ce7c6..bc9f0bdaeefb7f 100644
--- a/taichi/transforms/constant_fold.cpp
+++ b/taichi/transforms/constant_fold.cpp
@@ -121,6 +121,9 @@ class ConstantFold : public BasicStmtVisitor {
     launch_ctx.set_arg_raw(0, lhs.val_u64);
     launch_ctx.set_arg_raw(1, rhs.val_u64);
     (*ker)(launch_ctx);
+    // Constant folding kernel is always run in sync mode, therefore we call
+    // device_synchronize().
+    current_program.device_synchronize();
     ret.val_i64 = current_program.fetch_result(0);
     return true;
   }
@@ -143,6 +146,9 @@ class ConstantFold : public BasicStmtVisitor {
     auto launch_ctx = ker->make_launch_context();
     launch_ctx.set_arg_raw(0, operand.val_u64);
     (*ker)(launch_ctx);
+    // Constant folding kernel is always run in sync mode, therefore we call
+    // device_synchronize().
+    current_program.device_synchronize();
     ret.val_i64 = current_program.fetch_result(0);
     return true;
   }
@@ -240,7 +246,7 @@ bool constant_fold(IRNode *root) {
     TI_TRACE("config.debug enabled, ignoring constant fold");
     return false;
   }
-  if (!cfg.advanced_optimization || cfg.async_mode)
+  if (!cfg.advanced_optimization)
     return false;
   return ConstantFold::run(root);
 }
diff --git a/tests/python/test_constant_fold.py b/tests/python/test_constant_fold.py
new file mode 100644
index 00000000000000..b217ddca0ad9ed
--- /dev/null
+++ b/tests/python/test_constant_fold.py
@@ -0,0 +1,18 @@
+import taichi as ti
+
+
+@ti.test(arch=ti.cpu, async_mode=True)
+def test_constant_fold():
+    n = 100
+
+    @ti.kernel
+    def series() -> int:
+        s = 0
+        for i in ti.static(range(n)):
+            a = i + 1
+            s += a * a
+        return s
+
+    # \sum_{i=1}^n (i^2) = n * (n + 1) * (2n + 1) / 6
+    expected = n * (n + 1) * (2 * n + 1) // 6
+    assert series() == expected
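
Note: the snippet below is not part of the commit; it is a minimal sketch of the user-visible effect, assuming a CPU build of Taichi from this era, and the kernel name is illustrative only. With async_mode enabled, constant folding now stays active, and reading a kernel's return value performs an implicit ti.sync() first (see the kernel.py hunk), so the fetched result reflects the completed launch.

import taichi as ti

ti.init(arch=ti.cpu, async_mode=True)


@ti.kernel
def folded_sum() -> ti.i32:
    s = 0
    # The static loop is unrolled at compile time; the resulting constant
    # arithmetic is what the constant folding pass simplifies.
    for i in ti.static(range(8)):
        s += (i + 1) * (i + 1)
    return s


# Reading the return value syncs the async engine first, so the result
# is ready by the time it is fetched from the result buffer.
assert folded_sum() == 204  # 1^2 + 2^2 + ... + 8^2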