From a4db06d70b124b3d96b0ff9de44c968245b5a2c5 Mon Sep 17 00:00:00 2001
From: Ye Kuang
Date: Sun, 23 Aug 2020 18:36:10 +0900
Subject: [PATCH] [async] Support constant folding in async mode

---
 python/taichi/lang/kernel.py        | 12 +++++++-----
 taichi/program/async_engine.cpp     |  2 +-
 taichi/program/kernel.cpp           |  2 +-
 taichi/program/program.cpp          | 24 ++++++++++++------------
 taichi/program/program.h            |  1 +
 taichi/transforms/constant_fold.cpp |  8 +++++++-
 tests/python/test_constant_fold.py  | 18 ++++++++++++++++++
 7 files changed, 47 insertions(+), 20 deletions(-)
 create mode 100644 tests/python/test_constant_fold.py

diff --git a/python/taichi/lang/kernel.py b/python/taichi/lang/kernel.py
index 9f2992c0acb18f..630c3fc5fa655a 100644
--- a/python/taichi/lang/kernel.py
+++ b/python/taichi/lang/kernel.py
@@ -461,16 +461,18 @@ def call_back():
 
             ret = None
             ret_dt = self.return_type
-            if ret_dt is not None:
+            has_ret = ret_dt is not None
+
+            if has_external_arrays or has_ret:
+                import taichi as ti
+                ti.sync()
+
+            if has_ret:
                 if id(ret_dt) in integer_type_ids:
                     ret = t_kernel.get_ret_int(0)
                 else:
                     ret = t_kernel.get_ret_float(0)
 
-            if has_external_arrays:
-                import taichi as ti
-                ti.sync()
-
             if callbacks:
                 for c in callbacks:
                     c()
diff --git a/taichi/program/async_engine.cpp b/taichi/program/async_engine.cpp
index 1729ccbbd7eb7a..bf549793ef1707 100644
--- a/taichi/program/async_engine.cpp
+++ b/taichi/program/async_engine.cpp
@@ -175,7 +175,7 @@ void ExecutionQueue::enqueue(KernelLaunchRecord &&ker) {
     auto config = kernel->program.config;
     auto ir = stmt;
     offload_to_executable(
-        ir, config, /*verbose=*/false,
+        ir, config, /*verbose=*/config.print_ir,
         /*lower_global_access=*/true,
         /*make_thread_local=*/true,
         /*make_block_local=*/
diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index 04ce79e1777cb3..931d813db1bb7a 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -94,7 +94,7 @@ void Kernel::lower(bool to_executable) {  // TODO: is a "Lowerer" class
 }
 
 void Kernel::operator()(LaunchContextBuilder &launch_ctx) {
-  if (!program.config.async_mode) {
+  if (!program.config.async_mode || this->is_evaluator) {
     if (!compiled) {
       compile();
     }
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index ae7dd90e111eac..7de2a8af4f9c24 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -468,16 +468,20 @@ void Program::synchronize() {
     if (config.async_mode) {
      async_engine->synchronize();
     }
-    if (config.arch == Arch::cuda) {
+    device_synchronize();
+    sync = true;
+  }
+}
+
+void Program::device_synchronize() {
+  if (config.arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-      CUDADriver::get_instance().stream_synchronize(nullptr);
+    CUDADriver::get_instance().stream_synchronize(nullptr);
 #else
-      TI_ERROR("No CUDA support");
+    TI_ERROR("No CUDA support");
 #endif
-    } else if (config.arch == Arch::metal) {
-      metal_kernel_mgr_->synchronize();
-    }
-    sync = true;
+  } else if (config.arch == Arch::metal) {
+    metal_kernel_mgr_->synchronize();
   }
 }
 
@@ -626,13 +630,9 @@ Kernel &Program::get_snode_writer(SNode *snode) {
 }
 
 uint64 Program::fetch_result_uint64(int i) {
+  // Precondition: caller must have already done a program synchronization.
   uint64 ret;
   auto arch = config.arch;
-  sync = false;
-  // Runtime calls that set result buffer don't execute sync=false, so we have
-  // to set it here otherwise synchronize() does nothing.
-  // TODO: systematically fix this.
-  synchronize();
   if (arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
     if (config.use_unified_memory) {
diff --git a/taichi/program/program.h b/taichi/program/program.h
index fe51742ee4197c..66a688cb669f49 100644
--- a/taichi/program/program.h
+++ b/taichi/program/program.h
@@ -138,6 +138,7 @@ class Program {
   void initialize_device_llvm_context();
 
   void synchronize();
+  void device_synchronize();
 
   void layout(std::function<void()> func) {
     func();
diff --git a/taichi/transforms/constant_fold.cpp b/taichi/transforms/constant_fold.cpp
index f3150c7d6ce7c6..bc9f0bdaeefb7f 100644
--- a/taichi/transforms/constant_fold.cpp
+++ b/taichi/transforms/constant_fold.cpp
@@ -121,6 +121,9 @@ class ConstantFold : public BasicStmtVisitor {
     launch_ctx.set_arg_raw(0, lhs.val_u64);
     launch_ctx.set_arg_raw(1, rhs.val_u64);
     (*ker)(launch_ctx);
+    // Constant folding kernel is always run in sync mode, therefore we call
+    // device_synchronize().
+    current_program.device_synchronize();
     ret.val_i64 = current_program.fetch_result(0);
     return true;
   }
@@ -143,6 +146,9 @@ class ConstantFold : public BasicStmtVisitor {
     auto launch_ctx = ker->make_launch_context();
     launch_ctx.set_arg_raw(0, operand.val_u64);
     (*ker)(launch_ctx);
+    // Constant folding kernel is always run in sync mode, therefore we call
+    // device_synchronize().
+    current_program.device_synchronize();
     ret.val_i64 = current_program.fetch_result(0);
     return true;
   }
@@ -240,7 +246,7 @@ bool constant_fold(IRNode *root) {
     TI_TRACE("config.debug enabled, ignoring constant fold");
     return false;
   }
-  if (!cfg.advanced_optimization || cfg.async_mode)
+  if (!cfg.advanced_optimization)
     return false;
   return ConstantFold::run(root);
 }
diff --git a/tests/python/test_constant_fold.py b/tests/python/test_constant_fold.py
new file mode 100644
index 00000000000000..b217ddca0ad9ed
--- /dev/null
+++ b/tests/python/test_constant_fold.py
@@ -0,0 +1,18 @@
+import taichi as ti
+
+
+@ti.test(arch=ti.cpu, async_mode=True)
+def test_constant_fold():
+    n = 100
+
+    @ti.kernel
+    def series() -> int:
+        s = 0
+        for i in ti.static(range(n)):
+            a = i + 1
+            s += a * a
+        return s
+
+    # \sum_{i=1}^n (i^2) = n * (n + 1) * (2n + 1) / 6
+    expected = n * (n + 1) * (2 * n + 1) // 6
+    assert series() == expected
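
Note: the snippet below is not part of the commit; it is a minimal sketch of the user-visible effect, assuming a CPU build of Taichi from this era, and the kernel name is illustrative only. With async_mode enabled, constant folding now stays active, and reading a kernel's return value performs an implicit ti.sync() first (see the kernel.py hunk), so the fetched result reflects the completed launch.

import taichi as ti

ti.init(arch=ti.cpu, async_mode=True)


@ti.kernel
def folded_sum() -> ti.i32:
    s = 0
    # The static loop is unrolled at compile time; the resulting constant
    # arithmetic is what the constant folding pass simplifies.
    for i in ti.static(range(8)):
        s += (i + 1) * (i + 1)
    return s


# Reading the return value syncs the async engine first, so the result
# is ready by the time it is fetched from the result buffer.
assert folded_sum() == 204  # 1^2 + 2^2 + ... + 8^2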