From e0ca120e2068b91fec153c8e173e776fd6d2b975 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 15:09:39 +0800
Subject: [PATCH 1/9] [opengl] Prototype glDispatchComputeIndirect for dynamic range-for

---
 con.py                                    | 23 +++++++++++++
 mrp.py                                    | 18 ++++++++++
 taichi/backends/opengl/codegen_opengl.cpp |  3 +-
 taichi/backends/opengl/opengl_api.cpp     | 41 +++++++++++++++++++----
 taichi/backends/opengl/opengl_api.h       |  2 ++
 taichi/backends/opengl/shaders/runtime.h  |  6 ++++
 6 files changed, 85 insertions(+), 8 deletions(-)
 create mode 100644 con.py
 create mode 100644 mrp.py

diff --git a/con.py b/con.py
new file mode 100644
index 0000000000000..4bbb18ac90333
--- /dev/null
+++ b/con.py
@@ -0,0 +1,23 @@
+import taichi as ti
+import timeit
+
+ti.init(ti.opengl, log_level=ti.DEBUG)
+
+N = 2**14
+
+a = ti.field(int, 4)
+x = ti.field(int, N)
+
+@ti.kernel
+def indirect():
+    for i in range(a[3]):
+        x[i] = i + 1
+
+a[0] = 128
+a[1] = 1
+a[2] = 1
+a[3] = N
+stmt = lambda: indirect()
+print(timeit.timeit(stmt, stmt, number=10000))
+print(x)
+print(a)
diff --git a/mrp.py b/mrp.py
new file mode 100644
index 0000000000000..03a72e7002c2e
--- /dev/null
+++ b/mrp.py
@@ -0,0 +1,18 @@
+import taichi as ti
+import timeit
+
+ti.init(ti.opengl)
+
+N = 2**14
+
+x = ti.field(int, N)
+
+@ti.kernel
+def func():
+    for i in x:
+        x[i] = i
+    for i in range(x[2]):
+        x[i] = i
+
+stmt = lambda: func()
+print(timeit.timeit(stmt, stmt, number=10000))
diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp
index 7cd6f85f50b72..c7d840ad3004b 100644
--- a/taichi/backends/opengl/codegen_opengl.cpp
+++ b/taichi/backends/opengl/codegen_opengl.cpp
@@ -796,7 +796,6 @@ class KernelGen : public IRVisitor {
       ScopedGridStrideLoop _gsl(this);
       emit("if (_sid >= {}) {};", end_value - begin_value, get_return_stmt());
       emit("int _itv = {} + _sid * {};", begin_value, 1 /* stmt->step? */);
-      stmt->body->accept(this);
     } else {
       ScopedIndent _s(line_appender_);
       emit("// range known at runtime");
@@ -812,8 +811,8 @@ class KernelGen : public IRVisitor {
       emit("int _beg = {}, _end = {};", begin_expr, end_expr);
       emit("int _itv = _beg + _sid;");
       emit("if (_itv >= _end) {};", get_return_stmt());
-      stmt->body->accept(this);
     }
+    stmt->body->accept(this);
 
     if (used_tls) {
       TI_ASSERT(stmt->tls_epilogue != nullptr);
diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index 9cb437b6e3e42..486b63f06f87b 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -209,6 +209,10 @@ struct GLSSBO {
     check_opengl_error("glBindBufferRange");
   }
 
+  void as_indirect_buffer() {
+    glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, id_);
+  }
+
   void *map(size_t offset,
             size_t length,
             GLbitfield access = GL_READ_ONLY) const {
@@ -309,6 +313,14 @@ struct GLSLLauncherImpl {
 ParallelSize::~ParallelSize() {
 }
 
+bool ParallelSize::is_indirect() const {
+  return false;
+}
+
+bool ParallelSize_DynamicRange::is_indirect() const {
+  return true;
+}
+
 size_t ParallelSize::get_threads_per_block() const {
   size_t limit = opengl_threads_per_block;
   size_t n = threads_per_block.value_or(0);
@@ -483,10 +495,6 @@ struct CompiledKernel {
   }
 
   void dispatch_compute(GLSLLauncher *launcher) const {
-    int num_blocks = ps->get_num_blocks(launcher);
-
-    glsl->use();
-
     // https://www.khronos.org/opengl/wiki/Compute_Shader
     // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
     // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf
@@ -494,8 +502,21 @@ struct CompiledKernel {
     // `glDispatchCompute(X, Y, Z)`   - the X*Y*Z  == `Blocks`   in CUDA
     // `layout(local_size_x = X) in;` - the X      == `Threads`  in CUDA
     //
-    glDispatchCompute(num_blocks, 1, 1);
-    check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
+    if (/*!taichi::starts_with(kernel_name, "indirect_") && */!ps->is_indirect()) {
+      int num_blocks = ps->get_num_blocks(launcher);
+      glsl->use();
+      glDispatchCompute(num_blocks, 1, 1);
+      check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
+
+    } else {
+      //auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
+      //runtime->as_indirect_buffer();
+      auto root = launcher->impl->core_bufs.get(GLBufId::Root);
+      root->as_indirect_buffer();
+      glsl->use();
+      glDispatchComputeIndirect(0);  // offset of runtime.indirect_x is 0
+      check_opengl_error(fmt::format("glDispatchComputeIndirect"));
+    }
 
     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
     check_opengl_error("glMemoryBarrier");
@@ -723,6 +744,14 @@ bool initialize_opengl(bool error_tolerance) {
 ParallelSize::~ParallelSize() {
 }
 
+bool ParallelSize::is_indirect() const {
+  TI_NOT_IMPLEMENTED;
+}
+
+bool ParallelSize_DynamicRange::is_indirect() const {
+  TI_NOT_IMPLEMENTED;
+}
+
 size_t ParallelSize::get_num_threads(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h
index 4e46766992b98..66be118447bac 100644
--- a/taichi/backends/opengl/opengl_api.h
+++ b/taichi/backends/opengl/opengl_api.h
@@ -44,6 +44,7 @@ class ParallelSize {
   std::optional<size_t> strides_per_thread;
   std::optional<size_t> threads_per_block;
 
+  virtual bool is_indirect() const;
   virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0;
   size_t get_num_threads(GLSLLauncher *launcher) const;
   size_t get_num_blocks(GLSLLauncher *launcher) const;
@@ -71,6 +72,7 @@ class ParallelSize_DynamicRange : public ParallelSize {
   ParallelSize_DynamicRange(OffloadedStmt *stmt);
   virtual size_t get_num_strides(GLSLLauncher *launcher) const override;
   virtual ~ParallelSize_DynamicRange() override = default;
+  virtual bool is_indirect() const override;
 };
 
 class ParallelSize_StructFor : public ParallelSize {
diff --git a/taichi/backends/opengl/shaders/runtime.h b/taichi/backends/opengl/shaders/runtime.h
index c717bfbb6bad6..56f2e47b11f7e 100644
--- a/taichi/backends/opengl/shaders/runtime.h
+++ b/taichi/backends/opengl/shaders/runtime.h
@@ -16,6 +16,9 @@ struct _msg_entry_t {
 };
 
 layout(std430, binding = 6) buffer runtime {
+  int _indirect_x_;
+  int _indirect_y_;
+  int _indirect_z_;
   int _rand_state_;
   int _msg_count_;
   // TODO: move msg buf to gtmp
@@ -46,6 +49,9 @@ struct GLSLMsgEntry {
 };
 
 struct GLSLRuntime {
+  int indirect_x;
+  int indirect_y;
+  int indirect_z;
   int rand_state;
   int msg_count;
   GLSLMsgEntry msg_buf[MAX_MESSAGES];

From 1c5e4be6735036b4d0c0fc9bae1387d0f10a4cdb Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 15:31:05 +0800
Subject: [PATCH 2/9] [opengl] Add debug indirect evaluator kernel writing the dispatch size to the runtime buffer

---
 taichi/backends/opengl/opengl_api.cpp         | 20 +++++++++++++++----
 .../backends/opengl/shaders/indirect.glsl.h   | 14 +++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 taichi/backends/opengl/shaders/indirect.glsl.h

diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index 486b63f06f87b..22e8700a58740 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -458,6 +458,7 @@ void display_kernel_info(std::string const &kernel_name,
                      taichi::starts_with(kernel_name, "tensor_to_") ||
                      taichi::starts_with(kernel_name, "matrix_to_") ||
                      taichi::starts_with(kernel_name, "ext_arr_to_") ||
+                     //taichi::starts_with(kernel_name, "indirect_evaluator_") ||
                      taichi::starts_with(kernel_name, "jit_evaluator_");
   if (!is_accessor)
     TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code);
@@ -494,6 +495,16 @@ struct CompiledKernel {
     glsl->link();
   }
 
+  static std::unique_ptr<CompiledKernel> dbg_make_indirect_evaluator() {
+    auto ps = std::make_unique<ParallelSize_ConstRange>(0);
+    std::string source =
+#include "taichi/backends/opengl/shaders/indirect.glsl.h"
+      ;
+    auto ck = std::make_unique<CompiledKernel>("indirect_evaluator_0",
+        source, std::move(ps));
+    return ck;
+  }
+
   void dispatch_compute(GLSLLauncher *launcher) const {
     // https://www.khronos.org/opengl/wiki/Compute_Shader
     // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
@@ -509,10 +520,11 @@ struct CompiledKernel {
       check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
 
     } else {
-      //auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
-      //runtime->as_indirect_buffer();
-      auto root = launcher->impl->core_bufs.get(GLBufId::Root);
-      root->as_indirect_buffer();
+      //auto ie = ps->get_indirect_evaluator();
+      auto ie = dbg_make_indirect_evaluator();
+      ie->dispatch_compute(launcher);
+      auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
+      runtime->as_indirect_buffer();
       glsl->use();
       glDispatchComputeIndirect(0);  // offset of runtime.indirect_x is 0
       check_opengl_error(fmt::format("glDispatchComputeIndirect"));
diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h
new file mode 100644
index 0000000000000..f8bbfba43d33b
--- /dev/null
+++ b/taichi/backends/opengl/shaders/indirect.glsl.h
@@ -0,0 +1,14 @@
+// vim: ft=glsl
+// clang-format off
+#include "taichi/util/macros.h"
+"#version 430 core\nprecision highp float;\n"
+#define __GLSL__
+#include "taichi/backends/opengl/shaders/runtime.h"
+#undef __GLSL__
+STR(
+void main() {  // indirect parallel size evaluator kernel
+  _indirect_x_ = 128;
+  _indirect_y_ = 1;
+  _indirect_z_ = 1;
+}
+)

From d42c54462a48966b1b14ada5832c87b015c5959a Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 16:00:38 +0800
Subject: [PATCH 3/9] CompiledKernel refactor to PIMPL

---
 taichi/backends/opengl/opengl_api.cpp | 37 ++++++++++++++++++++-------
 taichi/backends/opengl/opengl_api.h   | 17 ++++++++++++
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index 22e8700a58740..379cc761c9886 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -470,20 +470,15 @@ void display_kernel_info(std::string const &kernel_name,
 #endif
 }
 
-struct CompiledKernel {
+struct CompiledKernel::Impl {
   std::string kernel_name;
   std::unique_ptr<GLProgram> glsl;
   std::unique_ptr<ParallelSize> ps;
   std::string source;
 
-  // disscussion:
-  // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
-  CompiledKernel(CompiledKernel &&) = default;
-  CompiledKernel &operator=(CompiledKernel &&) = default;
-
-  explicit CompiledKernel(const std::string &kernel_name_,
-                          const std::string &kernel_source_code,
-                          std::unique_ptr<ParallelSize> ps_)
+  Impl(const std::string &kernel_name_,
+       const std::string &kernel_source_code,
+       std::unique_ptr<ParallelSize> ps_)
       : kernel_name(kernel_name_), ps(std::move(ps_)) {
     source =
         kernel_source_code +
@@ -715,6 +710,18 @@ bool is_opengl_api_available() {
 struct GLProgram {};
 struct GLSLLauncherImpl {};
 
+struct CompiledKernel::Impl {
+  Impl(const std::string &kernel_name_,
+       const std::string &kernel_source_code,
+       std::unique_ptr<ParallelSize> ps_) {
+    TI_NOT_IMPLEMENTED;
+  }
+
+  void dispatch_compute(GLSLLauncher *launcher) const {
+    TI_NOT_IMPLEMENTED;
+  }
+};
+
 struct CompiledProgram::Impl {
   UsedFeature used;
 
@@ -823,6 +830,18 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const {
   impl->launch(ctx, launcher);
 }
 
+CompiledKernel::CompiledKernel(const std::string &kernel_name_,
+                 const std::string &kernel_source_code,
+                 std::unique_ptr<ParallelSize> ps_)
+      : impl(std::make_unique<Impl>(kernel_name_, kernel_source_code, std::move(ps_))) {
+}
+
+void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const {
+  impl->dispatch_compute(launcher);
+}
+
+CompiledKernel::~CompiledKernel() = default;
+
 GLSLLauncher::~GLSLLauncher() = default;
 
 }  // namespace opengl
diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h
index 66be118447bac..25f53fa587fd2 100644
--- a/taichi/backends/opengl/opengl_api.h
+++ b/taichi/backends/opengl/opengl_api.h
@@ -82,6 +82,23 @@ class ParallelSize_StructFor : public ParallelSize {
   virtual ~ParallelSize_StructFor() override = default;
 };
 
+struct CompiledKernel {
+  struct Impl;
+  std::unique_ptr<Impl> impl;
+
+  // discussion:
+  // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
+  CompiledKernel(CompiledKernel &&) = default;
+  CompiledKernel &operator=(CompiledKernel &&) = default;
+
+  CompiledKernel(const std::string &kernel_name_,
+                 const std::string &kernel_source_code,
+                 std::unique_ptr<ParallelSize> ps_);
+  ~CompiledKernel();
+
+  void dispatch_compute(GLSLLauncher *launcher) const;
+};
+
 struct CompiledProgram {
   struct Impl;
   std::unique_ptr<Impl> impl;

From 248546bc9f6673c778cb9f5442a13c9a034b1c32 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 16:12:48 +0800
Subject: [PATCH 4/9] ps->get_indirect_evaluator

---
 con.py                                |  2 +-
 taichi/backends/opengl/opengl_api.cpp | 28 ++++++++++++++++++++++++---
 taichi/backends/opengl/opengl_api.h   |  4 ++++
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/con.py b/con.py
index 4bbb18ac90333..f00f95cea57d9 100644
--- a/con.py
+++ b/con.py
@@ -16,7 +16,7 @@ def indirect():
 a[0] = 128
 a[1] = 1
 a[2] = 1
-a[3] = N
+a[3] = N - 1
 stmt = lambda: indirect()
 print(timeit.timeit(stmt, stmt, number=10000))
 print(x)
diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index 379cc761c9886..f536400170c27 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -321,6 +321,20 @@ bool ParallelSize_DynamicRange::is_indirect() const {
   return true;
 }
 
+std::unique_ptr<CompiledKernel> ParallelSize::get_indirect_evaluator() {
+  return nullptr;
+}
+
+std::unique_ptr<CompiledKernel> ParallelSize_DynamicRange::get_indirect_evaluator() {
+  auto ps = std::make_unique<ParallelSize_ConstRange>(0);
+  std::string source =
+#include "taichi/backends/opengl/shaders/indirect.glsl.h"
+    ;
+  auto ck = std::make_unique<CompiledKernel>("indirect_evaluator_0",
+      source, std::move(ps));
+  return ck;
+}
+
 size_t ParallelSize::get_threads_per_block() const {
   size_t limit = opengl_threads_per_block;
   size_t n = threads_per_block.value_or(0);
@@ -508,15 +522,15 @@ struct CompiledKernel::Impl {
     // `glDispatchCompute(X, Y, Z)`   - the X*Y*Z  == `Blocks`   in CUDA
     // `layout(local_size_x = X) in;` - the X      == `Threads`  in CUDA
     //
-    if (/*!taichi::starts_with(kernel_name, "indirect_") && */!ps->is_indirect()) {
+    if (!ps->is_indirect()) {
       int num_blocks = ps->get_num_blocks(launcher);
       glsl->use();
       glDispatchCompute(num_blocks, 1, 1);
       check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
 
     } else {
-      //auto ie = ps->get_indirect_evaluator();
-      auto ie = dbg_make_indirect_evaluator();
+      auto ie = ps->get_indirect_evaluator();
+      //auto ie = dbg_make_indirect_evaluator();
       ie->dispatch_compute(launcher);
       auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
       runtime->as_indirect_buffer();
@@ -800,6 +814,14 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
 
+std::unique_ptr<CompiledKernel> ParallelSize::get_indirect_evaluator() {
+  TI_NOT_IMPLEMENTED;
+}
+
+std::unique_ptr<CompiledKernel> ParallelSize_DynamicRange::get_indirect_evaluator() {
+  TI_NOT_IMPLEMENTED;
+}
+
 ParallelSize_ConstRange::ParallelSize_ConstRange(size_t num_strides)
     : num_strides(num_strides) {
 }
diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h
index 25f53fa587fd2..ce1501ffd5eeb 100644
--- a/taichi/backends/opengl/opengl_api.h
+++ b/taichi/backends/opengl/opengl_api.h
@@ -37,6 +37,8 @@ extern int opengl_threads_per_block;
     return false;                  \
   })()
 
+struct CompiledKernel;
+
 class ParallelSize {
   // GLSL: stride < invocation < local work group < 'dispatch'
   // CUDA: stride < thread < block < grid
@@ -48,6 +50,7 @@ class ParallelSize {
   virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0;
   size_t get_num_threads(GLSLLauncher *launcher) const;
   size_t get_num_blocks(GLSLLauncher *launcher) const;
+  virtual std::unique_ptr<CompiledKernel> get_indirect_evaluator();
   virtual size_t get_threads_per_block() const;
   virtual ~ParallelSize();
 };
@@ -72,6 +75,7 @@ class ParallelSize_DynamicRange : public ParallelSize {
   ParallelSize_DynamicRange(OffloadedStmt *stmt);
   virtual size_t get_num_strides(GLSLLauncher *launcher) const override;
   virtual ~ParallelSize_DynamicRange() override = default;
+  virtual std::unique_ptr<CompiledKernel> get_indirect_evaluator() override;
   virtual bool is_indirect() const override;
 };
 

From 5f32be9f671099538da5f7ad4734d6e97352cbdf Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 16:19:20 +0800
Subject: [PATCH 5/9] cache indirect evaluator kernel

---
 taichi/backends/opengl/opengl_api.cpp | 35 ++++++++++-----------------
 taichi/backends/opengl/opengl_api.h   |  5 ++--
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index f536400170c27..8142425f3d26c 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -321,18 +321,20 @@ bool ParallelSize_DynamicRange::is_indirect() const {
   return true;
 }
 
-std::unique_ptr<CompiledKernel> ParallelSize::get_indirect_evaluator() {
+CompiledKernel *ParallelSize::get_indirect_evaluator() {
   return nullptr;
 }
 
-std::unique_ptr<CompiledKernel> ParallelSize_DynamicRange::get_indirect_evaluator() {
-  auto ps = std::make_unique<ParallelSize_ConstRange>(0);
-  std::string source =
+CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
+  if (!indirect_evaluator) {
+    auto ps = std::make_unique<ParallelSize_ConstRange>(0);
+    std::string source =
 #include "taichi/backends/opengl/shaders/indirect.glsl.h"
-    ;
-  auto ck = std::make_unique<CompiledKernel>("indirect_evaluator_0",
-      source, std::move(ps));
-  return ck;
+      ;
+    indirect_evaluator = std::make_unique<CompiledKernel>(
+        "indirect_evaluator_opengl", source, std::move(ps));
+  }
+  return indirect_evaluator.get();
 }
 
 size_t ParallelSize::get_threads_per_block() const {
@@ -472,7 +474,7 @@ void display_kernel_info(std::string const &kernel_name,
                      taichi::starts_with(kernel_name, "tensor_to_") ||
                      taichi::starts_with(kernel_name, "matrix_to_") ||
                      taichi::starts_with(kernel_name, "ext_arr_to_") ||
-                     //taichi::starts_with(kernel_name, "indirect_evaluator_") ||
+                     taichi::starts_with(kernel_name, "indirect_evaluator_") ||
                      taichi::starts_with(kernel_name, "jit_evaluator_");
   if (!is_accessor)
     TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code);
@@ -504,16 +506,6 @@ struct CompiledKernel::Impl {
     glsl->link();
   }
 
-  static std::unique_ptr<CompiledKernel> dbg_make_indirect_evaluator() {
-    auto ps = std::make_unique<ParallelSize_ConstRange>(0);
-    std::string source =
-#include "taichi/backends/opengl/shaders/indirect.glsl.h"
-      ;
-    auto ck = std::make_unique<CompiledKernel>("indirect_evaluator_0",
-        source, std::move(ps));
-    return ck;
-  }
-
   void dispatch_compute(GLSLLauncher *launcher) const {
     // https://www.khronos.org/opengl/wiki/Compute_Shader
     // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
@@ -530,7 +522,6 @@ struct CompiledKernel::Impl {
 
     } else {
       auto ie = ps->get_indirect_evaluator();
-      //auto ie = dbg_make_indirect_evaluator();
       ie->dispatch_compute(launcher);
       auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
       runtime->as_indirect_buffer();
@@ -814,11 +805,11 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
 
-std::unique_ptr<CompiledKernel> ParallelSize::get_indirect_evaluator() {
+CompiledKernel *ParallelSize::get_indirect_evaluator() {
   TI_NOT_IMPLEMENTED;
 }
 
-std::unique_ptr<CompiledKernel> ParallelSize_DynamicRange::get_indirect_evaluator() {
+CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
   TI_NOT_IMPLEMENTED;
 }
 
diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h
index ce1501ffd5eeb..e725ebfcc5dc4 100644
--- a/taichi/backends/opengl/opengl_api.h
+++ b/taichi/backends/opengl/opengl_api.h
@@ -50,7 +50,7 @@ class ParallelSize {
   virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0;
   size_t get_num_threads(GLSLLauncher *launcher) const;
   size_t get_num_blocks(GLSLLauncher *launcher) const;
-  virtual std::unique_ptr<CompiledKernel> get_indirect_evaluator();
+  virtual CompiledKernel *get_indirect_evaluator();
   virtual size_t get_threads_per_block() const;
   virtual ~ParallelSize();
 };
@@ -70,12 +70,13 @@ class ParallelSize_DynamicRange : public ParallelSize {
   bool const_end;
   int range_begin;
   int range_end;
+  std::unique_ptr<CompiledKernel> indirect_evaluator = nullptr;
 
  public:
   ParallelSize_DynamicRange(OffloadedStmt *stmt);
   virtual size_t get_num_strides(GLSLLauncher *launcher) const override;
   virtual ~ParallelSize_DynamicRange() override = default;
-  virtual std::unique_ptr<CompiledKernel> get_indirect_evaluator() override;
+  virtual CompiledKernel *get_indirect_evaluator() override;
   virtual bool is_indirect() const override;
 };
 

From 1cee610929ba00270ca0ed4271c7fe1257794b9d Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 16:32:46 +0800
Subject: [PATCH 6/9] _compute_indirect as template

---
 con.py                                         | 9 +++------
 taichi/backends/opengl/opengl_api.cpp          | 8 +++++++-
 taichi/backends/opengl/shaders/indirect.glsl.h | 7 ++++++-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/con.py b/con.py
index f00f95cea57d9..898f5fc56c1ec 100644
--- a/con.py
+++ b/con.py
@@ -5,18 +5,15 @@
 
 N = 2**14
 
-a = ti.field(int, 4)
+a = ti.field(int, ())
 x = ti.field(int, N)
 
 @ti.kernel
 def indirect():
-    for i in range(a[3]):
+    for i in range(a[None]):
         x[i] = i + 1
 
-a[0] = 128
-a[1] = 1
-a[2] = 1
-a[3] = N - 1
+a[None] = N - 1
 stmt = lambda: indirect()
 print(timeit.timeit(stmt, stmt, number=10000))
 print(x)
diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index 8142425f3d26c..e62149ab42fdd 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -328,8 +328,14 @@ CompiledKernel *ParallelSize::get_indirect_evaluator() {
 CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
   if (!indirect_evaluator) {
     auto ps = std::make_unique<ParallelSize_ConstRange>(0);
+    size_t SPT = strides_per_thread.value_or(1);
+    size_t TPG = ParallelSize::get_threads_per_block();
     std::string source =
 #include "taichi/backends/opengl/shaders/indirect.glsl.h"
+      + fmt::format("\nvoid main() {{\n"
+          "  _compute_indirect({}, {}, {}, {}, {}, {});\n"
+          "}}\n", (int)const_begin, (int)const_end, range_begin, range_end,
+          SPT, TPG);
       ;
     indirect_evaluator = std::make_unique<CompiledKernel>(
         "indirect_evaluator_opengl", source, std::move(ps));
@@ -352,7 +358,7 @@ size_t ParallelSize_ConstRange::get_num_strides(GLSLLauncher *launcher) const {
 }
 
 size_t ParallelSize_ConstRange::get_threads_per_block() const {
-  size_t n = get_num_threads(nullptr);
+  size_t n = get_num_threads(nullptr);  // TODO: clean up these (iapr)
   size_t TPG = ParallelSize::get_threads_per_block();
   return std::max(std::min(n, TPG), (size_t)1);
 }
diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h
index f8bbfba43d33b..14ef558302e33 100644
--- a/taichi/backends/opengl/shaders/indirect.glsl.h
+++ b/taichi/backends/opengl/shaders/indirect.glsl.h
@@ -6,9 +6,14 @@
 #include "taichi/backends/opengl/shaders/runtime.h"
 #undef __GLSL__
 STR(
-void main() {  // indirect parallel size evaluator kernel
+void _compute_indirect(
+  int const_begin, int const_end,
+  int gtmp_begin, int range_end,
+  int SPT, int TPG) {
+  // indirect work group size evaluator kernel
   _indirect_x_ = 128;
   _indirect_y_ = 1;
   _indirect_z_ = 1;
 }
+// get_indirect_evaluator() will prepend a main here, with template arguments
 )

From 8e754c2d1c6d7eca50670eb0ca1b84767d1ed8f2 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 16:41:29 +0800
Subject: [PATCH 7/9] [opengl] Compute indirect work group count from range endpoints stored in gtmp

---
 .../backends/opengl/shaders/indirect.glsl.h   | 26 ++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h
index 14ef558302e33..dd4580c478ddc 100644
--- a/taichi/backends/opengl/shaders/indirect.glsl.h
+++ b/taichi/backends/opengl/shaders/indirect.glsl.h
@@ -6,14 +6,34 @@
 #include "taichi/backends/opengl/shaders/runtime.h"
 #undef __GLSL__
 STR(
+// taichi uses gtmp for storing dynamic range endpoints
+layout(std430, binding = 1) buffer gtmp_i32 { int _gtmp_i32_[]; };
+
+// indirect work group size evaluator kernel template
 void _compute_indirect(
   int const_begin, int const_end,
-  int gtmp_begin, int range_end,
+  int range_begin, int range_end,
   int SPT, int TPG) {
-  // indirect work group size evaluator kernel
-  _indirect_x_ = 128;
+
+  // dynamic range for
+  if (const_begin == 0) {
+    range_begin = _gtmp_i32_[range_begin >> 2];
+  }
+  if (const_end == 0) {
+    range_end = _gtmp_i32_[range_end >> 2];
+  }
+  int nstrides = 1;
+  if (range_end > range_begin) {
+    nstrides = range_end - range_begin;
+  }
+
+  int nthreads = max((nstrides + SPT - 1) / SPT, 1);
+  int nblocks = max((nthreads + TPG - 1) / TPG, 1);
+
+  _indirect_x_ = nblocks;
   _indirect_y_ = 1;
   _indirect_z_ = 1;
 }
+
 // get_indirect_evaluator() will prepend a main here, with template arguments
 )

From 8654287d57e9c13a44dabb1ae1ca90761dedbc3b Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 30 Oct 2020 16:50:01 +0800
Subject: [PATCH 8/9] clean up

---
 con.py                                    | 20 --------------------
 mrp.py                                    | 18 ------------------
 taichi/backends/opengl/codegen_opengl.cpp |  3 ++-
 3 files changed, 2 insertions(+), 39 deletions(-)
 delete mode 100644 con.py
 delete mode 100644 mrp.py

diff --git a/con.py b/con.py
deleted file mode 100644
index 898f5fc56c1ec..0000000000000
--- a/con.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import taichi as ti
-import timeit
-
-ti.init(ti.opengl, log_level=ti.DEBUG)
-
-N = 2**14
-
-a = ti.field(int, ())
-x = ti.field(int, N)
-
-@ti.kernel
-def indirect():
-    for i in range(a[None]):
-        x[i] = i + 1
-
-a[None] = N - 1
-stmt = lambda: indirect()
-print(timeit.timeit(stmt, stmt, number=10000))
-print(x)
-print(a)
diff --git a/mrp.py b/mrp.py
deleted file mode 100644
index 03a72e7002c2e..0000000000000
--- a/mrp.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import taichi as ti
-import timeit
-
-ti.init(ti.opengl)
-
-N = 2**14
-
-x = ti.field(int, N)
-
-@ti.kernel
-def func():
-    for i in x:
-        x[i] = i
-    for i in range(x[2]):
-        x[i] = i
-
-stmt = lambda: func()
-print(timeit.timeit(stmt, stmt, number=10000))
diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp
index c7d840ad3004b..7cd6f85f50b72 100644
--- a/taichi/backends/opengl/codegen_opengl.cpp
+++ b/taichi/backends/opengl/codegen_opengl.cpp
@@ -796,6 +796,7 @@ class KernelGen : public IRVisitor {
       ScopedGridStrideLoop _gsl(this);
       emit("if (_sid >= {}) {};", end_value - begin_value, get_return_stmt());
       emit("int _itv = {} + _sid * {};", begin_value, 1 /* stmt->step? */);
+      stmt->body->accept(this);
     } else {
       ScopedIndent _s(line_appender_);
       emit("// range known at runtime");
@@ -811,8 +812,8 @@ class KernelGen : public IRVisitor {
       emit("int _beg = {}, _end = {};", begin_expr, end_expr);
       emit("int _itv = _beg + _sid;");
       emit("if (_itv >= _end) {};", get_return_stmt());
+      stmt->body->accept(this);
     }
-    stmt->body->accept(this);
 
     if (used_tls) {
       TI_ASSERT(stmt->tls_epilogue != nullptr);

From d3ba27285532a90c583a66cc730dc0640846419f Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Fri, 30 Oct 2020 05:43:11 -0400
Subject: [PATCH 9/9] [skip ci] enforce code format

---
 taichi/backends/opengl/opengl_api.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index e62149ab42fdd..7fe288eabb4c9 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -332,11 +332,12 @@ CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
     size_t TPG = ParallelSize::get_threads_per_block();
     std::string source =
 #include "taichi/backends/opengl/shaders/indirect.glsl.h"
-      + fmt::format("\nvoid main() {{\n"
-          "  _compute_indirect({}, {}, {}, {}, {}, {});\n"
-          "}}\n", (int)const_begin, (int)const_end, range_begin, range_end,
-          SPT, TPG);
-      ;
+        +fmt::format(
+            "\nvoid main() {{\n"
+            "  _compute_indirect({}, {}, {}, {}, {}, {});\n"
+            "}}\n",
+            (int)const_begin, (int)const_end, range_begin, range_end, SPT, TPG);
+    ;
     indirect_evaluator = std::make_unique<CompiledKernel>(
         "indirect_evaluator_opengl", source, std::move(ps));
   }
@@ -850,9 +851,11 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const {
 }
 
 CompiledKernel::CompiledKernel(const std::string &kernel_name_,
-                 const std::string &kernel_source_code,
-                 std::unique_ptr<ParallelSize> ps_)
-      : impl(std::make_unique<Impl>(kernel_name_, kernel_source_code, std::move(ps_))) {
+                               const std::string &kernel_source_code,
+                               std::unique_ptr<ParallelSize> ps_)
+    : impl(std::make_unique<Impl>(kernel_name_,
+                                  kernel_source_code,
+                                  std::move(ps_))) {
 }
 
 void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const {