taichi-dev · yuanming-hu · Oct 30, 2020 · Oct 30, 2020 · Oct 30, 2020 · Oct 30, 2020
diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
@@ -209,6 +209,10 @@ struct GLSSBO {
     check_opengl_error("glBindBufferRange");
   }
 
+  void as_indirect_buffer() {  // bind this buffer for indirect dispatch
+    glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, id_);
+  }  // NOTE(review): no check_opengl_error() here, unlike the bind above
+
   void *map(size_t offset,
             size_t length,
             GLbitfield access = GL_READ_ONLY) const {
@@ -309,6 +313,37 @@ struct GLSLLauncherImpl {
 ParallelSize::~ParallelSize() {
 }
 
+bool ParallelSize::is_indirect() const {  // base-class default
+  return false;  // dispatch_compute then calls glDispatchCompute directly
+}
+
+bool ParallelSize_DynamicRange::is_indirect() const {
+  return true;  // dispatch_compute then uses glDispatchComputeIndirect
+}
+
+CompiledKernel *ParallelSize::get_indirect_evaluator() {  // base default
+  return nullptr;  // no evaluator kernel unless a subclass overrides this
+}
+
+CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
+  if (!indirect_evaluator) {  // built lazily on first use, then cached
+    auto ps = std::make_unique<ParallelSize_ConstRange>(0);
+    const size_t SPT = strides_per_thread.value_or(1);
+    const size_t TPG = ParallelSize::get_threads_per_block();
+    std::string source =  // shader template followed by a generated main()
+#include "taichi/backends/opengl/shaders/indirect.glsl.h"
+        +fmt::format(
+            "\nvoid main() {{\n"
+            "  _compute_indirect({}, {}, {}, {}, {}, {});\n"
+            "}}\n",
+            static_cast<int>(const_begin), static_cast<int>(const_end),
+            range_begin, range_end, SPT, TPG);
+    indirect_evaluator = std::make_unique<CompiledKernel>(
+        "indirect_evaluator_opengl", source, std::move(ps));
+  }
+  return indirect_evaluator.get();  // non-owning; this object keeps ownership
+}
+
 size_t ParallelSize::get_threads_per_block() const {
   size_t limit = opengl_threads_per_block;
   size_t n = threads_per_block.value_or(0);
@@ -324,7 +359,7 @@ size_t ParallelSize_ConstRange::get_num_strides(GLSLLauncher *launcher) const {
 }
 
 size_t ParallelSize_ConstRange::get_threads_per_block() const {
-  size_t n = get_num_threads(nullptr);
+  size_t n = get_num_threads(nullptr);  // TODO: clean up these (iapr)
   size_t TPG = ParallelSize::get_threads_per_block();
   return std::max(std::min(n, TPG), (size_t)1);
 }
@@ -446,6 +481,7 @@ void display_kernel_info(std::string const &kernel_name,
                      taichi::starts_with(kernel_name, "tensor_to_") ||
                      taichi::starts_with(kernel_name, "matrix_to_") ||
                      taichi::starts_with(kernel_name, "ext_arr_to_") ||
+                     taichi::starts_with(kernel_name, "indirect_evaluator_") ||
                      taichi::starts_with(kernel_name, "jit_evaluator_");
   if (!is_accessor)
     TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code);
@@ -457,20 +493,15 @@ void display_kernel_info(std::string const &kernel_name,
 #endif
 }
 
-struct CompiledKernel {
+struct CompiledKernel::Impl {
   std::string kernel_name;
   std::unique_ptr<GLProgram> glsl;
   std::unique_ptr<ParallelSize> ps;
   std::string source;
 
-  // disscussion:
-  // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
-  CompiledKernel(CompiledKernel &&) = default;
-  CompiledKernel &operator=(CompiledKernel &&) = default;
-
-  explicit CompiledKernel(const std::string &kernel_name_,
-                          const std::string &kernel_source_code,
-                          std::unique_ptr<ParallelSize> ps_)
+  Impl(const std::string &kernel_name_,
+       const std::string &kernel_source_code,
+       std::unique_ptr<ParallelSize> ps_)
       : kernel_name(kernel_name_), ps(std::move(ps_)) {
     source =
         kernel_source_code +
@@ -483,19 +514,28 @@ struct CompiledKernel {
   }
 
   void dispatch_compute(GLSLLauncher *launcher) const {
-    int num_blocks = ps->get_num_blocks(launcher);
-
-    glsl->use();
-
     // https://www.khronos.org/opengl/wiki/Compute_Shader
     // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
     // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf
     //
     // `glDispatchCompute(X, Y, Z)`   - the X*Y*Z  == `Blocks`   in CUDA
     // `layout(local_size_x = X) in;` - the X      == `Threads`  in CUDA
     //
-    glDispatchCompute(num_blocks, 1, 1);
-    check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
+    if (!ps->is_indirect()) {
+      int num_blocks = ps->get_num_blocks(launcher);
+      glsl->use();
+      glDispatchCompute(num_blocks, 1, 1);
+      check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
+
+    } else {
+      auto ie = ps->get_indirect_evaluator();
+      ie->dispatch_compute(launcher);
+      auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
+      runtime->as_indirect_buffer();
+      glsl->use();
+      glDispatchComputeIndirect(0);  // offset of runtime.indirect_x is 0
+      check_opengl_error(fmt::format("glDispatchComputeIndirect"));
+    }
 
     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
     check_opengl_error("glMemoryBarrier");
@@ -682,6 +722,18 @@ bool is_opengl_api_available() {
 struct GLProgram {};
 struct GLSLLauncherImpl {};
 
+struct CompiledKernel::Impl {  // stub twin of the real Impl defined earlier
+  Impl(const std::string &kernel_name_,
+       const std::string &kernel_source_code,
+       std::unique_ptr<ParallelSize> ps_) {
+    TI_NOT_IMPLEMENTED;  // presumably the build without OpenGL — TODO confirm
+  }
+
+  void dispatch_compute(GLSLLauncher *launcher) const {
+    TI_NOT_IMPLEMENTED;  // same signature as the real Impl, no behavior
+  }
+};
+
 struct CompiledProgram::Impl {
   UsedFeature used;
 
@@ -723,6 +775,14 @@ bool initialize_opengl(bool error_tolerance) {
 ParallelSize::~ParallelSize() {
 }
 
+bool ParallelSize::is_indirect() const {  // stub twin of the real definition
+  TI_NOT_IMPLEMENTED;  // presumably the build without OpenGL — TODO confirm
+}
+
+bool ParallelSize_DynamicRange::is_indirect() const {  // stub twin
+  TI_NOT_IMPLEMENTED;  // presumably the build without OpenGL — TODO confirm
+}
+
 size_t ParallelSize::get_num_threads(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
@@ -752,6 +812,14 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
 
+CompiledKernel *ParallelSize::get_indirect_evaluator() {  // stub twin
+  TI_NOT_IMPLEMENTED;  // presumably the build without OpenGL — TODO confirm
+}
+
+CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
+  TI_NOT_IMPLEMENTED;  // stub twin; presumably non-OpenGL build — TODO confirm
+}
+
 ParallelSize_ConstRange::ParallelSize_ConstRange(size_t num_strides)
     : num_strides(num_strides) {
 }
@@ -782,6 +850,20 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const {
   impl->launch(ctx, launcher);
 }
 
+CompiledKernel::CompiledKernel(const std::string &kernel_name_,
+                               const std::string &kernel_source_code,
+                               std::unique_ptr<ParallelSize> ps_)
+    : impl(std::make_unique<Impl>(kernel_name_,  // forward all args to Impl
+                                  kernel_source_code,
+                                  std::move(ps_))) {  // ps_ ownership moves on
+}
+
+void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const {
+  impl->dispatch_compute(launcher);  // thin forwarder to Impl
+}
+
+CompiledKernel::~CompiledKernel() = default;  // here Impl is a complete type
+
 GLSLLauncher::~GLSLLauncher() = default;
 
 }  // namespace opengl

diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h
@@ -37,16 +37,20 @@ extern int opengl_threads_per_block;
     return false;                  \
   })()
 
+struct CompiledKernel;
+
 class ParallelSize {
   // GLSL: stride < invocation < local work group < 'dispatch'
   // CUDA: stride < thread < block < grid
  public:
   std::optional<size_t> strides_per_thread;
   std::optional<size_t> threads_per_block;
 
+  virtual bool is_indirect() const;
   virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0;
   size_t get_num_threads(GLSLLauncher *launcher) const;
   size_t get_num_blocks(GLSLLauncher *launcher) const;
+  virtual CompiledKernel *get_indirect_evaluator();
   virtual size_t get_threads_per_block() const;
   virtual ~ParallelSize();
 };
@@ -66,11 +70,14 @@ class ParallelSize_DynamicRange : public ParallelSize {
   bool const_end;
   int range_begin;
   int range_end;
+  std::unique_ptr<CompiledKernel> indirect_evaluator = nullptr;
 
  public:
   ParallelSize_DynamicRange(OffloadedStmt *stmt);
   virtual size_t get_num_strides(GLSLLauncher *launcher) const override;
   virtual ~ParallelSize_DynamicRange() override = default;
+  virtual CompiledKernel *get_indirect_evaluator() override;
+  virtual bool is_indirect() const override;
 };
 
 class ParallelSize_StructFor : public ParallelSize {
@@ -80,6 +87,23 @@ class ParallelSize_StructFor : public ParallelSize {
   virtual ~ParallelSize_StructFor() override = default;
 };
 
+struct CompiledKernel {  // one compiled compute kernel; state lives in Impl
+  struct Impl;  // defined in opengl_api.cpp
+  std::unique_ptr<Impl> impl;
+
+  // discussion:
+  // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
+  CompiledKernel(CompiledKernel &&) = default;
+  CompiledKernel &operator=(CompiledKernel &&) = default;
+
+  CompiledKernel(const std::string &kernel_name_,
+                 const std::string &kernel_source_code,
+                 std::unique_ptr<ParallelSize> ps_);
+  ~CompiledKernel();  // out of line: Impl is incomplete in this header
+
+  void dispatch_compute(GLSLLauncher *launcher) const;  // forwards to Impl
+};
+
 struct CompiledProgram {
   struct Impl;
   std::unique_ptr<Impl> impl;

diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h
@@ -0,0 +1,39 @@
+// vim: ft=glsl
+// clang-format off
+#include "taichi/util/macros.h"
+"#version 430 core\nprecision highp float;\n"
+#define __GLSL__
+#include "taichi/backends/opengl/shaders/runtime.h"
+#undef __GLSL__
+STR(
+// taichi uses gtmp for storing dynamic range endpoints
+layout(std430, binding = 1) buffer gtmp_i32 { int _gtmp_i32_[]; };
+
+// indirect work group size evaluator kernel template
+void _compute_indirect(  // stores the work group count in _indirect_x/y/z_
+  int const_begin, int const_end,  // 0: load that endpoint from gtmp
+  int range_begin, int range_end,  // endpoint value, or its gtmp location
+  int SPT, int TPG) {  // strides per thread, threads per work group
+
+  // dynamic range-for: endpoints that are not constant are read from gtmp
+  if (const_begin == 0) {
+    range_begin = _gtmp_i32_[range_begin >> 2];  // presumably a byte offset
+  }
+  if (const_end == 0) {
+    range_end = _gtmp_i32_[range_end >> 2];  // same conversion as above
+  }
+  int nstrides = 1;  // stays 1 when the range is empty or reversed
+  if (range_end > range_begin) {
+    nstrides = range_end - range_begin;
+  }
+
+  int nthreads = max((nstrides + SPT - 1) / SPT, 1);  // round up, at least 1
+  int nblocks = max((nthreads + TPG - 1) / TPG, 1);  // round up, at least 1
+
+  _indirect_x_ = nblocks;  // only the x dimension varies
+  _indirect_y_ = 1;
+  _indirect_z_ = 1;
+}
+
+// get_indirect_evaluator() will append a main() after this, with the template arguments filled in
+)
diff --git a/taichi/backends/opengl/shaders/runtime.h b/taichi/backends/opengl/shaders/runtime.h
@@ -16,6 +16,9 @@ struct _msg_entry_t {
 };
 
 layout(std430, binding = 6) buffer runtime {
+  int _indirect_x_;
+  int _indirect_y_;
+  int _indirect_z_;
   int _rand_state_;
   int _msg_count_;
   // TODO: move msg buf to gtmp
@@ -46,6 +49,9 @@ struct GLSLMsgEntry {
 };
 
 struct GLSLRuntime {
+  int indirect_x;
+  int indirect_y;
+  int indirect_z;
   int rand_state;
   int msg_count;
   GLSLMsgEntry msg_buf[MAX_MESSAGES];