diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
index 9cb437b6e3e42..7fe288eabb4c9 100644
--- a/taichi/backends/opengl/opengl_api.cpp
+++ b/taichi/backends/opengl/opengl_api.cpp
@@ -209,6 +209,13 @@ struct GLSSBO {
     check_opengl_error("glBindBufferRange");
   }
 
+  // Binds this buffer to GL_DISPATCH_INDIRECT_BUFFER so that a following
+  // glDispatchComputeIndirect() sources its work group counts from it.
+  void as_indirect_buffer() {
+    glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, id_);
+    check_opengl_error("glBindBuffer");
+  }
+
   void *map(size_t offset,
             size_t length,
             GLbitfield access = GL_READ_ONLY) const {
@@ -309,6 +316,44 @@ struct GLSLLauncherImpl {
 ParallelSize::~ParallelSize() {
 }
 
+// By default the block count is computed on the host and dispatched directly.
+bool ParallelSize::is_indirect() const {
+  return false;
+}
+
+// A dynamic range reads its endpoints on the device (see indirect.glsl.h),
+// so its work group count is supplied through an indirect dispatch.
+bool ParallelSize_DynamicRange::is_indirect() const {
+  return true;
+}
+
+// Directly dispatched parallel sizes have no evaluator kernel.
+CompiledKernel *ParallelSize::get_indirect_evaluator() {
+  return nullptr;
+}
+
+// Returns the kernel that stores this range's work group count in the runtime
+// buffer. It is compiled on first use and cached in `indirect_evaluator`.
+CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
+  if (!indirect_evaluator) {
+    auto ps = std::make_unique<ParallelSize_ConstRange>(0);
+    size_t SPT = strides_per_thread.value_or(1);
+    size_t TPG = ParallelSize::get_threads_per_block();
+    // indirect.glsl.h provides the shader text defining _compute_indirect();
+    // the main() appended below bakes this range's parameters into the call.
+    std::string source =
+#include "taichi/backends/opengl/shaders/indirect.glsl.h"
+        + fmt::format(
+              "\nvoid main() {{\n"
+              " _compute_indirect({}, {}, {}, {}, {}, {});\n"
+              "}}\n",
+              (int)const_begin, (int)const_end, range_begin, range_end, SPT, TPG);
+    indirect_evaluator = std::make_unique<CompiledKernel>(
+        "indirect_evaluator_opengl", source, std::move(ps));
+  }
+  return indirect_evaluator.get();
+}
+
 size_t ParallelSize::get_threads_per_block() const {
   size_t limit = opengl_threads_per_block;
   size_t n = threads_per_block.value_or(0);
@@ -324,7 +369,7 @@ size_t ParallelSize_ConstRange::get_num_strides(GLSLLauncher *launcher) const {
 }
 
 size_t ParallelSize_ConstRange::get_threads_per_block() const {
-  size_t n = get_num_threads(nullptr);
+  size_t n = get_num_threads(nullptr);  // TODO: clean up these (iapr)
   size_t TPG = ParallelSize::get_threads_per_block();
   return std::max(std::min(n, TPG), (size_t)1);
 }
@@ -446,6 +491,7 @@ void display_kernel_info(std::string const &kernel_name,
       taichi::starts_with(kernel_name, "tensor_to_") ||
       taichi::starts_with(kernel_name, "matrix_to_") ||
       taichi::starts_with(kernel_name, "ext_arr_to_") ||
+      taichi::starts_with(kernel_name, "indirect_evaluator_") ||
       taichi::starts_with(kernel_name, "jit_evaluator_");
   if (!is_accessor)
     TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code);
@@ -457,20 +503,16 @@ void display_kernel_info(std::string const &kernel_name,
 #endif
 }
 
-struct CompiledKernel {
+// Backend state of a CompiledKernel: GLSL program, parallel size and source.
+struct CompiledKernel::Impl {
   std::string kernel_name;
   std::unique_ptr<GLProgram> glsl;
   std::unique_ptr<ParallelSize> ps;
   std::string source;
 
-  // disscussion:
-  // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
-  CompiledKernel(CompiledKernel &&) = default;
-  CompiledKernel &operator=(CompiledKernel &&) = default;
-
-  explicit CompiledKernel(const std::string &kernel_name_,
-                          const std::string &kernel_source_code,
-                          std::unique_ptr<ParallelSize> ps_)
+  Impl(const std::string &kernel_name_,
+       const std::string &kernel_source_code,
+       std::unique_ptr<ParallelSize> ps_)
       : kernel_name(kernel_name_), ps(std::move(ps_)) {
     source =
         kernel_source_code +
@@ -483,10 +525,6 @@ struct CompiledKernel {
   }
 
   void dispatch_compute(GLSLLauncher *launcher) const {
-    int num_blocks = ps->get_num_blocks(launcher);
-
-    glsl->use();
-
     // https://www.khronos.org/opengl/wiki/Compute_Shader
     // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
     // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf
@@ -494,8 +532,26 @@ struct CompiledKernel {
     // `glDispatchCompute(X, Y, Z)` - the X*Y*Z == `Blocks` in CUDA
     // `layout(local_size_x = X) in;` - the X == `Threads` in CUDA
     //
-    glDispatchCompute(num_blocks, 1, 1);
-    check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
+    if (!ps->is_indirect()) {
+      int num_blocks = ps->get_num_blocks(launcher);
+      glsl->use();
+      glDispatchCompute(num_blocks, 1, 1);
+      check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
+    } else {
+      // The block count lives on the device: run the evaluator kernel, which
+      // stores it in the runtime buffer, then dispatch indirectly from there.
+      auto ie = ps->get_indirect_evaluator();
+      ie->dispatch_compute(launcher);
+      // The evaluator only issues a shader-storage barrier. Reading shader
+      // writes as indirect command data needs GL_COMMAND_BARRIER_BIT.
+      glMemoryBarrier(GL_COMMAND_BARRIER_BIT);
+      check_opengl_error("glMemoryBarrier(GL_COMMAND_BARRIER_BIT)");
+      auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
+      runtime->as_indirect_buffer();
+      glsl->use();
+      glDispatchComputeIndirect(0);  // offset of runtime.indirect_x is 0
+      check_opengl_error("glDispatchComputeIndirect");
+    }
 
     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
     check_opengl_error("glMemoryBarrier");
@@ -682,6 +738,18 @@ bool is_opengl_api_available() {
 struct GLProgram {};
 struct GLSLLauncherImpl {};
 
+struct CompiledKernel::Impl {
+  Impl(const std::string &kernel_name_,
+       const std::string &kernel_source_code,
+       std::unique_ptr<ParallelSize> ps_) {
+    TI_NOT_IMPLEMENTED;
+  }
+
+  void dispatch_compute(GLSLLauncher *launcher) const {
+    TI_NOT_IMPLEMENTED;
+  }
+};
+
 struct CompiledProgram::Impl {
   UsedFeature used;
 
@@ -723,6 +791,14 @@ bool initialize_opengl(bool error_tolerance) {
 ParallelSize::~ParallelSize() {
 }
 
+bool ParallelSize::is_indirect() const {
+  TI_NOT_IMPLEMENTED;
+}
+
+bool ParallelSize_DynamicRange::is_indirect() const {
+  TI_NOT_IMPLEMENTED;
+}
+
 size_t ParallelSize::get_num_threads(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
@@ -752,6 +828,14 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const {
   TI_NOT_IMPLEMENTED;
 }
 
+CompiledKernel *ParallelSize::get_indirect_evaluator() {
+  TI_NOT_IMPLEMENTED;
+}
+
+CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
+  TI_NOT_IMPLEMENTED;
+}
+
 ParallelSize_ConstRange::ParallelSize_ConstRange(size_t num_strides)
     : num_strides(num_strides) {
 }
@@ -782,6 +866,20 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const {
   impl->launch(ctx, launcher);
 }
 
+CompiledKernel::CompiledKernel(const std::string &kernel_name_,
+                               const std::string &kernel_source_code,
+                               std::unique_ptr<ParallelSize> ps_)
+    : impl(std::make_unique<Impl>(kernel_name_,
+                                  kernel_source_code,
+                                  std::move(ps_))) {
+}
+
+void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const {
+  impl->dispatch_compute(launcher);
+}
+
+CompiledKernel::~CompiledKernel() = default;
+
 GLSLLauncher::~GLSLLauncher() = default;
 
 }  // namespace opengl
diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h
index 4e46766992b98..e725ebfcc5dc4 100644
--- a/taichi/backends/opengl/opengl_api.h
+++ b/taichi/backends/opengl/opengl_api.h
@@ -37,6 +37,8 @@ extern int opengl_threads_per_block;
     return false; \
   })()
 
+struct CompiledKernel;
+
 class ParallelSize {
   // GLSL: stride < invocation < local work group < 'dispatch'
   // CUDA: stride < thread < block < grid
@@ -44,9 +46,13 @@ class ParallelSize {
   std::optional<size_t> strides_per_thread;
   std::optional<size_t> threads_per_block;
 
+  // True if the block count is computed on the device (indirect dispatch).
+  virtual bool is_indirect() const;
   virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0;
   size_t get_num_threads(GLSLLauncher *launcher) const;
   size_t get_num_blocks(GLSLLauncher *launcher) const;
+  // Kernel that computes the indirect block count, or nullptr if none.
+  virtual CompiledKernel *get_indirect_evaluator();
   virtual size_t get_threads_per_block() const;
   virtual ~ParallelSize();
 };
@@ -66,11 +72,15 @@ class ParallelSize_DynamicRange : public ParallelSize {
   bool const_end;
   int range_begin;
   int range_end;
+  // Created on first call to get_indirect_evaluator().
+  std::unique_ptr<CompiledKernel> indirect_evaluator = nullptr;
 
  public:
   ParallelSize_DynamicRange(OffloadedStmt *stmt);
   virtual size_t get_num_strides(GLSLLauncher *launcher) const override;
   virtual ~ParallelSize_DynamicRange() override = default;
+  virtual CompiledKernel *get_indirect_evaluator() override;
+  virtual bool is_indirect() const override;
 };
 
 class ParallelSize_StructFor : public ParallelSize {
@@ -80,6 +90,24 @@ class ParallelSize_StructFor : public ParallelSize {
   virtual ~ParallelSize_StructFor() override = default;
 };
 
+// Handle to a compiled kernel; the backend state lives in Impl (opengl_api.cpp).
+struct CompiledKernel {
+  struct Impl;
+  std::unique_ptr<Impl> impl;
+
+  // discussion:
+  // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
+  CompiledKernel(CompiledKernel &&) = default;
+  CompiledKernel &operator=(CompiledKernel &&) = default;
+
+  CompiledKernel(const std::string &kernel_name_,
+                 const std::string &kernel_source_code,
+                 std::unique_ptr<ParallelSize> ps_);
+  ~CompiledKernel();
+
+  void dispatch_compute(GLSLLauncher *launcher) const;
+};
+
 struct CompiledProgram {
   struct Impl;
   std::unique_ptr<Impl> impl;
diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h
new file mode 100644
index 0000000000000..dd4580c478ddc
--- /dev/null
+++ b/taichi/backends/opengl/shaders/indirect.glsl.h
@@ -0,0 +1,39 @@
+// vim: ft=glsl
+// clang-format off
+#include "taichi/util/macros.h"
+"#version 430 core\nprecision highp float;\n"
+#define __GLSL__
+#include "taichi/backends/opengl/shaders/runtime.h"
+#undef __GLSL__
+STR(
+// taichi uses gtmp for storing dynamic range endpoints
+layout(std430, binding = 1) buffer gtmp_i32 { int _gtmp_i32_[]; };
+
+// indirect work group size evaluator kernel template
+void _compute_indirect(
+    int const_begin, int const_end,
+    int range_begin, int range_end,
+    int SPT, int TPG) {
+
+  // a non-constant endpoint is looked up in gtmp (index = range_* >> 2)
+  if (const_begin == 0) {
+    range_begin = _gtmp_i32_[range_begin >> 2];
+  }
+  if (const_end == 0) {
+    range_end = _gtmp_i32_[range_end >> 2];
+  }
+  int nstrides = 1;
+  if (range_end > range_begin) {
+    nstrides = range_end - range_begin;
+  }
+
+  int nthreads = max((nstrides + SPT - 1) / SPT, 1);
+  int nblocks = max((nthreads + TPG - 1) / TPG, 1);
+
+  _indirect_x_ = nblocks;
+  _indirect_y_ = 1;
+  _indirect_z_ = 1;
+}
+
+// get_indirect_evaluator() will append a main here, with template arguments
+)
diff --git a/taichi/backends/opengl/shaders/runtime.h b/taichi/backends/opengl/shaders/runtime.h
index c717bfbb6bad6..56f2e47b11f7e 100644
--- a/taichi/backends/opengl/shaders/runtime.h
+++ b/taichi/backends/opengl/shaders/runtime.h
@@ -16,6 +16,10 @@ struct _msg_entry_t {
 };
 
 layout(std430, binding = 6) buffer runtime {
+  // keep these three first: read at offset 0 by glDispatchComputeIndirect
+  int _indirect_x_;
+  int _indirect_y_;
+  int _indirect_z_;
   int _rand_state_;
   int _msg_count_;
   // TODO: move msg buf to gtmp
@@ -46,6 +50,10 @@ struct GLSLMsgEntry {
 };
 
 struct GLSLRuntime {
+  // keep these three first: read at offset 0 by glDispatchComputeIndirect
+  int indirect_x;
+  int indirect_y;
+  int indirect_z;
   int rand_state;
   int msg_count;
   GLSLMsgEntry msg_buf[MAX_MESSAGES];