From e0ca120e2068b91fec153c8e173e776fd6d2b975 Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 15:09:39 +0800 Subject: [PATCH 1/9] tmp1 --- con.py | 23 +++++++++++++ mrp.py | 18 ++++++++++ taichi/backends/opengl/codegen_opengl.cpp | 3 +- taichi/backends/opengl/opengl_api.cpp | 41 +++++++++++++++++++---- taichi/backends/opengl/opengl_api.h | 2 ++ taichi/backends/opengl/shaders/runtime.h | 6 ++++ 6 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 con.py create mode 100644 mrp.py diff --git a/con.py b/con.py new file mode 100644 index 0000000000000..4bbb18ac90333 --- /dev/null +++ b/con.py @@ -0,0 +1,23 @@ +import taichi as ti +import timeit + +ti.init(ti.opengl, log_level=ti.DEBUG) + +N = 2**14 + +a = ti.field(int, 4) +x = ti.field(int, N) + +@ti.kernel +def indirect(): + for i in range(a[3]): + x[i] = i + 1 + +a[0] = 128 +a[1] = 1 +a[2] = 1 +a[3] = N +stmt = lambda: indirect() +print(timeit.timeit(stmt, stmt, number=10000)) +print(x) +print(a) diff --git a/mrp.py b/mrp.py new file mode 100644 index 0000000000000..03a72e7002c2e --- /dev/null +++ b/mrp.py @@ -0,0 +1,18 @@ +import taichi as ti +import timeit + +ti.init(ti.opengl) + +N = 2**14 + +x = ti.field(int, N) + +@ti.kernel +def func(): + for i in x: + x[i] = i + for i in range(x[2]): + x[i] = i + +stmt = lambda: func() +print(timeit.timeit(stmt, stmt, number=10000)) diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index 7cd6f85f50b72..c7d840ad3004b 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -796,7 +796,6 @@ class KernelGen : public IRVisitor { ScopedGridStrideLoop _gsl(this); emit("if (_sid >= {}) {};", end_value - begin_value, get_return_stmt()); emit("int _itv = {} + _sid * {};", begin_value, 1 /* stmt->step? */); - stmt->body->accept(this); } else { ScopedIndent _s(line_appender_); emit("// range known at runtime"); @@ -812,8 +811,8 @@ class KernelGen : public IRVisitor { emit("int _beg = {}, _end = {};", begin_expr, end_expr); emit("int _itv = _beg + _sid;"); emit("if (_itv >= _end) {};", get_return_stmt()); - stmt->body->accept(this); } + stmt->body->accept(this); if (used_tls) { TI_ASSERT(stmt->tls_epilogue != nullptr); diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index 9cb437b6e3e42..486b63f06f87b 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -209,6 +209,10 @@ struct GLSSBO { check_opengl_error("glBindBufferRange"); } + void as_indirect_buffer() { + glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, id_); + } + void *map(size_t offset, size_t length, GLbitfield access = GL_READ_ONLY) const { @@ -309,6 +313,14 @@ struct GLSLLauncherImpl { ParallelSize::~ParallelSize() { } +bool ParallelSize::is_indirect() const { + return false; +} + +bool ParallelSize_DynamicRange::is_indirect() const { + return true; +} + size_t ParallelSize::get_threads_per_block() const { size_t limit = opengl_threads_per_block; size_t n = threads_per_block.value_or(0); @@ -483,10 +495,6 @@ struct CompiledKernel { } void dispatch_compute(GLSLLauncher *launcher) const { - int num_blocks = ps->get_num_blocks(launcher); - - glsl->use(); - // https://www.khronos.org/opengl/wiki/Compute_Shader // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf @@ -494,8 +502,21 @@ struct CompiledKernel { // `glDispatchCompute(X, Y, Z)` - the X*Y*Z == `Blocks` in CUDA // `layout(local_size_x = X) in;` - the X == `Threads` in CUDA // - glDispatchCompute(num_blocks, 1, 1); - check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks)); + if (/*!taichi::starts_with(kernel_name, "indirect_") && */!ps->is_indirect()) { + int num_blocks = ps->get_num_blocks(launcher); + glsl->use(); + glDispatchCompute(num_blocks, 1, 1); + check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks)); + + } else { + //auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime); + //runtime->as_indirect_buffer(); + auto root = launcher->impl->core_bufs.get(GLBufId::Root); + root->as_indirect_buffer(); + glsl->use(); + glDispatchComputeIndirect(0); // offset of runtime.indirect_x is 0 + check_opengl_error(fmt::format("glDispatchComputeIndirect")); + } glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); check_opengl_error("glMemoryBarrier"); @@ -723,6 +744,14 @@ bool initialize_opengl(bool error_tolerance) { ParallelSize::~ParallelSize() { } +bool ParallelSize::is_indirect() const { + TI_NOT_IMPLEMENTED; +} + +bool ParallelSize_DynamicRange::is_indirect() const { + TI_NOT_IMPLEMENTED; +} + size_t ParallelSize::get_num_threads(GLSLLauncher *launcher) const { TI_NOT_IMPLEMENTED; } diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h index 4e46766992b98..66be118447bac 100644 --- a/taichi/backends/opengl/opengl_api.h +++ b/taichi/backends/opengl/opengl_api.h @@ -44,6 +44,7 @@ class ParallelSize { std::optional strides_per_thread; std::optional threads_per_block; + virtual bool is_indirect() const; virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0; size_t get_num_threads(GLSLLauncher *launcher) const; size_t get_num_blocks(GLSLLauncher *launcher) const; @@ -71,6 +72,7 @@ class ParallelSize_DynamicRange : public ParallelSize { ParallelSize_DynamicRange(OffloadedStmt *stmt); virtual size_t get_num_strides(GLSLLauncher *launcher) const override; virtual ~ParallelSize_DynamicRange() override = default; + virtual bool is_indirect() const override; }; class ParallelSize_StructFor : public ParallelSize { diff --git a/taichi/backends/opengl/shaders/runtime.h b/taichi/backends/opengl/shaders/runtime.h index c717bfbb6bad6..56f2e47b11f7e 100644 --- a/taichi/backends/opengl/shaders/runtime.h +++ b/taichi/backends/opengl/shaders/runtime.h @@ -16,6 +16,9 @@ struct _msg_entry_t { }; layout(std430, binding = 6) buffer runtime { + int _indirect_x_; + int _indirect_y_; + int _indirect_z_; int _rand_state_; int _msg_count_; // TODO: move msg buf to gtmp @@ -46,6 +49,9 @@ struct GLSLMsgEntry { }; struct GLSLRuntime { + int indirect_x; + int indirect_y; + int indirect_z; int rand_state; int msg_count; GLSLMsgEntry msg_buf[MAX_MESSAGES]; From 1c5e4be6735036b4d0c0fc9bae1387d0f10a4cdb Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 15:31:05 +0800 Subject: [PATCH 2/9] tmp2 --- taichi/backends/opengl/opengl_api.cpp | 20 +++++++++++++++---- .../backends/opengl/shaders/indirect.glsl.h | 14 +++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 taichi/backends/opengl/shaders/indirect.glsl.h diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index 486b63f06f87b..22e8700a58740 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -458,6 +458,7 @@ void display_kernel_info(std::string const &kernel_name, taichi::starts_with(kernel_name, "tensor_to_") || taichi::starts_with(kernel_name, "matrix_to_") || taichi::starts_with(kernel_name, "ext_arr_to_") || + //taichi::starts_with(kernel_name, "indirect_evaluator_") || taichi::starts_with(kernel_name, "jit_evaluator_"); if (!is_accessor) TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code); @@ -494,6 +495,16 @@ struct CompiledKernel { glsl->link(); } + static std::unique_ptr dbg_make_indirect_evaluator() { + auto ps = std::make_unique(0); + std::string source = +#include "taichi/backends/opengl/shaders/indirect.glsl.h" + ; + auto ck = std::make_unique("indirect_evaluator_0", + source, std::move(ps)); + return ck; + } + void dispatch_compute(GLSLLauncher *launcher) const { // https://www.khronos.org/opengl/wiki/Compute_Shader // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders @@ -509,10 +520,11 @@ struct CompiledKernel { check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks)); } else { - //auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime); - //runtime->as_indirect_buffer(); - auto root = launcher->impl->core_bufs.get(GLBufId::Root); - root->as_indirect_buffer(); + //auto ie = ps->get_indirect_evaluator(); + auto ie = dbg_make_indirect_evaluator(); + ie->dispatch_compute(launcher); + auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime); + runtime->as_indirect_buffer(); glsl->use(); glDispatchComputeIndirect(0); // offset of runtime.indirect_x is 0 check_opengl_error(fmt::format("glDispatchComputeIndirect")); diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h new file mode 100644 index 0000000000000..f8bbfba43d33b --- /dev/null +++ b/taichi/backends/opengl/shaders/indirect.glsl.h @@ -0,0 +1,14 @@ +// vim: ft=glsl +// clang-format off +#include "taichi/util/macros.h" +"#version 430 core\nprecision highp float;\n" +#define __GLSL__ +#include "taichi/backends/opengl/shaders/runtime.h" +#undef __GLSL__ +STR( +void main() { // indirect parallel size evaluator kernel + _indirect_x_ = 128; + _indirect_y_ = 1; + _indirect_z_ = 1; +} +) From d42c54462a48966b1b14ada5832c87b015c5959a Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 16:00:38 +0800 Subject: [PATCH 3/9] CompiledKernel refactor to PIMPL --- taichi/backends/opengl/opengl_api.cpp | 37 ++++++++++++++++++++------- taichi/backends/opengl/opengl_api.h | 17 ++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index 22e8700a58740..379cc761c9886 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -470,20 +470,15 @@ void display_kernel_info(std::string const &kernel_name, #endif } -struct CompiledKernel { +struct CompiledKernel::Impl { std::string kernel_name; std::unique_ptr glsl; std::unique_ptr ps; std::string source; - // disscussion: - // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527 - CompiledKernel(CompiledKernel &&) = default; - CompiledKernel &operator=(CompiledKernel &&) = default; - - explicit CompiledKernel(const std::string &kernel_name_, - const std::string &kernel_source_code, - std::unique_ptr ps_) + Impl(const std::string &kernel_name_, + const std::string &kernel_source_code, + std::unique_ptr ps_) : kernel_name(kernel_name_), ps(std::move(ps_)) { source = kernel_source_code + @@ -715,6 +710,18 @@ bool is_opengl_api_available() { struct GLProgram {}; struct GLSLLauncherImpl {}; +struct CompiledKernel::Impl { + Impl(const std::string &kernel_name_, + const std::string &kernel_source_code, + std::unique_ptr ps_) { + TI_NOT_IMPLEMENTED; + } + + void dispatch_compute(GLSLLauncher *launcher) const { + TI_NOT_IMPLEMENTED; + } +}; + struct CompiledProgram::Impl { UsedFeature used; @@ -823,6 +830,18 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const { impl->launch(ctx, launcher); } +CompiledKernel::CompiledKernel(const std::string &kernel_name_, + const std::string &kernel_source_code, + std::unique_ptr ps_) + : impl(std::make_unique(kernel_name_, kernel_source_code, std::move(ps_))) { +} + +void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const { + impl->dispatch_compute(launcher); +} + +CompiledKernel::~CompiledKernel() = default; + GLSLLauncher::~GLSLLauncher() = default; } // namespace opengl diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h index 66be118447bac..25f53fa587fd2 100644 --- a/taichi/backends/opengl/opengl_api.h +++ b/taichi/backends/opengl/opengl_api.h @@ -82,6 +82,23 @@ class ParallelSize_StructFor : public ParallelSize { virtual ~ParallelSize_StructFor() override = default; }; +struct CompiledKernel { + struct Impl; + std::unique_ptr impl; + + // disscussion: + // https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527 + CompiledKernel(CompiledKernel &&) = default; + CompiledKernel &operator=(CompiledKernel &&) = default; + + CompiledKernel(const std::string &kernel_name_, + const std::string &kernel_source_code, + std::unique_ptr ps_); + ~CompiledKernel(); + + void dispatch_compute(GLSLLauncher *launcher) const; +}; + struct CompiledProgram { struct Impl; std::unique_ptr impl; From 248546bc9f6673c778cb9f5442a13c9a034b1c32 Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 16:12:48 +0800 Subject: [PATCH 4/9] ps->get_indirect_evaluator --- con.py | 2 +- taichi/backends/opengl/opengl_api.cpp | 28 ++++++++++++++++++++++++--- taichi/backends/opengl/opengl_api.h | 4 ++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/con.py b/con.py index 4bbb18ac90333..f00f95cea57d9 100644 --- a/con.py +++ b/con.py @@ -16,7 +16,7 @@ def indirect(): a[0] = 128 a[1] = 1 a[2] = 1 -a[3] = N +a[3] = N - 1 stmt = lambda: indirect() print(timeit.timeit(stmt, stmt, number=10000)) print(x) diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index 379cc761c9886..f536400170c27 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -321,6 +321,20 @@ bool ParallelSize_DynamicRange::is_indirect() const { return true; } +std::unique_ptr ParallelSize::get_indirect_evaluator() { + return nullptr; +} + +std::unique_ptr ParallelSize_DynamicRange::get_indirect_evaluator() { + auto ps = std::make_unique(0); + std::string source = +#include "taichi/backends/opengl/shaders/indirect.glsl.h" + ; + auto ck = std::make_unique("indirect_evaluator_0", + source, std::move(ps)); + return ck; +} + size_t ParallelSize::get_threads_per_block() const { size_t limit = opengl_threads_per_block; size_t n = threads_per_block.value_or(0); @@ -508,15 +522,15 @@ struct CompiledKernel::Impl { // `glDispatchCompute(X, Y, Z)` - the X*Y*Z == `Blocks` in CUDA // `layout(local_size_x = X) in;` - the X == `Threads` in CUDA // - if (/*!taichi::starts_with(kernel_name, "indirect_") && */!ps->is_indirect()) { + if (!ps->is_indirect()) { int num_blocks = ps->get_num_blocks(launcher); glsl->use(); glDispatchCompute(num_blocks, 1, 1); check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks)); } else { - //auto ie = ps->get_indirect_evaluator(); - auto ie = dbg_make_indirect_evaluator(); + auto ie = ps->get_indirect_evaluator(); + //auto ie = dbg_make_indirect_evaluator(); ie->dispatch_compute(launcher); auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime); runtime->as_indirect_buffer(); @@ -800,6 +814,14 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const { TI_NOT_IMPLEMENTED; } +std::unique_ptr ParallelSize::get_indirect_evaluator() { + TI_NOT_IMPLEMENTED; +} + +std::unique_ptr ParallelSize_DynamicRange::get_indirect_evaluator() { + TI_NOT_IMPLEMENTED; +} + ParallelSize_ConstRange::ParallelSize_ConstRange(size_t num_strides) : num_strides(num_strides) { } diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h index 25f53fa587fd2..ce1501ffd5eeb 100644 --- a/taichi/backends/opengl/opengl_api.h +++ b/taichi/backends/opengl/opengl_api.h @@ -37,6 +37,8 @@ extern int opengl_threads_per_block; return false; \ })() +struct CompiledKernel; + class ParallelSize { // GLSL: stride < invocation < local work group < 'dispatch' // CUDA: stride < thread < block < grid @@ -48,6 +50,7 @@ class ParallelSize { virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0; size_t get_num_threads(GLSLLauncher *launcher) const; size_t get_num_blocks(GLSLLauncher *launcher) const; + virtual std::unique_ptr get_indirect_evaluator(); virtual size_t get_threads_per_block() const; virtual ~ParallelSize(); }; @@ -72,6 +75,7 @@ class ParallelSize_DynamicRange : public ParallelSize { ParallelSize_DynamicRange(OffloadedStmt *stmt); virtual size_t get_num_strides(GLSLLauncher *launcher) const override; virtual ~ParallelSize_DynamicRange() override = default; + virtual std::unique_ptr get_indirect_evaluator() override; virtual bool is_indirect() const override; }; From 5f32be9f671099538da5f7ad4734d6e97352cbdf Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 16:19:20 +0800 Subject: [PATCH 5/9] cache indirect evaluator kernel --- taichi/backends/opengl/opengl_api.cpp | 35 ++++++++++----------------- taichi/backends/opengl/opengl_api.h | 5 ++-- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index f536400170c27..8142425f3d26c 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -321,18 +321,20 @@ bool ParallelSize_DynamicRange::is_indirect() const { return true; } -std::unique_ptr ParallelSize::get_indirect_evaluator() { +CompiledKernel *ParallelSize::get_indirect_evaluator() { return nullptr; } -std::unique_ptr ParallelSize_DynamicRange::get_indirect_evaluator() { - auto ps = std::make_unique(0); - std::string source = +CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() { + if (!indirect_evaluator) { + auto ps = std::make_unique(0); + std::string source = #include "taichi/backends/opengl/shaders/indirect.glsl.h" - ; - auto ck = std::make_unique("indirect_evaluator_0", - source, std::move(ps)); - return ck; + ; + indirect_evaluator = std::make_unique( + "indirect_evaluator_opengl", source, std::move(ps)); + } + return indirect_evaluator.get(); } size_t ParallelSize::get_threads_per_block() const { @@ -472,7 +474,7 @@ void display_kernel_info(std::string const &kernel_name, taichi::starts_with(kernel_name, "tensor_to_") || taichi::starts_with(kernel_name, "matrix_to_") || taichi::starts_with(kernel_name, "ext_arr_to_") || - //taichi::starts_with(kernel_name, "indirect_evaluator_") || + taichi::starts_with(kernel_name, "indirect_evaluator_") || taichi::starts_with(kernel_name, "jit_evaluator_"); if (!is_accessor) TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code); @@ -504,16 +506,6 @@ struct CompiledKernel::Impl { glsl->link(); } - static std::unique_ptr dbg_make_indirect_evaluator() { - auto ps = std::make_unique(0); - std::string source = -#include "taichi/backends/opengl/shaders/indirect.glsl.h" - ; - auto ck = std::make_unique("indirect_evaluator_0", - source, std::move(ps)); - return ck; - } - void dispatch_compute(GLSLLauncher *launcher) const { // https://www.khronos.org/opengl/wiki/Compute_Shader // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders @@ -530,7 +522,6 @@ struct CompiledKernel::Impl { } else { auto ie = ps->get_indirect_evaluator(); - //auto ie = dbg_make_indirect_evaluator(); ie->dispatch_compute(launcher); auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime); runtime->as_indirect_buffer(); @@ -814,11 +805,11 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const { TI_NOT_IMPLEMENTED; } -std::unique_ptr ParallelSize::get_indirect_evaluator() { +CompiledKernel *ParallelSize::get_indirect_evaluator() { TI_NOT_IMPLEMENTED; } -std::unique_ptr ParallelSize_DynamicRange::get_indirect_evaluator() { +CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() { TI_NOT_IMPLEMENTED; } diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/backends/opengl/opengl_api.h index ce1501ffd5eeb..e725ebfcc5dc4 100644 --- a/taichi/backends/opengl/opengl_api.h +++ b/taichi/backends/opengl/opengl_api.h @@ -50,7 +50,7 @@ class ParallelSize { virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0; size_t get_num_threads(GLSLLauncher *launcher) const; size_t get_num_blocks(GLSLLauncher *launcher) const; - virtual std::unique_ptr get_indirect_evaluator(); + virtual CompiledKernel *get_indirect_evaluator(); virtual size_t get_threads_per_block() const; virtual ~ParallelSize(); }; @@ -70,12 +70,13 @@ class ParallelSize_DynamicRange : public ParallelSize { bool const_end; int range_begin; int range_end; + std::unique_ptr indirect_evaluator = nullptr; public: ParallelSize_DynamicRange(OffloadedStmt *stmt); virtual size_t get_num_strides(GLSLLauncher *launcher) const override; virtual ~ParallelSize_DynamicRange() override = default; - virtual std::unique_ptr get_indirect_evaluator() override; + virtual CompiledKernel *get_indirect_evaluator() override; virtual bool is_indirect() const override; }; From 1cee610929ba00270ca0ed4271c7fe1257794b9d Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 16:32:46 +0800 Subject: [PATCH 6/9] _compute_indirect as template --- con.py | 9 +++------ taichi/backends/opengl/opengl_api.cpp | 8 +++++++- taichi/backends/opengl/shaders/indirect.glsl.h | 7 ++++++- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/con.py b/con.py index f00f95cea57d9..898f5fc56c1ec 100644 --- a/con.py +++ b/con.py @@ -5,18 +5,15 @@ N = 2**14 -a = ti.field(int, 4) +a = ti.field(int, ()) x = ti.field(int, N) @ti.kernel def indirect(): - for i in range(a[3]): + for i in range(a[None]): x[i] = i + 1 -a[0] = 128 -a[1] = 1 -a[2] = 1 -a[3] = N - 1 +a[None] = N - 1 stmt = lambda: indirect() print(timeit.timeit(stmt, stmt, number=10000)) print(x) diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index 8142425f3d26c..e62149ab42fdd 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -328,8 +328,14 @@ CompiledKernel *ParallelSize::get_indirect_evaluator() { CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() { if (!indirect_evaluator) { auto ps = std::make_unique(0); + size_t SPT = strides_per_thread.value_or(1); + size_t TPG = ParallelSize::get_threads_per_block(); std::string source = #include "taichi/backends/opengl/shaders/indirect.glsl.h" + + fmt::format("\nvoid main() {{\n" + " _compute_indirect({}, {}, {}, {}, {}, {});\n" + "}}\n", (int)const_begin, (int)const_end, range_begin, range_end, + SPT, TPG); ; indirect_evaluator = std::make_unique( "indirect_evaluator_opengl", source, std::move(ps)); @@ -352,7 +358,7 @@ size_t ParallelSize_ConstRange::get_num_strides(GLSLLauncher *launcher) const { } size_t ParallelSize_ConstRange::get_threads_per_block() const { - size_t n = get_num_threads(nullptr); + size_t n = get_num_threads(nullptr); // TODO: clean up these (iapr) size_t TPG = ParallelSize::get_threads_per_block(); return std::max(std::min(n, TPG), (size_t)1); } diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h index f8bbfba43d33b..14ef558302e33 100644 --- a/taichi/backends/opengl/shaders/indirect.glsl.h +++ b/taichi/backends/opengl/shaders/indirect.glsl.h @@ -6,9 +6,14 @@ #include "taichi/backends/opengl/shaders/runtime.h" #undef __GLSL__ STR( -void main() { // indirect parallel size evaluator kernel +void _compute_indirect( + int const_begin, int const_end, + int gtmp_begin, int range_end, + int SPT, int TPG) { + // indirect work group size evaluator kernel _indirect_x_ = 128; _indirect_y_ = 1; _indirect_z_ = 1; } +// get_indirect_evaluator() will prepend a main here, with template arguments ) From 8e754c2d1c6d7eca50670eb0ca1b84767d1ed8f2 Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 16:41:29 +0800 Subject: [PATCH 7/9] according to gtmp --- .../backends/opengl/shaders/indirect.glsl.h | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h index 14ef558302e33..dd4580c478ddc 100644 --- a/taichi/backends/opengl/shaders/indirect.glsl.h +++ b/taichi/backends/opengl/shaders/indirect.glsl.h @@ -6,14 +6,34 @@ #include "taichi/backends/opengl/shaders/runtime.h" #undef __GLSL__ STR( +// taichi uses gtmp for storing dynamic range endpoints +layout(std430, binding = 1) buffer gtmp_i32 { int _gtmp_i32_[]; }; + +// indirect work group size evaluator kernel template void _compute_indirect( int const_begin, int const_end, - int gtmp_begin, int range_end, + int range_begin, int range_end, int SPT, int TPG) { - // indirect work group size evaluator kernel - _indirect_x_ = 128; + + // dynamic range for + if (const_begin == 0) { + range_begin = _gtmp_i32_[range_begin >> 2]; + } + if (const_end == 0) { + range_end = _gtmp_i32_[range_end >> 2]; + } + int nstrides = 1; + if (range_end > range_begin) { + nstrides = range_end - range_begin; + } + + int nthreads = max((nstrides + SPT - 1) / SPT, 1); + int nblocks = max((nthreads + TPG - 1) / TPG, 1); + + _indirect_x_ = nblocks; _indirect_y_ = 1; _indirect_z_ = 1; } + // get_indirect_evaluator() will prepend a main here, with template arguments ) From 8654287d57e9c13a44dabb1ae1ca90761dedbc3b Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 30 Oct 2020 16:50:01 +0800 Subject: [PATCH 8/9] clean up --- con.py | 20 -------------------- mrp.py | 18 ------------------ taichi/backends/opengl/codegen_opengl.cpp | 3 ++- 3 files changed, 2 insertions(+), 39 deletions(-) delete mode 100644 con.py delete mode 100644 mrp.py diff --git a/con.py b/con.py deleted file mode 100644 index 898f5fc56c1ec..0000000000000 --- a/con.py +++ /dev/null @@ -1,20 +0,0 @@ -import taichi as ti -import timeit - -ti.init(ti.opengl, log_level=ti.DEBUG) - -N = 2**14 - -a = ti.field(int, ()) -x = ti.field(int, N) - -@ti.kernel -def indirect(): - for i in range(a[None]): - x[i] = i + 1 - -a[None] = N - 1 -stmt = lambda: indirect() -print(timeit.timeit(stmt, stmt, number=10000)) -print(x) -print(a) diff --git a/mrp.py b/mrp.py deleted file mode 100644 index 03a72e7002c2e..0000000000000 --- a/mrp.py +++ /dev/null @@ -1,18 +0,0 @@ -import taichi as ti -import timeit - -ti.init(ti.opengl) - -N = 2**14 - -x = ti.field(int, N) - -@ti.kernel -def func(): - for i in x: - x[i] = i - for i in range(x[2]): - x[i] = i - -stmt = lambda: func() -print(timeit.timeit(stmt, stmt, number=10000)) diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index c7d840ad3004b..7cd6f85f50b72 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -796,6 +796,7 @@ class KernelGen : public IRVisitor { ScopedGridStrideLoop _gsl(this); emit("if (_sid >= {}) {};", end_value - begin_value, get_return_stmt()); emit("int _itv = {} + _sid * {};", begin_value, 1 /* stmt->step? */); + stmt->body->accept(this); } else { ScopedIndent _s(line_appender_); emit("// range known at runtime"); @@ -811,8 +812,8 @@ class KernelGen : public IRVisitor { emit("int _beg = {}, _end = {};", begin_expr, end_expr); emit("int _itv = _beg + _sid;"); emit("if (_itv >= _end) {};", get_return_stmt()); + stmt->body->accept(this); } - stmt->body->accept(this); if (used_tls) { TI_ASSERT(stmt->tls_epilogue != nullptr); From d3ba27285532a90c583a66cc730dc0640846419f Mon Sep 17 00:00:00 2001 From: Taichi Gardener Date: Fri, 30 Oct 2020 05:43:11 -0400 Subject: [PATCH 9/9] [skip ci] enforce code format --- taichi/backends/opengl/opengl_api.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp index e62149ab42fdd..7fe288eabb4c9 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/backends/opengl/opengl_api.cpp @@ -332,11 +332,12 @@ CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() { size_t TPG = ParallelSize::get_threads_per_block(); std::string source = #include "taichi/backends/opengl/shaders/indirect.glsl.h" - + fmt::format("\nvoid main() {{\n" - " _compute_indirect({}, {}, {}, {}, {}, {});\n" - "}}\n", (int)const_begin, (int)const_end, range_begin, range_end, - SPT, TPG); - ; + +fmt::format( + "\nvoid main() {{\n" + " _compute_indirect({}, {}, {}, {}, {}, {});\n" + "}}\n", + (int)const_begin, (int)const_end, range_begin, range_end, SPT, TPG); + ; indirect_evaluator = std::make_unique( "indirect_evaluator_opengl", source, std::move(ps)); } @@ -850,9 +851,11 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const { } CompiledKernel::CompiledKernel(const std::string &kernel_name_, - const std::string &kernel_source_code, - std::unique_ptr ps_) - : impl(std::make_unique(kernel_name_, kernel_source_code, std::move(ps_))) { + const std::string &kernel_source_code, + std::unique_ptr ps_) + : impl(std::make_unique(kernel_name_, + kernel_source_code, + std::move(ps_))) { } void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const {