Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenGL] [perf] Utilize glDispatchComputeIndirect to prevent sync when dynamic ranges are used #2007

Merged
merged 10 commits into from
Oct 30, 2020
Merged
114 changes: 98 additions & 16 deletions taichi/backends/opengl/opengl_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ struct GLSSBO {
check_opengl_error("glBindBufferRange");
}

void as_indirect_buffer() {
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, id_);
}

void *map(size_t offset,
size_t length,
GLbitfield access = GL_READ_ONLY) const {
Expand Down Expand Up @@ -309,6 +313,37 @@ struct GLSLLauncherImpl {
// Out-of-line definition of the virtual destructor declared on ParallelSize;
// there is nothing to release explicitly.
ParallelSize::~ParallelSize() = default;

// Base-class default: not indirect, so dispatch_compute takes the plain
// glDispatchCompute path with a block count obtained from get_num_blocks().
bool ParallelSize::is_indirect() const {
return false;
}

// Dynamic ranges are indirect: dispatch_compute first runs the indirect
// evaluator kernel and then launches through glDispatchComputeIndirect.
bool ParallelSize_DynamicRange::is_indirect() const {
return true;
}

// Base-class default: there is no indirect evaluator kernel, so nullptr is
// returned.
CompiledKernel *ParallelSize::get_indirect_evaluator() {
return nullptr;
}

// Returns the helper kernel that evaluates this dynamic range's work group
// count on the GPU. The kernel is constructed on the first call and the same
// instance is handed back on every later call; ownership stays with this
// object.
CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
  if (indirect_evaluator) {
    return indirect_evaluator.get();
  }

  const size_t spt = strides_per_thread.value_or(1);
  const size_t tpg = ParallelSize::get_threads_per_block();

  // main() that forwards this range's parameters to _compute_indirect(),
  // which comes from the shader template included below.
  const std::string entry_point = fmt::format(
      "\nvoid main() {{\n"
      " _compute_indirect({}, {}, {}, {}, {}, {});\n"
      "}}\n",
      static_cast<int>(const_begin), static_cast<int>(const_end), range_begin,
      range_end, spt, tpg);

  // The included header expands to a string literal holding the template.
  const std::string shader_source =
#include "taichi/backends/opengl/shaders/indirect.glsl.h"
      + entry_point;

  // The evaluator itself is given a constant range of 0 strides.
  indirect_evaluator = std::make_unique<CompiledKernel>(
      "indirect_evaluator_opengl", shader_source,
      std::make_unique<ParallelSize_ConstRange>(0));
  return indirect_evaluator.get();
}

size_t ParallelSize::get_threads_per_block() const {
size_t limit = opengl_threads_per_block;
size_t n = threads_per_block.value_or(0);
Expand All @@ -324,7 +359,7 @@ size_t ParallelSize_ConstRange::get_num_strides(GLSLLauncher *launcher) const {
}

// Threads per work group for a constant range: the base-class value, capped
// at the total number of threads of this range and never less than 1.
size_t ParallelSize_ConstRange::get_threads_per_block() const {
  // The thread count is queried with a null launcher here.
  const size_t n = get_num_threads(nullptr);  // TODO: clean up these (iapr)
  const size_t TPG = ParallelSize::get_threads_per_block();
  return std::max(std::min(n, TPG), static_cast<size_t>(1));
}
Expand Down Expand Up @@ -446,6 +481,7 @@ void display_kernel_info(std::string const &kernel_name,
taichi::starts_with(kernel_name, "tensor_to_") ||
taichi::starts_with(kernel_name, "matrix_to_") ||
taichi::starts_with(kernel_name, "ext_arr_to_") ||
taichi::starts_with(kernel_name, "indirect_evaluator_") ||
taichi::starts_with(kernel_name, "jit_evaluator_");
if (!is_accessor)
TI_DEBUG("source of kernel [{}]:\n{}", kernel_name, kernel_source_code);
Expand All @@ -457,20 +493,15 @@ void display_kernel_info(std::string const &kernel_name,
#endif
}

struct CompiledKernel {
struct CompiledKernel::Impl {
std::string kernel_name;
std::unique_ptr<GLProgram> glsl;
std::unique_ptr<ParallelSize> ps;
std::string source;

// discussion:
// https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
CompiledKernel(CompiledKernel &&) = default;
CompiledKernel &operator=(CompiledKernel &&) = default;

explicit CompiledKernel(const std::string &kernel_name_,
const std::string &kernel_source_code,
std::unique_ptr<ParallelSize> ps_)
Impl(const std::string &kernel_name_,
const std::string &kernel_source_code,
std::unique_ptr<ParallelSize> ps_)
: kernel_name(kernel_name_), ps(std::move(ps_)) {
source =
kernel_source_code +
Expand All @@ -483,19 +514,28 @@ struct CompiledKernel {
}

void dispatch_compute(GLSLLauncher *launcher) const {
int num_blocks = ps->get_num_blocks(launcher);

glsl->use();

// https://www.khronos.org/opengl/wiki/Compute_Shader
// https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
// https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf
//
// `glDispatchCompute(X, Y, Z)` - the X*Y*Z == `Blocks` in CUDA
// `layout(local_size_x = X) in;` - the X == `Threads` in CUDA
//
glDispatchCompute(num_blocks, 1, 1);
check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));
if (!ps->is_indirect()) {
int num_blocks = ps->get_num_blocks(launcher);
glsl->use();
glDispatchCompute(num_blocks, 1, 1);
check_opengl_error(fmt::format("glDispatchCompute({})", num_blocks));

} else {
auto ie = ps->get_indirect_evaluator();
ie->dispatch_compute(launcher);
auto runtime = launcher->impl->core_bufs.get(GLBufId::Runtime);
runtime->as_indirect_buffer();
glsl->use();
glDispatchComputeIndirect(0); // offset of runtime.indirect_x is 0
check_opengl_error(fmt::format("glDispatchComputeIndirect"));
}

glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
check_opengl_error("glMemoryBarrier");
Expand Down Expand Up @@ -682,6 +722,18 @@ bool is_opengl_api_available() {
// Empty placeholder definitions of the two GL helper types.
// NOTE(review): presumably this is the branch compiled without OpenGL
// support - confirm against the surrounding preprocessor guards.
struct GLProgram {};
struct GLSLLauncherImpl {};

// Stub implementation of CompiledKernel::Impl: both the constructor and
// dispatch_compute() only expand TI_NOT_IMPLEMENTED and ignore their
// arguments.
// NOTE(review): presumably this is the branch compiled without OpenGL
// support - confirm against the surrounding preprocessor guards.
struct CompiledKernel::Impl {
Impl(const std::string &kernel_name_,
const std::string &kernel_source_code,
std::unique_ptr<ParallelSize> ps_) {
TI_NOT_IMPLEMENTED;
}

void dispatch_compute(GLSLLauncher *launcher) const {
TI_NOT_IMPLEMENTED;
}
};

struct CompiledProgram::Impl {
UsedFeature used;

Expand Down Expand Up @@ -723,6 +775,14 @@ bool initialize_opengl(bool error_tolerance) {
// Out-of-line definition of the virtual destructor declared on ParallelSize;
// there is nothing to release explicitly.
ParallelSize::~ParallelSize() = default;

// Stub definition: expands TI_NOT_IMPLEMENTED instead of returning a value.
bool ParallelSize::is_indirect() const {
TI_NOT_IMPLEMENTED;
}

// Stub definition: expands TI_NOT_IMPLEMENTED instead of returning a value.
bool ParallelSize_DynamicRange::is_indirect() const {
TI_NOT_IMPLEMENTED;
}

// Stub definition: expands TI_NOT_IMPLEMENTED; the launcher argument is
// unused.
size_t ParallelSize::get_num_threads(GLSLLauncher *launcher) const {
TI_NOT_IMPLEMENTED;
}
Expand Down Expand Up @@ -752,6 +812,14 @@ size_t ParallelSize_StructFor::get_num_strides(GLSLLauncher *launcher) const {
TI_NOT_IMPLEMENTED;
}

// Stub definition: expands TI_NOT_IMPLEMENTED instead of returning a kernel.
CompiledKernel *ParallelSize::get_indirect_evaluator() {
TI_NOT_IMPLEMENTED;
}

// Stub definition: expands TI_NOT_IMPLEMENTED instead of returning a kernel.
CompiledKernel *ParallelSize_DynamicRange::get_indirect_evaluator() {
TI_NOT_IMPLEMENTED;
}

// Stores the given stride count in the num_strides member. The parameter
// shadows the member, so the initializer reads member(parameter).
ParallelSize_ConstRange::ParallelSize_ConstRange(size_t num_strides)
: num_strides(num_strides) {
}
Expand Down Expand Up @@ -782,6 +850,20 @@ void CompiledProgram::launch(Context &ctx, GLSLLauncher *launcher) const {
impl->launch(ctx, launcher);
}

// Constructs the kernel by forwarding all arguments to a heap-allocated Impl.
// Ownership of ps_ is moved into that Impl.
CompiledKernel::CompiledKernel(const std::string &kernel_name_,
const std::string &kernel_source_code,
std::unique_ptr<ParallelSize> ps_)
: impl(std::make_unique<Impl>(kernel_name_,
kernel_source_code,
std::move(ps_))) {
}

// Forwards the dispatch request to the Impl object.
void CompiledKernel::dispatch_compute(GLSLLauncher *launcher) const {
impl->dispatch_compute(launcher);
}

// Defaulted in this source file rather than in the header: the header only
// forward-declares Impl, which CompiledKernel holds via std::unique_ptr<Impl>.
CompiledKernel::~CompiledKernel() = default;

// Destructor defaulted in this source file.
// NOTE(review): presumably kept out of the header because GLSLLauncher owns
// an implementation type that is only complete here - confirm against the
// GLSLLauncher declaration.
GLSLLauncher::~GLSLLauncher() = default;

} // namespace opengl
Expand Down
24 changes: 24 additions & 0 deletions taichi/backends/opengl/opengl_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,20 @@ extern int opengl_threads_per_block;
return false; \
})()

struct CompiledKernel;

// Base class describing the launch size of a kernel in terms of strides,
// threads and blocks.
class ParallelSize {
// GLSL: stride < invocation < local work group < 'dispatch'
// CUDA: stride < thread < block < grid
public:
// Optional overrides; the code reading them falls back to a default through
// value_or() when they are unset.
std::optional<size_t> strides_per_thread;
std::optional<size_t> threads_per_block;

// Whether the kernel is launched through glDispatchComputeIndirect. The base
// implementation returns false.
virtual bool is_indirect() const;
// Number of strides; must be provided by every concrete subclass.
virtual size_t get_num_strides(GLSLLauncher *launcher) const = 0;
size_t get_num_threads(GLSLLauncher *launcher) const;
size_t get_num_blocks(GLSLLauncher *launcher) const;
// Kernel that evaluates the block count for indirect dispatch. The base
// implementation returns nullptr.
virtual CompiledKernel *get_indirect_evaluator();
virtual size_t get_threads_per_block() const;
virtual ~ParallelSize();
};
Expand All @@ -66,11 +70,14 @@ class ParallelSize_DynamicRange : public ParallelSize {
bool const_end;
int range_begin;
int range_end;
std::unique_ptr<CompiledKernel> indirect_evaluator = nullptr;

public:
ParallelSize_DynamicRange(OffloadedStmt *stmt);
virtual size_t get_num_strides(GLSLLauncher *launcher) const override;
virtual ~ParallelSize_DynamicRange() override = default;
virtual CompiledKernel *get_indirect_evaluator() override;
virtual bool is_indirect() const override;
};

class ParallelSize_StructFor : public ParallelSize {
Expand All @@ -80,6 +87,23 @@ class ParallelSize_StructFor : public ParallelSize {
virtual ~ParallelSize_StructFor() override = default;
};

// A single compute kernel. All state lives in Impl, which is only
// forward-declared here and defined in the source file.
struct CompiledKernel {
struct Impl;
std::unique_ptr<Impl> impl;

// discussion:
// https://github.com/taichi-dev/taichi/pull/696#issuecomment-609332527
CompiledKernel(CompiledKernel &&) = default;
CompiledKernel &operator=(CompiledKernel &&) = default;

// Builds the kernel from its name and source code; takes ownership of ps_.
CompiledKernel(const std::string &kernel_name_,
const std::string &kernel_source_code,
std::unique_ptr<ParallelSize> ps_);
// Declared here and defined out of line, because Impl is incomplete in this
// header.
~CompiledKernel();

// Launches the kernel; forwards to Impl::dispatch_compute.
void dispatch_compute(GLSLLauncher *launcher) const;
};

struct CompiledProgram {
struct Impl;
std::unique_ptr<Impl> impl;
Expand Down
39 changes: 39 additions & 0 deletions taichi/backends/opengl/shaders/indirect.glsl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// vim: ft=glsl
// clang-format off
#include "taichi/util/macros.h"
"#version 430 core\nprecision highp float;\n"
#define __GLSL__
#include "taichi/backends/opengl/shaders/runtime.h"
#undef __GLSL__
STR(
// taichi uses gtmp for storing dynamic range endpoints
layout(std430, binding = 1) buffer gtmp_i32 { int _gtmp_i32_[]; };

// Indirect work group size evaluator kernel template.
// Computes the number of blocks for a range-for and stores it in the
// _indirect_x_ field of the runtime buffer (y and z are set to 1).
// A const_begin / const_end flag of 0 means the matching endpoint is dynamic
// and is loaded from gtmp at index (range_* >> 2); otherwise range_* is used
// as the endpoint itself.
// SPT = strides per thread, TPG = threads per work group.
void _compute_indirect(
    int const_begin, int const_end,
    int range_begin, int range_end,
    int SPT, int TPG) {

  // Resolve both endpoints; only the selected branch of each ternary runs.
  int begin_value = (const_begin == 0) ? _gtmp_i32_[range_begin >> 2] : range_begin;
  int end_value = (const_end == 0) ? _gtmp_i32_[range_end >> 2] : range_end;

  // An empty or inverted range still counts as one stride.
  int nstrides = (end_value > begin_value) ? (end_value - begin_value) : 1;

  // Rounded-up divisions, each kept at 1 or more.
  int nthreads = max((nstrides + SPT - 1) / SPT, 1);
  int nblocks = max((nthreads + TPG - 1) / TPG, 1);

  _indirect_x_ = nblocks;
  _indirect_y_ = 1;
  _indirect_z_ = 1;
}

// get_indirect_evaluator() will append a main() here, with template arguments
)
6 changes: 6 additions & 0 deletions taichi/backends/opengl/shaders/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ struct _msg_entry_t {
};

layout(std430, binding = 6) buffer runtime {
int _indirect_x_;
int _indirect_y_;
int _indirect_z_;
int _rand_state_;
int _msg_count_;
// TODO: move msg buf to gtmp
Expand Down Expand Up @@ -46,6 +49,9 @@ struct GLSLMsgEntry {
};

struct GLSLRuntime {
int indirect_x;
int indirect_y;
int indirect_z;
int rand_state;
int msg_count;
GLSLMsgEntry msg_buf[MAX_MESSAGES];
Expand Down