From dd6da1f516b40dd64b9a536adcaa0cbc2b8f0947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E4=BA=8E=E6=96=8C?= <17721388340@163.com> Date: Thu, 27 Feb 2020 10:35:32 -0600 Subject: [PATCH] OpenGL backend (stage 1) (#535) * [skip ci] basic start up for opengl [skip ci] test inout data[3] success [skip ci] try to pass kernel arguments * [skip ci] use a struct_metal style struct compiler * [skip ci] basic allocator * [skip ci] basic read/write tensor fix bug x[4] -> x[4 >> 2] = x[1] * adding f32&f64 support for opengl remove opengl from all_archs to prevent test * Update test_abs.py * Update TaichiCore.cmake for TI_WITH_OPENGL optional TI_WITH_OPENGL ON (detect auto OFF) TI_WITH_OPENGL guard opengl_api.cpp fix typo * merge g1 with master fix: x86_64 -> x64 Update profiler.cpp * replace opengl_unary_op_type_symbol with unary_op_type_name * snode_io_arch -> snode_accessor_arch * use GLEW_LIBRARY_DIR TI_WITH_OPENGL=OFF by default TI_WITH_OPENGL default to ON * no more USE_GLEW macro * no more mallocs in SSBO * remove opengl_backend.py, add comment in mpm99 for ti.opengl * fix memory leakage using std::vector * better cmake: find GLFW and glfw3 * support arch: uncomment with_opengl * initialize_opengl in materialize_layout fix typo Co-authored-by: Yuanming Hu --- cmake/TaichiCore.cmake | 25 + examples/mpm99.py | 2 +- python/taichi/lang/__init__.py | 3 + taichi/backends/codegen_opengl.cpp | 518 ++++++++++++++++++++ taichi/backends/codegen_opengl.h | 40 ++ taichi/backends/struct_opengl.cpp | 91 ++++ taichi/backends/struct_opengl.h | 40 ++ taichi/platform/opengl/opengl_api.cpp | 289 +++++++++++ taichi/platform/opengl/opengl_api.h | 21 + taichi/platform/opengl/opengl_data_types.h | 33 ++ taichi/platform/opengl/opengl_kernel.cpp | 33 ++ taichi/platform/opengl/opengl_kernel.h | 39 ++ taichi/platform/opengl/opengl_kernel_util.h | 29 ++ taichi/profiler.cpp | 2 +- taichi/program.cpp | 39 +- taichi/program.h | 4 + taichi/python/export_misc.cpp | 2 + test_abs.py | 35 ++ 18 files changed, 1237 insertions(+), 8 deletions(-) create mode 100644 taichi/backends/codegen_opengl.cpp create mode 100644 taichi/backends/codegen_opengl.h create mode 100644 taichi/backends/struct_opengl.cpp create mode 100644 taichi/backends/struct_opengl.h create mode 100644 taichi/platform/opengl/opengl_api.cpp create mode 100644 taichi/platform/opengl/opengl_api.h create mode 100644 taichi/platform/opengl/opengl_data_types.h create mode 100644 taichi/platform/opengl/opengl_kernel.cpp create mode 100644 taichi/platform/opengl/opengl_kernel.h create mode 100644 taichi/platform/opengl/opengl_kernel_util.h create mode 100644 test_abs.py diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 803b7c79fc601..e6c109cf1e799 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -24,6 +24,8 @@ endif() option(USE_STDCPP "Use -stdlib=libc++" OFF) option(TI_WITH_CUDA "Build with CUDA support" OFF) +option(TI_WITH_OPENGL "Build with OpenGL backend" ON) +option(GLEW_USE_STATIC_LIBS OFF) include_directories(${CMAKE_SOURCE_DIR}) include_directories(external/xxhash) @@ -53,6 +55,29 @@ if (TI_WITH_CUDA) endif() endif() +if (TI_WITH_OPENGL) + if(NOT GLEW_VERSION) + set(GLEW_VERSION 2.0.0) + endif() + find_package(GLEW ${GLEW_VERSION}) + if (GLEW_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_OPENGL") + message("Building with GLEW ${GLEW_VERSION}") + message("Using GLEW: ${GLEW_LIBRARIES}") + target_include_directories(${LIBRARY_NAME} PUBLIC ${GLEW_INCLUDE_DIRS}) + target_link_libraries(${LIBRARY_NAME} ${GLEW_LIBRARIES} GLEW) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGLEW_STATIC") + find_package(glfw3 REQUIRED) + if (NOT glfw3_FOUND) + message(FATAL_ERROR "glfw3 not found.") + endif() + message("Building with glfw ${glfw3_VERSION}") + target_link_libraries(${LIBRARY_NAME} glfw) + else() + message(WARNING "GLEW not found, ignoring TI_WITH_OPENGL.") + endif() +endif() + # http://llvm.org/docs/CMake.html#embedding-llvm-in-your-project find_package(LLVM REQUIRED CONFIG 8.0) message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") diff --git a/examples/mpm99.py b/examples/mpm99.py index 1bb23917beab1..023af5b5a9c10 100644 --- a/examples/mpm99.py +++ b/examples/mpm99.py @@ -1,5 +1,5 @@ import taichi as ti -ti.init(arch=ti.cuda) # Try to run on GPU +ti.init(arch=ti.cuda) # Try to run on GPU. Use arch=ti.opengl on old GPUs quality = 1 # Use a larger value for higher-res simulations n_particles, n_grid = 9000 * quality ** 2, 128 * quality dx, inv_dx = 1 / n_grid, float(n_grid) diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 27d9944196bac..b1bc4b9036c11 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -29,6 +29,7 @@ x64 = core.x64 cuda = core.cuda metal = core.metal +opengl = core.opengl profiler_print = lambda: core.get_current_program().profiler_print() profiler_clear = lambda: core.get_current_program().profiler_clear() profiler_start = lambda n: core.get_current_program().profiler_start(n) @@ -222,6 +223,8 @@ def supported_archs(): archs.append(cuda) if ti.core.with_metal(): archs.append(metal) + if ti.core.with_opengl(): + archs.append(opengl) return archs class _ArchCheckers(object): diff --git a/taichi/backends/codegen_opengl.cpp b/taichi/backends/codegen_opengl.cpp new file mode 100644 index 0000000000000..ae8579695edc6 --- /dev/null +++ b/taichi/backends/codegen_opengl.cpp @@ -0,0 +1,518 @@ +#include "codegen_opengl.h" +#include +#include +#include + +#include +#include + +TLANG_NAMESPACE_BEGIN +namespace opengl { +namespace { + +class KernelGen : public IRVisitor +{ + Kernel *kernel; + +public: + KernelGen(Kernel *kernel, std::string kernel_name, + const StructCompiledResult *struct_compiled) + : kernel(kernel), + struct_compiled_(struct_compiled), + kernel_name_(kernel_name), + glsl_kernel_prefix_(kernel_name) + { + allow_undefined_visitor = true; + invoke_default_visitor = true; + } + +private: // {{{ + std::string kernel_src_code_; + std::string indent_; + bool is_top_level_{true}; + + const StructCompiledResult *struct_compiled_; + const SNode *root_snode_; + GetRootStmt *root_stmt_; + std::string kernel_name_; + std::string glsl_kernel_name_; + std::string root_snode_type_name_; + std::string glsl_kernel_prefix_; + int glsl_kernel_count_{0}; + + void push_indent() + { + indent_ += " "; + } + + void pop_indent() + { + indent_.pop_back(); + indent_.pop_back(); + } + + template + void emit(std::string f, Args &&... args) + { + kernel_src_code_ += + indent_ + fmt::format(f, std::forward(args)...) + "\n"; + } + + void generate_header() + { + emit("#version 430 core"); + emit("#extension GL_ARB_compute_shader: enable"); + emit("{}", struct_compiled_->source_code); + emit("layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;"); + emit("#define NARGS {}", taichi_max_num_args); + emit("layout(std430, binding = 0) buffer args_i32"); + emit("{{"); + emit(" int _args_i32_[NARGS * 2];"); + emit("}};"); + emit("layout(std430, binding = 0) buffer args_f32"); + emit("{{"); + emit(" float _args_f32_[NARGS * 2];"); + emit("}};"); + emit("layout(std430, binding = 0) buffer args_f64"); + emit("{{"); + emit(" double _args_f64_[NARGS];"); + emit("}};"); + emit("layout(std430, binding = 1) buffer data_i32"); + emit("{{"); + emit(" int _data_i32_[];"); + emit("}};"); + emit("layout(std430, binding = 1) buffer data_f32"); + emit("{{"); + emit(" float _data_f32_[];"); + emit("}};"); + emit("layout(std430, binding = 1) buffer data_f64"); + emit("{{"); + emit(" double _data_f64_[];"); + emit("}};"); + emit("#define _arg_i32(x) _args_i32_[(x) << 1]"); // skip to 64bit stride + emit("#define _arg_f32(x) _args_f32_[(x) << 1]"); + emit("#define _arg_i64(x) _args_i64_[(x) << 0]"); + emit("#define _arg_f64(x) _args_f64_[(x) << 0]"); + emit("#define _mem_i32(x) _data_i32_[(x) >> 2]"); + emit("#define _mem_f32(x) _data_f32_[(x) >> 2]"); + emit("#define _mem_i64(x) _data_i64_[(x) >> 3]"); + emit("#define _mem_f64(x) _data_f64_[(x) >> 3]"); + emit(""); + } + + void generate_bottom() + { + // TODO(archibate): () really necessary? How about just main()? + emit("void main()"); + emit("{{"); + emit(" {}();", glsl_kernel_name_); + emit("}}"); + } + + void visit(Block *stmt) override + { + if (!is_top_level_) push_indent(); + for (auto &s : stmt->statements) { + //TI_INFO("visiting sub stmt {}", typeid(*s).name()); + s->accept(this); + } + if (!is_top_level_) pop_indent(); + } + + void visit(LinearizeStmt *stmt) override + { + std::string val = "0"; + for (int i = 0; i < (int)stmt->inputs.size(); i++) { + val = fmt::format("({} * {} + {})", val, stmt->strides[i], + stmt->inputs[i]->raw_name()); + } + emit("const int {} = {};", stmt->raw_name(), val); + } + + void visit(OffsetAndExtractBitsStmt *stmt) override + { + emit("int {} = ((({} + {}) >> {}) & ((1 << {}) - 1));", + stmt->raw_name(), stmt->offset, stmt->input->raw_name(), + stmt->bit_begin, stmt->bit_end - stmt->bit_begin); + } + + void visit(GetRootStmt *stmt) override + { + // Should we assert |root_stmt_| is assigned only once? + root_stmt_ = stmt; + emit("{} {} = 0;", root_snode_type_name_, stmt->raw_name()); + } + + void visit(SNodeLookupStmt *stmt) override + { + Stmt *parent; + std::string parent_type; + if (stmt->input_snode) { + parent = stmt->input_snode; + parent_type = stmt->snode->node_type_name; + } else { + TI_ASSERT(root_stmt_ != nullptr); + parent = root_stmt_; + parent_type = root_snode_type_name_; + } + + emit("{}_ch {} = {}_children({}, {});", stmt->snode->node_type_name, + stmt->raw_name(), parent_type, parent->raw_name(), + stmt->input_index->raw_name()); + } + + void visit(GetChStmt *stmt) override + { + if (stmt->output_snode->is_place()) { + emit("{} /* place {} */ {} = {}_get{}({});", + stmt->output_snode->node_type_name, + opengl_data_type_name(stmt->output_snode->dt), + stmt->raw_name(), stmt->input_snode->node_type_name, + stmt->chid, stmt->input_ptr->raw_name()); + } else { + emit("{} {} = {}_get{}({});", stmt->output_snode->node_type_name, + stmt->raw_name(), stmt->input_snode->node_type_name, + stmt->chid, stmt->input_ptr->raw_name()); + } + } + + void visit(GlobalStoreStmt *stmt) override + { + TI_ASSERT(stmt->width() == 1); + emit("_mem_{}({}) = {};", data_type_short_name(stmt->element_type()), + stmt->ptr->raw_name(), stmt->data->raw_name()); + } + + void visit(GlobalLoadStmt *stmt) override + { + TI_ASSERT(stmt->width() == 1); + emit("{} {} = _mem_{}({});", opengl_data_type_name(stmt->element_type()), + stmt->raw_name(), data_type_short_name(stmt->element_type()), stmt->ptr->raw_name()); + } + + void visit(UnaryOpStmt *stmt) override + { + if (stmt->op_type != UnaryOpType::cast) { + emit("const {} {} = {}({});", opengl_data_type_name(stmt->element_type()), + stmt->raw_name(), unary_op_type_name(stmt->op_type), + stmt->operand->raw_name()); + } else { + // cast + if (stmt->cast_by_value) { + emit("const {} {} = {}({});", + opengl_data_type_name(stmt->element_type()), stmt->raw_name(), + opengl_data_type_name(stmt->cast_type), stmt->operand->raw_name()); + } else { + TI_NOT_IMPLEMENTED; + } + } + } + + void visit(BinaryOpStmt *bin) override + { + const auto dt_name = opengl_data_type_name(bin->element_type()); + const auto lhs_name = bin->lhs->raw_name(); + const auto rhs_name = bin->rhs->raw_name(); + const auto bin_name = bin->raw_name(); + if (bin->op_type == BinaryOpType::floordiv) { + if (is_integral(bin->element_type())) { + emit("const {} {} = int(floor({} / {}));", dt_name, bin_name, lhs_name, + rhs_name); + } else { + emit("const {} {} = floor({} / {});", dt_name, bin_name, lhs_name, + rhs_name); + } + return; + } + const auto binop = binary_op_type_symbol(bin->op_type); + if (is_opengl_binary_op_infix(bin->op_type)) { + emit("const {} {} = ({} {} {});", dt_name, bin_name, lhs_name, binop, + rhs_name); + } else { + // This is a function call + emit("const {} {} = {}({}, {});", dt_name, bin_name, binop, lhs_name, + rhs_name); + } + } + + void visit(TernaryOpStmt *tri) override + { + TI_ASSERT(tri->op_type == TernaryOpType::select); + emit("const {} {} = ({}) ? ({}) : ({});", + opengl_data_type_name(tri->element_type()), tri->raw_name(), + tri->op1->raw_name(), tri->op2->raw_name(), tri->op3->raw_name()); + } + + void visit(LocalLoadStmt *stmt) override + { + bool linear_index = true; + for (int i = 0; i < (int)stmt->ptr.size(); i++) { + if (stmt->ptr[i].offset != i) { + linear_index = false; + } + } + if (stmt->same_source() && linear_index && + stmt->width() == stmt->ptr[0].var->width()) { + auto ptr = stmt->ptr[0].var; + emit("const {} {}({});", opengl_data_type_name(stmt->element_type()), + stmt->raw_name(), ptr->raw_name()); + } else { + TI_NOT_IMPLEMENTED; + } + } + + void visit(LocalStoreStmt *stmt) override + { + emit("{} = {};", stmt->ptr->raw_name(), stmt->data->raw_name()); + } + + void visit(AllocaStmt *alloca) override + { + emit("{} {}(0);", + opengl_data_type_name(alloca->element_type()), + alloca->raw_name()); + } + + void visit(ConstStmt *const_stmt) override + { + TI_ASSERT(const_stmt->width() == 1); + emit("const {} {} = {};", opengl_data_type_name(const_stmt->element_type()), + const_stmt->raw_name(), const_stmt->val[0].stringify()); + } + + void visit(ArgLoadStmt *stmt) override + { + const auto dt = opengl_data_type_name(stmt->element_type()); + if (stmt->is_ptr) { + emit("const {} {} = _arg_{}({}); // is_ptr", dt, stmt->raw_name(), + data_type_short_name(stmt->element_type()), stmt->arg_id); + } else { + emit("const {} {} = _arg_{}({});", dt, stmt->raw_name(), + data_type_short_name(stmt->element_type()), stmt->arg_id); + } + } + + void visit(ArgStoreStmt *stmt) override + { + TI_ASSERT(!stmt->is_ptr); + emit("_arg_{}({}) = {};", data_type_short_name(stmt->element_type()), + stmt->arg_id, stmt->val->raw_name()); + } + + std::string make_kernel_name() + { + return fmt::format("{}{}", glsl_kernel_prefix_, glsl_kernel_count_++); + } + + void generate_serial_kernel(OffloadedStmt *stmt) { + TI_ASSERT(stmt->task_type == OffloadedStmt::TaskType::serial); + const std::string glsl_kernel_name = make_kernel_name(); + this->glsl_kernel_name_ = glsl_kernel_name; + emit("void {}()", glsl_kernel_name); + emit("{{ // serial"); + stmt->body->accept(this); + emit("}}\n"); + } + + + void visit(OffloadedStmt *stmt) override + { + TI_ASSERT(is_top_level_); // TODO(archibate): remove for nested kernel (?) + is_top_level_ = false; + using Type = OffloadedStmt::TaskType; + if (stmt->task_type == Type::serial) { + generate_serial_kernel(stmt); + /*} else if (stmt->task_type == Type::range_for) { + generate_range_for_kernel(stmt);*/ + } else { + // struct_for is automatically lowered to ranged_for for dense snodes + // (#378). So we only need to support serial and range_for tasks. + TI_ERROR("Unsupported offload type={} on OpenGL arch", stmt->task_name()); + } + is_top_level_ = true; + } + + +public: + const std::string &kernel_source_code() const + { + return kernel_src_code_; + } + + SSBO *create_root_ssbo() + { + static SSBO *root_ssbo; + if (!root_ssbo) { + TI_INFO("[glsl] creating root buffer of size {} B", struct_compiled_->root_size); + root_ssbo = new SSBO(struct_compiled_->root_size); + } + return root_ssbo; + } + + void run(const SNode &root_snode) + { + //TI_INFO("ntm:: {}", root_snode.node_type_name); + root_snode_ = &root_snode; + root_snode_type_name_ = root_snode.node_type_name; + generate_header(); + //irpass::print(kernel->ir); + kernel->ir->accept(this); + generate_bottom(); + } +}; + +} // namespace + +void OpenglCodeGen::lower() +{ + auto ir = kernel_->ir; + const bool print_ir = prog_->config.print_ir; + if (print_ir) { + TI_TRACE("Initial IR:"); + irpass::print(ir); + } + + if (kernel_->grad) { + irpass::reverse_segments(ir); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("Segment reversed (for autodiff):"); + irpass::print(ir); + } + } + + irpass::lower(ir); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("Lowered:"); + irpass::print(ir); + } + + irpass::typecheck(ir); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("Typechecked:"); + irpass::print(ir); + } + + irpass::demote_dense_struct_fors(ir); + irpass::typecheck(ir); + if (print_ir) { + TI_TRACE("Dense Struct-for demoted:"); + irpass::print(ir); + } + + irpass::constant_fold(ir); + if (prog_->config.simplify_before_lower_access) { + irpass::simplify(ir); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("Simplified I:"); + irpass::print(ir); + } + } + + if (kernel_->grad) { + irpass::demote_atomics(ir); + irpass::full_simplify(ir, prog_->config); + irpass::typecheck(ir); + if (print_ir) { + TI_TRACE("Before make_adjoint:"); + irpass::print(ir); + } + irpass::make_adjoint(ir); + if (print_ir) { + TI_TRACE("After make_adjoint:"); + irpass::print(ir); + } + irpass::typecheck(ir); + } + + irpass::lower_access(ir, prog_->config.use_llvm); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("Access Lowered:"); + irpass::print(ir); + } + + irpass::die(ir); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("DIEd:"); + irpass::print(ir); + } + + irpass::flag_access(ir); + irpass::re_id(ir); + if (print_ir) { + TI_TRACE("Access Flagged:"); + irpass::print(ir); + } + + irpass::constant_fold(ir); + if (print_ir) { + TI_TRACE("Constant folded:"); + irpass::re_id(ir); + irpass::print(ir); + } + + global_tmps_buffer_size_ = + std::max(irpass::offload(ir).total_size, (size_t)(1)); + if (print_ir) { + TI_TRACE("Offloaded:"); + irpass::re_id(ir); + irpass::print(ir); + } + + irpass::full_simplify(ir, prog_->config); + if (print_ir) { + TI_TRACE("Simplified II:"); + irpass::re_id(ir); + irpass::print(ir); + } + + irpass::demote_atomics(ir); + if (print_ir) { + TI_TRACE("Atomics demoted:"); + irpass::re_id(ir); + irpass::print(ir); + } +} + +FunctionType OpenglCodeGen::gen(void) +{ + KernelGen codegen(kernel_, kernel_name_, struct_compiled_); + codegen.run(*prog_->snode_root); + SSBO *root_sb = codegen.create_root_ssbo(); + const std::string kernel_source_code = codegen.kernel_source_code(); + //TI_INFO("source of kernel [{}]:\n{}", kernel_name_, kernel_source_code); + + return [kernel_source_code, root_sb](Context &ctx) { + // TODO(archibate): find out where get_arg stored, and just new SSBO(ctx) + SSBO *arg_sb = new SSBO(taichi_max_num_args * sizeof(uint64_t)); + arg_sb->load_arguments_from(ctx); + std::vector iov = {*arg_sb, *root_sb}; + /*TI_INFO("data[0] = {}", ((int*)root_sb->data)[0]); + TI_INFO("data[1] = {}", ((int*)root_sb->data)[1]); + TI_INFO("args[0] = {}", ((uint64_t*)arg_sb->data)[0]); + TI_INFO("args[1] = {}", ((uint64_t*)arg_sb->data)[1]);*/ + launch_glsl_kernel(kernel_source_code, iov); + /*TI_INFO("data[0] = {}", ((int*)root_sb->data)[0]); + TI_INFO("data[1] = {}", ((int*)root_sb->data)[1]); + TI_INFO("args[0] = {}", ((uint64_t*)arg_sb->data)[0]); + TI_INFO("args[1] = {}", ((uint64_t*)arg_sb->data)[1]);*/ + arg_sb->save_returns_to(ctx); + }; +} + +FunctionType OpenglCodeGen::compile(Program &program, Kernel &kernel) +{ + this->prog_ = &program; + this->kernel_ = &kernel; + + this->lower(); + return this->gen(); +} + +} // namespace opengl +TLANG_NAMESPACE_END diff --git a/taichi/backends/codegen_opengl.h b/taichi/backends/codegen_opengl.h new file mode 100644 index 0000000000000..49a669408fd5e --- /dev/null +++ b/taichi/backends/codegen_opengl.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include "base.h" +#include "kernel.h" + +TLANG_NAMESPACE_BEGIN +namespace opengl { + +class OpenglCodeGen { + public: + OpenglCodeGen(const std::string &kernel_name, + const StructCompiledResult *struct_compiled) + : kernel_name_(kernel_name), struct_compiled_(struct_compiled) + {} + + FunctionType compile(Program &program, Kernel &kernel); + + private: + void lower(); + FunctionType gen(); + + const std::string kernel_name_; + + Program *prog_; + Kernel *kernel_; + const StructCompiledResult *struct_compiled_; + size_t global_tmps_buffer_size_{0}; +}; + +} // namespace opengl +TLANG_NAMESPACE_END diff --git a/taichi/backends/struct_opengl.cpp b/taichi/backends/struct_opengl.cpp new file mode 100644 index 0000000000000..ef7597d8aea0d --- /dev/null +++ b/taichi/backends/struct_opengl.cpp @@ -0,0 +1,91 @@ +#include "struct_opengl.h" + +TLANG_NAMESPACE_BEGIN +namespace opengl { + +OpenglStructCompiler::CompiledResult OpenglStructCompiler::run(SNode &node) +{ + TI_ASSERT(node.type == SNodeType::root); + collect_snodes(node); + // The host side has run this! + // infer_snode_properties(node); + + auto snodes_rev = snodes_; + std::reverse(snodes_rev.begin(), snodes_rev.end()); + + for (auto &n : snodes_rev) { + generate_types(*n); + } + CompiledResult result; + result.source_code = std::move(src_code_); + result.root_size = compute_snode_size(node); + return result; +} + +void OpenglStructCompiler::collect_snodes(SNode &snode) { + snodes_.push_back(&snode); + for (int ch_id = 0; ch_id < (int)snode.ch.size(); ch_id++) { + auto &ch = snode.ch[ch_id]; + collect_snodes(*ch); + } +} +// TODO(archibate): really need fit struct_metal so much? +void OpenglStructCompiler::generate_types(const SNode &snode) { + const bool is_place = snode.is_place(); + if (!is_place) { + const std::string class_name = snode.node_type_name + "_ch"; + emit("#define {} int", class_name); + std::string stride_str; + for (int i = 0; i < (int)snode.ch.size(); i++) { + const auto &ch_node_name = snode.ch[i]->node_type_name; + if (stride_str.empty()) { + emit("#define {}_get{}(a_) (a_) // {}", + snode.node_type_name, i, ch_node_name); + stride_str = ch_node_name + "_stride"; + } else { + emit("#define {}_get{}(a_) ((a_) + ({})) // {}", + snode.node_type_name, i, stride_str, ch_node_name); + stride_str += " + " + ch_node_name + "_stride"; + } + } + if (stride_str.empty()) { + // Is it possible for this to have no children? + stride_str = "0"; + } + emit("#define {}_stride ({})", class_name, stride_str); + } + emit(""); + const auto &node_name = snode.node_type_name; + if (is_place) { + const auto dt_name = opengl_data_type_name(snode.dt); + emit("#define {} int // place {}", node_name, dt_name); + emit("#define {}_stride {} // sizeof({})", node_name, data_type_size(snode.dt), dt_name); + } else if (snode.type == SNodeType::dense || snode.type == SNodeType::root) { + emit("#define {} int // {}", node_name, snode_type_name(snode.type)); + const int n = (snode.type == SNodeType::dense) ? snode.n : 1; + emit("#define {}_n {}", node_name, n); + emit("#define {}_stride ({}_ch_stride * {}_n)", node_name, node_name, node_name); + emit("#define {}_children(a_, i) ((a_) + {}_ch_stride * (i))", node_name, node_name); + } else { + TI_ERROR("SNodeType={} not supported on OpenGL", + snode_type_name(snode.type)); + TI_NOT_IMPLEMENTED; + } + emit(""); +} + +size_t OpenglStructCompiler::compute_snode_size(const SNode &sn) { + if (sn.is_place()) { + return data_type_size(sn.dt); + } + size_t ch_size = 0; + for (const auto &ch : sn.ch) { + ch_size += compute_snode_size(*ch); + } + const int n = (sn.type == SNodeType::dense) ? sn.n : 1; + return n * ch_size; +} + + +} // namespace opengl +TLANG_NAMESPACE_END diff --git a/taichi/backends/struct_opengl.h b/taichi/backends/struct_opengl.h new file mode 100644 index 0000000000000..60c0e7e05da70 --- /dev/null +++ b/taichi/backends/struct_opengl.h @@ -0,0 +1,40 @@ +// Codegen for the hierarchical data structure +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include "base.h" + +TLANG_NAMESPACE_BEGIN +namespace opengl { + +class OpenglStructCompiler { + public: + using CompiledResult = opengl::StructCompiledResult; + + CompiledResult run(SNode &node); + + private: + void collect_snodes(SNode &snode); + void generate_types(const SNode &snode); + size_t compute_snode_size(const SNode &sn); + + template + void emit(std::string f, Args &&... args) + { + src_code_ += fmt::format(f, std::forward(args)...) + '\n'; + } + + std::vector snodes_; + std::string src_code_; +}; + +} // namespace opengl +TLANG_NAMESPACE_END diff --git a/taichi/platform/opengl/opengl_api.cpp b/taichi/platform/opengl/opengl_api.cpp new file mode 100644 index 0000000000000..12f3f78a0407e --- /dev/null +++ b/taichi/platform/opengl/opengl_api.cpp @@ -0,0 +1,289 @@ +#include "opengl_api.h" + +#ifdef TI_WITH_OPENGL + +#include +#include +#endif + +TLANG_NAMESPACE_BEGIN +namespace opengl { + +#ifdef TI_WITH_OPENGL +void glapi_set_uniform(GLuint loc, float value) +{ + glUniform1f(loc, value); +} + +static std::string add_line_markers(std::string x) +{ + std::string marker; + size_t pos = 0, npos; + int line = 0; + while (1) { + npos = x.find_first_of('\n', pos); + marker = fmt::format("{:3d} ", ++line); + if (npos == std::string::npos) + break; + x.insert(pos, marker); + pos = npos + 1 + marker.size(); + } + return x; +} + +struct GLShader +{ + GLuint id_; + + GLShader(GLuint type = GL_COMPUTE_SHADER) + { + id_ = glCreateShader(type); + } + + GLShader(std::string source, GLuint type = GL_COMPUTE_SHADER) + : GLShader(type) + { + this->compile(source); + } + + ~GLShader() + { + glDeleteShader(id_); + } + + GLShader &compile(const std::string &source) + { + const GLchar *source_cstr = source.c_str(); + glShaderSource(id_, 1, &source_cstr, nullptr); + + glCompileShader(id_); + GLint status = GL_TRUE; + glGetShaderiv(id_, GL_COMPILE_STATUS, &status); + if (status != GL_TRUE) { + GLsizei logLength; + glGetShaderiv(id_, GL_INFO_LOG_LENGTH, &logLength); + auto log = std::vector(logLength + 1); + glGetShaderInfoLog(id_, logLength, &logLength, log.data()); + log[logLength] = 0; + TI_ERROR("[glsl] error while compiling shader:\n{}\n{}", + add_line_markers(source), log.data()); + } + return *this; + } +}; + +struct GLProgram +{ + GLuint id_; + + GLProgram() + { + id_ = glCreateProgram(); + } + + GLProgram(GLShader &shader) + : GLProgram() + { + this->attach(shader); + } + + ~GLProgram() + { + glDeleteProgram(id_); + } + + GLProgram &attach(GLShader &shader) + { + glAttachShader(id_, shader.id_); + return *this; + } + + GLProgram &link() + { + glLinkProgram(id_); + GLint status = GL_TRUE; + glGetProgramiv(id_, GL_LINK_STATUS, &status); + if (status != GL_TRUE) { + GLsizei logLength; + glGetProgramiv(id_, GL_INFO_LOG_LENGTH, &logLength); + auto log = std::vector(logLength + 1); + glGetProgramInfoLog(id_, logLength, &logLength, log.data()); + log[logLength] = 0; + TI_ERROR("[glsl] error while linking program:\n{}", log.data()); + } + return *this; + } + + GLProgram &use() + { + glUseProgram(id_); + return *this; + } + + template + void set_uniform(std::string name, T value) + { + GLuint loc = glGetUniformLocation(id_, name.c_str()); + glapi_set_uniform(loc, value); + } +}; + + +// https://blog.csdn.net/ylbs110/article/details/52074826 +// https://www.khronos.org/opengl/wiki/Shader_Storage_Buffer_Object +// This is Shader Storage Buffer, we use it to share data between CPU & GPU +struct GLSSBO +{ + GLuint id_; + + GLSSBO() + { + glGenBuffers(1, &id_); + } + + ~GLSSBO() + { + glDeleteBuffers(1, &id_); + } + + /*** + GL_{frequency}_{nature}: + + + STREAM + The data store contents will be modified once and used at most a few times. + + STATIC + The data store contents will be modified once and used many times. + + DYNAMIC + The data store contents will be modified repeatedly and used many times. + + + DRAW + The data store contents are modified by the application, and used as the source + for GL drawing and image specification commands. + + READ + The data store contents are modified by reading data from the GL, and used to + return that data when queried by the application. + + COPY + The data store contents are modified by reading data from the GL, and used as the + source for GL drawing and image specification commands. + ***/ + + GLSSBO &bind_data(void *data, size_t size, GLuint usage = GL_STATIC_READ) + { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_); + glBufferData(GL_SHADER_STORAGE_BUFFER, size, data, usage); + return *this; + } + + GLSSBO &bind_index(size_t index) + { + // SSBO index, is `layout(std430, binding = )` in shader. + // We use only one SSBO though... + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, id_); + return *this; + } + + void *map(size_t offset, size_t length, GLbitfield access = GL_MAP_READ_BIT) + { + // map GPU memory to CPU address space, offset within SSBO data + glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_); + return glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, length, access); + } + + void *map(GLbitfield access = GL_MAP_READ_BIT) + { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_); + return glMapBuffer(GL_SHADER_STORAGE_BUFFER, access); + } +}; + +void initialize_opengl() +{ + static bool gl_inited = false; + if (gl_inited) + return; + TI_WARN("OpenGL backend currently WIP, MAY NOT WORK"); + gl_inited = true; + + glfwInit(); + // Compute Shader requires OpenGL 4.3+ (or OpenGL ES 3.1+) + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); + // GLEW cannot load GL without a context + // And the best way to make context is by creating a window + // Then hide it immediately, LOL + GLFWwindow *window = glfwCreateWindow(1, 1, "Make GLEW Happy", nullptr, nullptr); + if (!window) { + const char *desc = nullptr; + GLint status = glfwGetError(&desc); + if (!desc) desc = "Unknown Error"; + TI_ERROR("[glsl] cannot create GLFW window: error {}: {}", status, desc); + } + glfwHideWindow(window); + glfwMakeContextCurrent(window); + GLint status = glewInit(); + if (status != GLEW_OK) { + TI_ERROR("[glsl] cannot initialize GLEW: {}", glewGetErrorString(status)); + } + const char *gl_version = (const char *)glGetString(GL_VERSION); + if (!gl_version) { + TI_WARN("[glsl] cannot get OpenGL version"); + } else { + TI_INFO("[glsl] OpenGL {}", gl_version); + } +} + +void launch_glsl_kernel(std::string source, std::vector iov) +{ + GLShader shader(source); + GLProgram program(shader); + program.link(); + program.use(); + + std::vector ssbo(iov.size()); + for (int i = 0; i < ssbo.size(); i++) { + ssbo[i].bind_index(i); + ssbo[i].bind_data(iov[i].base, iov[i].size, GL_DYNAMIC_READ); // input + } + + // https://www.khronos.org/opengl/wiki/Compute_Shader + // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders + // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf + // + // `glDispatchCompute(X, Y, Z)` - the X*Y*Z == `Blocks` in CUDA + // `layout(local_size_x = X) in;` - the X == `Threads` in CUDA + // + glDispatchCompute(1, 1, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); // TODO(archibate): move to Program::synchroize() + + for (int i = 0; i < ssbo.size(); i++) { + void *p = ssbo[i].map(0, iov[i].size); // output + std::memcpy(iov[i].base, p, iov[i].size); + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + } +} + +bool is_opengl_api_available() +{ + return true; +} + +#else +void launch_glsl_kernel(std::string source, std::vector iov) +{ + TI_NOT_IMPLEMENTED +} + +bool is_opengl_api_available() +{ + return false; +} +#endif + +} +TLANG_NAMESPACE_END diff --git a/taichi/platform/opengl/opengl_api.h b/taichi/platform/opengl/opengl_api.h new file mode 100644 index 0000000000000..0eb23347fc3bb --- /dev/null +++ b/taichi/platform/opengl/opengl_api.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +#include +#include + +#include "opengl_kernel_util.h" + +TLANG_NAMESPACE_BEGIN + +namespace opengl { + +void initialize_opengl(); +bool is_opengl_api_available(); +void launch_glsl_kernel(std::string source, std::vector iov); + +} // namespace opengl + +TLANG_NAMESPACE_END diff --git a/taichi/platform/opengl/opengl_data_types.h b/taichi/platform/opengl/opengl_data_types.h new file mode 100644 index 0000000000000..8df36e4f0a736 --- /dev/null +++ b/taichi/platform/opengl/opengl_data_types.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +TLANG_NAMESPACE_BEGIN +namespace opengl { + +inline std::string opengl_data_type_name(DataType dt) +{ + // https://www.khronos.org/opengl/wiki/Data_Type_(GLSL) + switch (dt) { + case DataType::f32: + return "float"; + case DataType::f64: + return "double"; + case DataType::i32: + return "int"; + default: + TI_NOT_IMPLEMENTED; + break; + } + return ""; +} + +inline bool is_opengl_binary_op_infix(BinaryOpType type) +{ + return !((type == BinaryOpType::min) || (type == BinaryOpType::max) || + (type == BinaryOpType::atan2) || (type == BinaryOpType::pow)); +} + +} // namespace opengl +TLANG_NAMESPACE_END diff --git a/taichi/platform/opengl/opengl_kernel.cpp b/taichi/platform/opengl/opengl_kernel.cpp new file mode 100644 index 0000000000000..36adc6141b6c4 --- /dev/null +++ b/taichi/platform/opengl/opengl_kernel.cpp @@ -0,0 +1,33 @@ +#include "opengl_kernel.h" +#include "opengl_api.h" + +#include +#include + +TLANG_NAMESPACE_BEGIN +namespace opengl { + +SSBO::SSBO(size_t data_size_) + : data_(data_size_), data_size(data_size_) +{} + +void SSBO::load_arguments_from(Context &ctx) +{ + uint64_t *data_i = (uint64_t *)data(); + for (int i = 0; i < taichi_max_num_args; i++) { + uint64_t value = ctx.get_arg(i); + data_i[i] = value; + } +} + +void SSBO::save_returns_to(Context &ctx) +{ + uint64_t *data_i = (uint64_t *)data(); + for (int i = 0; i < taichi_max_num_args; i++) { + uint64_t value = data_i[i]; + ctx.set_arg(i, value); + } +} + +} // namespace opengl +TLANG_NAMESPACE_END diff --git a/taichi/platform/opengl/opengl_kernel.h b/taichi/platform/opengl/opengl_kernel.h new file mode 100644 index 0000000000000..d95449b0bbdc4 --- /dev/null +++ b/taichi/platform/opengl/opengl_kernel.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include + +#include +#include + +#include "opengl_kernel_util.h" + + +TLANG_NAMESPACE_BEGIN + +namespace opengl { + +struct SSBO +{ + std::vector data_; + const size_t data_size; + + SSBO(size_t data_size); + + void load_arguments_from(Context &ctx); + void save_returns_to(Context &ctx); + inline void *data() + { + return (void *)data_.data(); + } + + inline operator IOV() + { + return IOV{data(), data_size}; + } +}; + +} // namespace opengl + +TLANG_NAMESPACE_END diff --git a/taichi/platform/opengl/opengl_kernel_util.h b/taichi/platform/opengl/opengl_kernel_util.h new file mode 100644 index 0000000000000..7c703683c4423 --- /dev/null +++ b/taichi/platform/opengl/opengl_kernel_util.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +#include + +TLANG_NAMESPACE_BEGIN + +class SNode; + +namespace opengl { + +struct StructCompiledResult { + // Source code of the SNode data structures compiled to GLSL + std::string source_code; + // Root buffer size in bytes. + size_t root_size; +}; + +struct IOV +{ + void *base; + size_t size; +}; + +} // namespace opengl + +TLANG_NAMESPACE_END diff --git a/taichi/profiler.cpp b/taichi/profiler.cpp index e7352dcea443f..e822119cd7021 100644 --- a/taichi/profiler.cpp +++ b/taichi/profiler.cpp @@ -149,7 +149,7 @@ class CUDAProfiler : public ProfilerBase { } // namespace std::unique_ptr make_profiler(Arch arch) { - if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal) { + if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal || arch == Arch::opengl) { return std::make_unique(arch); } else if (arch == Arch::cuda) { return std::make_unique(); diff --git a/taichi/program.cpp b/taichi/program.cpp index 89def61830131..f1dce2960855c 100644 --- a/taichi/program.cpp +++ b/taichi/program.cpp @@ -4,12 +4,15 @@ #include #include +#include #include "backends/codegen_cuda.h" #include "backends/codegen_metal.h" +#include "backends/codegen_opengl.h" #include "backends/codegen_x86.h" #include "backends/struct.h" #include "backends/struct_metal.h" +#include "backends/struct_opengl.h" #include "unified_allocator.h" #include "snode.h" @@ -39,7 +42,7 @@ Program::Program(Arch arch) { #if !defined(TI_WITH_CUDA) if (arch == Arch::cuda) { TI_WARN("Taichi is not compiled with CUDA."); - TI_WARN("Falling back to x86_64"); + TI_WARN("Falling back to x64"); arch = Arch::x64; } #else @@ -47,14 +50,20 @@ Program::Program(Arch arch) { cuda_context = std::make_unique(); if (!cuda_context->detected()) { TI_WARN("No CUDA device detected."); - TI_WARN("Falling back to x86_64"); + TI_WARN("Falling back to x64"); arch = Arch::x64; } } #endif if (arch == Arch::metal) { if (!metal::is_metal_api_available()) { - TI_WARN("No Metal API detected, falling back to x86_64"); + TI_WARN("No Metal API detected, falling back to x64"); + arch = Arch::x64; + } + } + if (arch == Arch::opengl) { + if (!opengl::is_opengl_api_available()) { + TI_WARN("No OpenGL API detected, falling back to x64"); arch = Arch::x64; } } @@ -96,6 +105,9 @@ FunctionType Program::compile(Kernel &kernel) { } else if (kernel.arch == Arch::metal) { metal::MetalCodeGen codegen(kernel.name, &metal_struct_compiled_.value()); ret = codegen.compile(*this, kernel, metal_runtime_.get()); + } else if (kernel.arch == Arch::opengl) { + opengl::OpenglCodeGen codegen(kernel.name, &opengl_struct_compiled_.value()); + ret = codegen.compile(*this, kernel); } else { TI_NOT_IMPLEMENTED; } @@ -173,7 +185,7 @@ void Program::initialize_runtime_system(StructCompiler *scomp) { } void Program::materialize_layout() { - // always use arch=x86_64 since this is for host accessors + // always use arch=x64 since this is for host accessors // TODO: arch may also be arm etc. std::unique_ptr scomp = StructCompiler::make(this, Arch::x64); scomp->run(*snode_root, true); @@ -183,6 +195,7 @@ void Program::materialize_layout() { initialize_runtime_system(scomp.get()); } + TI_INFO("materialize_layout called"); if (config.arch == Arch::cuda && config.use_llvm) { initialize_device_llvm_context(); // llvm_context_device->get_init_module(); @@ -204,6 +217,11 @@ void Program::materialize_layout() { params.profiler = profiler.get(); metal_runtime_ = std::make_unique(std::move(params)); } + } else if (config.arch == Arch::opengl) { + opengl::OpenglStructCompiler scomp; + opengl_struct_compiled_ = scomp.run(*snode_root); + TI_INFO("OpenGL root buffer size: {} B", opengl_struct_compiled_->root_size); + opengl::initialize_opengl(); } } @@ -311,6 +329,15 @@ void Program::initialize_device_llvm_context() { } } +Arch Program::get_snode_accessor_arch() +{ + if (config.arch == Arch::opengl) { + return Arch::opengl; + } else { + return get_host_arch(); + } +} + Kernel &Program::get_snode_reader(SNode *snode) { TI_ASSERT(snode->type == SNodeType::place); auto kernel_name = fmt::format("snode_reader_{}", snode->id); @@ -323,7 +350,7 @@ Kernel &Program::get_snode_reader(SNode *snode) { snode->num_active_indices, load_if_ptr((snode->expr)[indices])); current_ast_builder().insert(std::move(ret)); }); - ker.set_arch(get_host_arch()); + ker.set_arch(get_snode_accessor_arch()); ker.name = kernel_name; ker.is_accessor = true; for (int i = 0; i < snode->num_active_indices; i++) @@ -344,7 +371,7 @@ Kernel &Program::get_snode_writer(SNode *snode) { (snode->expr)[indices] = Expr::make(snode->num_active_indices); }); - ker.set_arch(get_host_arch()); + ker.set_arch(get_snode_accessor_arch()); ker.name = kernel_name; ker.is_accessor = true; for (int i = 0; i < snode->num_active_indices; i++) diff --git a/taichi/program.h b/taichi/program.h index abf2433172050..982f0c72b1df8 100644 --- a/taichi/program.h +++ b/taichi/program.h @@ -17,6 +17,7 @@ #include #include #include +#include #if defined(TI_PLATFORM_UNIX) #include @@ -158,6 +159,8 @@ class Program { return Arch::x64; } + Arch get_snode_accessor_arch(); + float64 get_total_compilation_time() { return total_compilation_time; } @@ -168,6 +171,7 @@ class Program { private: std::optional metal_struct_compiled_; + std::optional opengl_struct_compiled_; std::unique_ptr metal_runtime_; }; diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index 18ae9bbf28b9f..d378421dd3bf1 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #if defined(TI_WITH_CUDA) #include #endif @@ -159,6 +160,7 @@ void export_misc(py::module &m) { }); m.def("with_cuda", with_cuda); m.def("with_metal", taichi::Tlang::metal::is_metal_api_available); + m.def("with_opengl", taichi::Tlang::opengl::is_opengl_api_available); } TI_NAMESPACE_END diff --git a/test_abs.py b/test_abs.py new file mode 100644 index 0000000000000..a8c534b4c6a5f --- /dev/null +++ b/test_abs.py @@ -0,0 +1,35 @@ +import taichi as ti + +@ti.all_archs +def test_abs(): + x = ti.var(ti.f32) + y = ti.var(ti.f32) + + N = 16 + + ti.root.dense(ti.i, N).place(x) + ti.root.dense(ti.i, N).place(y) + ti.root.lazy_grad() + + @ti.kernel + def func(): + for i in range(N): + x[i] = ti.abs(y[i]) + + for i in range(N): + y[i] = i - 10 + x.grad[i] = 1 + + func() + func.grad() + + def sgn(x): + if x > 0: + return 1 + if x < 0: + return -1 + return 0 + + for i in range(N): + assert x[i] == abs(y[i]) + assert y.grad[i] == sgn(y[i])