From dd6da1f516b40dd64b9a536adcaa0cbc2b8f0947 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BD=AD=E4=BA=8E=E6=96=8C?= <17721388340@163.com>
Date: Thu, 27 Feb 2020 10:35:32 -0600
Subject: [PATCH] OpenGL backend (stage 1) (#535)

* [skip ci] basic start up for opengl

[skip ci] test inout data[3] success

[skip ci] try to pass kernel arguments

* [skip ci] use a struct_metal style struct compiler

* [skip ci] basic allocator

* [skip ci] basic read/write tensor

fix bug x[4] -> x[4 >> 2] = x[1]

* adding f32&f64 support for opengl

remove opengl from all_archs to prevent test

* Update test_abs.py

* Update TaichiCore.cmake for TI_WITH_OPENGL

optional TI_WITH_OPENGL ON (detect auto OFF)

TI_WITH_OPENGL guard opengl_api.cpp

fix typo

* merge g1 with master fix: x86_64 -> x64

Update profiler.cpp

* replace opengl_unary_op_type_symbol with unary_op_type_name

* snode_io_arch -> snode_accessor_arch

* use GLEW_LIBRARY_DIR

TI_WITH_OPENGL=OFF by default

TI_WITH_OPENGL default to ON

* no more USE_GLEW macro

* no more mallocs in SSBO

* remove opengl_backend.py, add comment in mpm99 for ti.opengl

* fix memory leakage using std::vector

* better cmake: find GLFW and glfw3

* support arch: uncomment with_opengl

* initialize_opengl in materialize_layout

fix typo

Co-authored-by: Yuanming Hu <yuanming-hu@users.noreply.github.com>
---
 cmake/TaichiCore.cmake                      |  25 +
 examples/mpm99.py                           |   2 +-
 python/taichi/lang/__init__.py              |   3 +
 taichi/backends/codegen_opengl.cpp          | 518 ++++++++++++++++++++
 taichi/backends/codegen_opengl.h            |  40 ++
 taichi/backends/struct_opengl.cpp           |  91 ++++
 taichi/backends/struct_opengl.h             |  40 ++
 taichi/platform/opengl/opengl_api.cpp       | 289 +++++++++++
 taichi/platform/opengl/opengl_api.h         |  21 +
 taichi/platform/opengl/opengl_data_types.h  |  33 ++
 taichi/platform/opengl/opengl_kernel.cpp    |  33 ++
 taichi/platform/opengl/opengl_kernel.h      |  39 ++
 taichi/platform/opengl/opengl_kernel_util.h |  29 ++
 taichi/profiler.cpp                         |   2 +-
 taichi/program.cpp                          |  39 +-
 taichi/program.h                            |   4 +
 taichi/python/export_misc.cpp               |   2 +
 test_abs.py                                 |  35 ++
 18 files changed, 1237 insertions(+), 8 deletions(-)
 create mode 100644 taichi/backends/codegen_opengl.cpp
 create mode 100644 taichi/backends/codegen_opengl.h
 create mode 100644 taichi/backends/struct_opengl.cpp
 create mode 100644 taichi/backends/struct_opengl.h
 create mode 100644 taichi/platform/opengl/opengl_api.cpp
 create mode 100644 taichi/platform/opengl/opengl_api.h
 create mode 100644 taichi/platform/opengl/opengl_data_types.h
 create mode 100644 taichi/platform/opengl/opengl_kernel.cpp
 create mode 100644 taichi/platform/opengl/opengl_kernel.h
 create mode 100644 taichi/platform/opengl/opengl_kernel_util.h
 create mode 100644 test_abs.py

diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index 803b7c79fc601..e6c109cf1e799 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -24,6 +24,8 @@ endif()
 
 option(USE_STDCPP "Use -stdlib=libc++" OFF)
 option(TI_WITH_CUDA "Build with CUDA support" OFF)
+option(TI_WITH_OPENGL "Build with OpenGL backend" ON)
+option(GLEW_USE_STATIC_LIBS OFF)
 
 include_directories(${CMAKE_SOURCE_DIR})
 include_directories(external/xxhash)
@@ -53,6 +55,29 @@ if (TI_WITH_CUDA)
     endif()
 endif()
 
+if (TI_WITH_OPENGL)
+  if(NOT GLEW_VERSION)
+    set(GLEW_VERSION 2.0.0)
+  endif()
+  find_package(GLEW ${GLEW_VERSION})
+  if (GLEW_FOUND)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_OPENGL")
+    message("Building with GLEW ${GLEW_VERSION}")
+    message("Using GLEW: ${GLEW_LIBRARIES}")
+    target_include_directories(${LIBRARY_NAME} PUBLIC ${GLEW_INCLUDE_DIRS})
+    target_link_libraries(${LIBRARY_NAME} ${GLEW_LIBRARIES} GLEW)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGLEW_STATIC")
+    find_package(glfw3 REQUIRED)
+    if (NOT glfw3_FOUND)
+      message(FATAL_ERROR "glfw3 not found.")
+    endif()
+    message("Building with glfw ${glfw3_VERSION}")
+    target_link_libraries(${LIBRARY_NAME} glfw)
+  else()
+    message(WARNING "GLEW not found, ignoring TI_WITH_OPENGL.")
+  endif()
+endif()
+
 # http://llvm.org/docs/CMake.html#embedding-llvm-in-your-project
 find_package(LLVM REQUIRED CONFIG 8.0)
 message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
diff --git a/examples/mpm99.py b/examples/mpm99.py
index 1bb23917beab1..023af5b5a9c10 100644
--- a/examples/mpm99.py
+++ b/examples/mpm99.py
@@ -1,5 +1,5 @@
 import taichi as ti
-ti.init(arch=ti.cuda) # Try to run on GPU
+ti.init(arch=ti.cuda) # Try to run on GPU. Use arch=ti.opengl on old GPUs
 quality = 1 # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 27d9944196bac..b1bc4b9036c11 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -29,6 +29,7 @@
 x64 = core.x64
 cuda = core.cuda
 metal = core.metal
+opengl = core.opengl
 profiler_print = lambda: core.get_current_program().profiler_print()
 profiler_clear = lambda: core.get_current_program().profiler_clear()
 profiler_start = lambda n: core.get_current_program().profiler_start(n)
@@ -222,6 +223,8 @@ def supported_archs():
     archs.append(cuda)
   if ti.core.with_metal():
     archs.append(metal)
+  if ti.core.with_opengl():
+    archs.append(opengl)
   return archs
 
 class _ArchCheckers(object):
diff --git a/taichi/backends/codegen_opengl.cpp b/taichi/backends/codegen_opengl.cpp
new file mode 100644
index 0000000000000..ae8579695edc6
--- /dev/null
+++ b/taichi/backends/codegen_opengl.cpp
@@ -0,0 +1,518 @@
+#include "codegen_opengl.h"
+#include <taichi/platform/opengl/opengl_api.h>
+#include <taichi/platform/opengl/opengl_kernel.h>
+#include <taichi/platform/opengl/opengl_data_types.h>
+
+#include <string>
+#include <taichi/ir.h>
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+namespace {
+
+class KernelGen : public IRVisitor
+{
+  Kernel *kernel;
+
+public:
+  KernelGen(Kernel *kernel, std::string kernel_name,
+      const StructCompiledResult *struct_compiled)
+    : kernel(kernel),
+      struct_compiled_(struct_compiled),
+      kernel_name_(kernel_name),
+      glsl_kernel_prefix_(kernel_name)
+  {
+    allow_undefined_visitor = true;
+    invoke_default_visitor = true;
+  }
+
+private: // {{{
+  std::string kernel_src_code_;
+  std::string indent_;
+  bool is_top_level_{true};
+
+  const StructCompiledResult *struct_compiled_;
+  const SNode *root_snode_;
+  GetRootStmt *root_stmt_;
+  std::string kernel_name_;
+  std::string glsl_kernel_name_;
+  std::string root_snode_type_name_;
+  std::string glsl_kernel_prefix_;
+  int glsl_kernel_count_{0};
+
+  void push_indent()
+  {
+    indent_ += "  ";
+  }
+
+  void pop_indent()
+  {
+    indent_.pop_back();
+    indent_.pop_back();
+  }
+
+  template <typename... Args>
+  void emit(std::string f, Args &&... args)
+  {
+    kernel_src_code_ +=
+        indent_ + fmt::format(f, std::forward<Args>(args)...) + "\n";
+  }
+
+  void generate_header()
+  {
+    emit("#version 430 core");
+    emit("#extension GL_ARB_compute_shader: enable");
+    emit("{}", struct_compiled_->source_code);
+    emit("layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;");
+    emit("#define NARGS {}", taichi_max_num_args);
+    emit("layout(std430, binding = 0) buffer args_i32");
+    emit("{{");
+    emit("  int _args_i32_[NARGS * 2];");
+    emit("}};");
+    emit("layout(std430, binding = 0) buffer args_f32");
+    emit("{{");
+    emit("  float _args_f32_[NARGS * 2];");
+    emit("}};");
+    emit("layout(std430, binding = 0) buffer args_f64");
+    emit("{{");
+    emit("  double _args_f64_[NARGS];");
+    emit("}};");
+    emit("layout(std430, binding = 1) buffer data_i32");
+    emit("{{");
+    emit("  int _data_i32_[];");
+    emit("}};");
+    emit("layout(std430, binding = 1) buffer data_f32");
+    emit("{{");
+    emit("  float _data_f32_[];");
+    emit("}};");
+    emit("layout(std430, binding = 1) buffer data_f64");
+    emit("{{");
+    emit("  double _data_f64_[];");
+    emit("}};");
+    emit("#define _arg_i32(x) _args_i32_[(x) << 1]"); // skip to 64bit stride
+    emit("#define _arg_f32(x) _args_f32_[(x) << 1]");
+    emit("#define _arg_i64(x) _args_i64_[(x) << 0]");
+    emit("#define _arg_f64(x) _args_f64_[(x) << 0]");
+    emit("#define _mem_i32(x) _data_i32_[(x) >> 2]");
+    emit("#define _mem_f32(x) _data_f32_[(x) >> 2]");
+    emit("#define _mem_i64(x) _data_i64_[(x) >> 3]");
+    emit("#define _mem_f64(x) _data_f64_[(x) >> 3]");
+    emit("");
+  }
+
+  void generate_bottom()
+  {
+    // TODO(archibate): <kernel_name>() really necessary? How about just main()?
+    emit("void main()");
+    emit("{{");
+    emit("  {}();", glsl_kernel_name_);
+    emit("}}");
+  }
+
+  void visit(Block *stmt) override
+  {
+    if (!is_top_level_) push_indent();
+    for (auto &s : stmt->statements) {
+      //TI_INFO("visiting sub stmt {}", typeid(*s).name());
+      s->accept(this);
+    }
+    if (!is_top_level_) pop_indent();
+  }
+
+  void visit(LinearizeStmt *stmt) override
+  {
+    std::string val = "0";
+    for (int i = 0; i < (int)stmt->inputs.size(); i++) {
+      val = fmt::format("({} * {} + {})", val, stmt->strides[i],
+                        stmt->inputs[i]->raw_name());
+    }
+    emit("const int {} = {};", stmt->raw_name(), val);
+  }
+
+  void visit(OffsetAndExtractBitsStmt *stmt) override
+  {
+    emit("int {} = ((({} + {}) >> {}) & ((1 << {}) - 1));",
+         stmt->raw_name(), stmt->offset, stmt->input->raw_name(),
+         stmt->bit_begin, stmt->bit_end - stmt->bit_begin);
+  }
+
+  void visit(GetRootStmt *stmt) override
+  {
+    // Should we assert |root_stmt_| is assigned only once?
+    root_stmt_ = stmt;
+    emit("{} {} = 0;", root_snode_type_name_, stmt->raw_name());
+  }
+
+  void visit(SNodeLookupStmt *stmt) override
+  {
+    Stmt *parent;
+    std::string parent_type;
+    if (stmt->input_snode) {
+      parent = stmt->input_snode;
+      parent_type = stmt->snode->node_type_name;
+    } else {
+      TI_ASSERT(root_stmt_ != nullptr);
+      parent = root_stmt_;
+      parent_type = root_snode_type_name_;
+    }
+
+    emit("{}_ch {} = {}_children({}, {});", stmt->snode->node_type_name,
+         stmt->raw_name(), parent_type, parent->raw_name(),
+         stmt->input_index->raw_name());
+  }
+
+  void visit(GetChStmt *stmt) override
+  {
+    if (stmt->output_snode->is_place()) {
+      emit("{} /* place {} */ {} = {}_get{}({});",
+          stmt->output_snode->node_type_name,
+          opengl_data_type_name(stmt->output_snode->dt),
+          stmt->raw_name(), stmt->input_snode->node_type_name,
+          stmt->chid, stmt->input_ptr->raw_name());
+    } else {
+      emit("{} {} = {}_get{}({});", stmt->output_snode->node_type_name,
+          stmt->raw_name(), stmt->input_snode->node_type_name,
+          stmt->chid, stmt->input_ptr->raw_name());
+    }
+  }
+
+  void visit(GlobalStoreStmt *stmt) override
+  {
+    TI_ASSERT(stmt->width() == 1);
+    emit("_mem_{}({}) = {};", data_type_short_name(stmt->element_type()),
+        stmt->ptr->raw_name(), stmt->data->raw_name());
+  }
+
+  void visit(GlobalLoadStmt *stmt) override
+  {
+    TI_ASSERT(stmt->width() == 1);
+    emit("{} {} = _mem_{}({});", opengl_data_type_name(stmt->element_type()),
+         stmt->raw_name(), data_type_short_name(stmt->element_type()), stmt->ptr->raw_name());
+  }
+
+  void visit(UnaryOpStmt *stmt) override
+  {
+    if (stmt->op_type != UnaryOpType::cast) {
+      emit("const {} {} = {}({});", opengl_data_type_name(stmt->element_type()),
+           stmt->raw_name(), unary_op_type_name(stmt->op_type),
+           stmt->operand->raw_name());
+    } else {
+      // cast
+      if (stmt->cast_by_value) {
+        emit("const {} {} = {}({});",
+             opengl_data_type_name(stmt->element_type()), stmt->raw_name(),
+             opengl_data_type_name(stmt->cast_type), stmt->operand->raw_name());
+      } else {
+        TI_NOT_IMPLEMENTED;
+      }
+    }
+  }
+
+  void visit(BinaryOpStmt *bin) override
+  {
+    const auto dt_name = opengl_data_type_name(bin->element_type());
+    const auto lhs_name = bin->lhs->raw_name();
+    const auto rhs_name = bin->rhs->raw_name();
+    const auto bin_name = bin->raw_name();
+    if (bin->op_type == BinaryOpType::floordiv) {
+      if (is_integral(bin->element_type())) {
+        emit("const {} {} = int(floor({} / {}));", dt_name, bin_name, lhs_name,
+             rhs_name);
+      } else {
+        emit("const {} {} = floor({} / {});", dt_name, bin_name, lhs_name,
+             rhs_name);
+      }
+      return;
+    }
+    const auto binop = binary_op_type_symbol(bin->op_type);
+    if (is_opengl_binary_op_infix(bin->op_type)) {
+      emit("const {} {} = ({} {} {});", dt_name, bin_name, lhs_name, binop,
+           rhs_name);
+    } else {
+      // This is a function call
+      emit("const {} {} = {}({}, {});", dt_name, bin_name, binop, lhs_name,
+           rhs_name);
+    }
+  }
+
+  void visit(TernaryOpStmt *tri) override
+  {
+    TI_ASSERT(tri->op_type == TernaryOpType::select);
+    emit("const {} {} = ({}) ? ({}) : ({});",
+         opengl_data_type_name(tri->element_type()), tri->raw_name(),
+         tri->op1->raw_name(), tri->op2->raw_name(), tri->op3->raw_name());
+  }
+
+  void visit(LocalLoadStmt *stmt) override
+  {
+    bool linear_index = true;
+    for (int i = 0; i < (int)stmt->ptr.size(); i++) {
+      if (stmt->ptr[i].offset != i) {
+        linear_index = false;
+      }
+    }
+    if (stmt->same_source() && linear_index &&
+        stmt->width() == stmt->ptr[0].var->width()) {
+      auto ptr = stmt->ptr[0].var;
+      emit("const {} {}({});", opengl_data_type_name(stmt->element_type()),
+           stmt->raw_name(), ptr->raw_name());
+    } else {
+      TI_NOT_IMPLEMENTED;
+    }
+  }
+
+  void visit(LocalStoreStmt *stmt) override
+  {
+    emit("{} = {};", stmt->ptr->raw_name(), stmt->data->raw_name());
+  }
+
+  void visit(AllocaStmt *alloca) override
+  {
+    emit("{} {}(0);",
+        opengl_data_type_name(alloca->element_type()),
+        alloca->raw_name());
+  }
+
+  void visit(ConstStmt *const_stmt) override
+  {
+    TI_ASSERT(const_stmt->width() == 1);
+    emit("const {} {} = {};", opengl_data_type_name(const_stmt->element_type()),
+         const_stmt->raw_name(), const_stmt->val[0].stringify());
+  }
+
+  void visit(ArgLoadStmt *stmt) override
+  {
+    const auto dt = opengl_data_type_name(stmt->element_type());
+    if (stmt->is_ptr) {
+      emit("const {} {} = _arg_{}({}); // is_ptr", dt, stmt->raw_name(),
+          data_type_short_name(stmt->element_type()), stmt->arg_id);
+    } else {
+      emit("const {} {} = _arg_{}({});", dt, stmt->raw_name(),
+          data_type_short_name(stmt->element_type()), stmt->arg_id);
+    }
+  }
+
+  void visit(ArgStoreStmt *stmt) override
+  {
+    TI_ASSERT(!stmt->is_ptr);
+    emit("_arg_{}({}) = {};", data_type_short_name(stmt->element_type()),
+        stmt->arg_id, stmt->val->raw_name());
+  }
+
+  std::string make_kernel_name()
+  {
+    return fmt::format("{}{}", glsl_kernel_prefix_, glsl_kernel_count_++);
+  }
+
+  void generate_serial_kernel(OffloadedStmt *stmt) {
+    TI_ASSERT(stmt->task_type == OffloadedStmt::TaskType::serial);
+    const std::string glsl_kernel_name = make_kernel_name();
+    this->glsl_kernel_name_ = glsl_kernel_name;
+    emit("void {}()", glsl_kernel_name);
+    emit("{{ // serial");
+    stmt->body->accept(this);
+    emit("}}\n");
+  }
+
+
+  void visit(OffloadedStmt *stmt) override
+  {
+    TI_ASSERT(is_top_level_); // TODO(archibate): remove for nested kernel (?)
+    is_top_level_ = false;
+    using Type = OffloadedStmt::TaskType;
+    if (stmt->task_type == Type::serial) {
+      generate_serial_kernel(stmt);
+    /*} else if (stmt->task_type == Type::range_for) {
+      generate_range_for_kernel(stmt);*/
+    } else {
+      // struct_for is automatically lowered to ranged_for for dense snodes
+      // (#378). So we only need to support serial and range_for tasks.
+      TI_ERROR("Unsupported offload type={} on OpenGL arch", stmt->task_name());
+    }
+    is_top_level_ = true;
+  }
+
+
+public:
+  const std::string &kernel_source_code() const
+  {
+    return kernel_src_code_;
+  }
+
+  SSBO *create_root_ssbo()
+  {
+    static SSBO *root_ssbo;
+    if (!root_ssbo) {
+      TI_INFO("[glsl] creating root buffer of size {} B", struct_compiled_->root_size);
+      root_ssbo = new SSBO(struct_compiled_->root_size);
+    }
+    return root_ssbo;
+  }
+
+  void run(const SNode &root_snode)
+  {
+    //TI_INFO("ntm:: {}", root_snode.node_type_name);
+    root_snode_ = &root_snode;
+    root_snode_type_name_ = root_snode.node_type_name;
+    generate_header();
+    //irpass::print(kernel->ir);
+    kernel->ir->accept(this);
+    generate_bottom();
+  }
+};
+
+} // namespace
+
+void OpenglCodeGen::lower()
+{
+  auto ir = kernel_->ir;
+  const bool print_ir = prog_->config.print_ir;
+  if (print_ir) {
+    TI_TRACE("Initial IR:");
+    irpass::print(ir);
+  }
+
+  if (kernel_->grad) {
+    irpass::reverse_segments(ir);
+    irpass::re_id(ir);
+    if (print_ir) {
+      TI_TRACE("Segment reversed (for autodiff):");
+      irpass::print(ir);
+    }
+  }
+
+  irpass::lower(ir);
+  irpass::re_id(ir);
+  if (print_ir) {
+    TI_TRACE("Lowered:");
+    irpass::print(ir);
+  }
+
+  irpass::typecheck(ir);
+  irpass::re_id(ir);
+  if (print_ir) {
+    TI_TRACE("Typechecked:");
+    irpass::print(ir);
+  }
+
+  irpass::demote_dense_struct_fors(ir);
+  irpass::typecheck(ir);
+  if (print_ir) {
+    TI_TRACE("Dense Struct-for demoted:");
+    irpass::print(ir);
+  }
+
+  irpass::constant_fold(ir);
+  if (prog_->config.simplify_before_lower_access) {
+    irpass::simplify(ir);
+    irpass::re_id(ir);
+    if (print_ir) {
+      TI_TRACE("Simplified I:");
+      irpass::print(ir);
+    }
+  }
+
+  if (kernel_->grad) {
+    irpass::demote_atomics(ir);
+    irpass::full_simplify(ir, prog_->config);
+    irpass::typecheck(ir);
+    if (print_ir) {
+      TI_TRACE("Before make_adjoint:");
+      irpass::print(ir);
+    }
+    irpass::make_adjoint(ir);
+    if (print_ir) {
+      TI_TRACE("After make_adjoint:");
+      irpass::print(ir);
+    }
+    irpass::typecheck(ir);
+  }
+
+  irpass::lower_access(ir, prog_->config.use_llvm);
+  irpass::re_id(ir);
+  if (print_ir) {
+    TI_TRACE("Access Lowered:");
+    irpass::print(ir);
+  }
+
+  irpass::die(ir);
+  irpass::re_id(ir);
+  if (print_ir) {
+    TI_TRACE("DIEd:");
+    irpass::print(ir);
+  }
+
+  irpass::flag_access(ir);
+  irpass::re_id(ir);
+  if (print_ir) {
+    TI_TRACE("Access Flagged:");
+    irpass::print(ir);
+  }
+
+  irpass::constant_fold(ir);
+  if (print_ir) {
+    TI_TRACE("Constant folded:");
+    irpass::re_id(ir);
+    irpass::print(ir);
+  }
+
+  global_tmps_buffer_size_ =
+      std::max(irpass::offload(ir).total_size, (size_t)(1));
+  if (print_ir) {
+    TI_TRACE("Offloaded:");
+    irpass::re_id(ir);
+    irpass::print(ir);
+  }
+
+  irpass::full_simplify(ir, prog_->config);
+  if (print_ir) {
+    TI_TRACE("Simplified II:");
+    irpass::re_id(ir);
+    irpass::print(ir);
+  }
+
+  irpass::demote_atomics(ir);
+  if (print_ir) {
+    TI_TRACE("Atomics demoted:");
+    irpass::re_id(ir);
+    irpass::print(ir);
+  }
+}
+
+FunctionType OpenglCodeGen::gen(void)
+{
+  KernelGen codegen(kernel_, kernel_name_, struct_compiled_);
+  codegen.run(*prog_->snode_root);
+  SSBO *root_sb = codegen.create_root_ssbo();
+  const std::string kernel_source_code = codegen.kernel_source_code();
+  //TI_INFO("source of kernel [{}]:\n{}", kernel_name_, kernel_source_code);
+
+  return [kernel_source_code, root_sb](Context &ctx) {
+    // TODO(archibate): find out where get_arg<uint64_t> stored, and just new SSBO(ctx)
+    SSBO *arg_sb = new SSBO(taichi_max_num_args * sizeof(uint64_t));
+    arg_sb->load_arguments_from(ctx);
+    std::vector<IOV> iov = {*arg_sb, *root_sb};
+    /*TI_INFO("data[0] = {}", ((int*)root_sb->data)[0]);
+    TI_INFO("data[1] = {}", ((int*)root_sb->data)[1]);
+    TI_INFO("args[0] = {}", ((uint64_t*)arg_sb->data)[0]);
+    TI_INFO("args[1] = {}", ((uint64_t*)arg_sb->data)[1]);*/
+    launch_glsl_kernel(kernel_source_code, iov);
+    /*TI_INFO("data[0] = {}", ((int*)root_sb->data)[0]);
+    TI_INFO("data[1] = {}", ((int*)root_sb->data)[1]);
+    TI_INFO("args[0] = {}", ((uint64_t*)arg_sb->data)[0]);
+    TI_INFO("args[1] = {}", ((uint64_t*)arg_sb->data)[1]);*/
+    arg_sb->save_returns_to(ctx);
+  };
+}
+
+FunctionType OpenglCodeGen::compile(Program &program, Kernel &kernel)
+{
+  this->prog_ = &program;
+  this->kernel_ = &kernel;
+
+  this->lower();
+  return this->gen();
+}
+
+} // namespace opengl
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/codegen_opengl.h b/taichi/backends/codegen_opengl.h
new file mode 100644
index 0000000000000..49a669408fd5e
--- /dev/null
+++ b/taichi/backends/codegen_opengl.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <taichi/common.h>
+#include <taichi/constants.h>
+#include <taichi/tlang_util.h>
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "base.h"
+#include "kernel.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+
+class OpenglCodeGen {
+ public:
+  OpenglCodeGen(const std::string &kernel_name,
+      const StructCompiledResult *struct_compiled)
+    : kernel_name_(kernel_name), struct_compiled_(struct_compiled)
+  {}
+
+  FunctionType compile(Program &program, Kernel &kernel);
+
+ private:
+  void lower();
+  FunctionType gen();
+
+  const std::string kernel_name_;
+
+  Program *prog_;
+  Kernel *kernel_;
+  const StructCompiledResult *struct_compiled_;
+  size_t global_tmps_buffer_size_{0};
+};
+
+} // namespace opengl
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/struct_opengl.cpp b/taichi/backends/struct_opengl.cpp
new file mode 100644
index 0000000000000..ef7597d8aea0d
--- /dev/null
+++ b/taichi/backends/struct_opengl.cpp
@@ -0,0 +1,91 @@
+#include "struct_opengl.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+
+OpenglStructCompiler::CompiledResult OpenglStructCompiler::run(SNode &node)
+{
+  TI_ASSERT(node.type == SNodeType::root);
+  collect_snodes(node);
+  // The host side has run this!
+  // infer_snode_properties(node);
+
+  auto snodes_rev = snodes_;
+  std::reverse(snodes_rev.begin(), snodes_rev.end());
+
+  for (auto &n : snodes_rev) {
+    generate_types(*n);
+  }
+  CompiledResult result;
+  result.source_code = std::move(src_code_);
+  result.root_size = compute_snode_size(node);
+  return result;
+}
+
+void OpenglStructCompiler::collect_snodes(SNode &snode) {
+  snodes_.push_back(&snode);
+  for (int ch_id = 0; ch_id < (int)snode.ch.size(); ch_id++) {
+    auto &ch = snode.ch[ch_id];
+    collect_snodes(*ch);
+  }
+}
+// TODO(archibate): really need fit struct_metal so much?
+void OpenglStructCompiler::generate_types(const SNode &snode) {
+  const bool is_place = snode.is_place();
+  if (!is_place) {
+    const std::string class_name = snode.node_type_name + "_ch";
+    emit("#define {} int", class_name);
+    std::string stride_str;
+    for (int i = 0; i < (int)snode.ch.size(); i++) {
+      const auto &ch_node_name = snode.ch[i]->node_type_name;
+      if (stride_str.empty()) {
+        emit("#define {}_get{}(a_) (a_) // {}",
+            snode.node_type_name, i, ch_node_name);
+        stride_str = ch_node_name + "_stride";
+      } else {
+        emit("#define {}_get{}(a_) ((a_) + ({})) // {}",
+            snode.node_type_name, i, stride_str, ch_node_name);
+        stride_str += " + " + ch_node_name + "_stride";
+      }
+    }
+    if (stride_str.empty()) {
+      // Is it possible for this to have no children?
+      stride_str = "0";
+    }
+    emit("#define {}_stride ({})", class_name, stride_str);
+  }
+  emit("");
+  const auto &node_name = snode.node_type_name;
+  if (is_place) {
+    const auto dt_name = opengl_data_type_name(snode.dt);
+    emit("#define {} int // place {}", node_name, dt_name);
+    emit("#define {}_stride {} // sizeof({})", node_name, data_type_size(snode.dt), dt_name);
+  } else if (snode.type == SNodeType::dense || snode.type == SNodeType::root) {
+    emit("#define {} int // {}", node_name, snode_type_name(snode.type));
+    const int n = (snode.type == SNodeType::dense) ? snode.n : 1;
+    emit("#define {}_n {}", node_name, n);
+    emit("#define {}_stride ({}_ch_stride * {}_n)", node_name, node_name, node_name);
+    emit("#define {}_children(a_, i) ((a_) + {}_ch_stride * (i))", node_name, node_name);
+  } else {
+    TI_ERROR("SNodeType={} not supported on OpenGL",
+             snode_type_name(snode.type));
+    TI_NOT_IMPLEMENTED;
+  }
+  emit("");
+}
+
+size_t OpenglStructCompiler::compute_snode_size(const SNode &sn) {
+  if (sn.is_place()) {
+    return data_type_size(sn.dt);
+  }
+  size_t ch_size = 0;
+  for (const auto &ch : sn.ch) {
+    ch_size += compute_snode_size(*ch);
+  }
+  const int n = (sn.type == SNodeType::dense) ? sn.n : 1;
+  return n * ch_size;
+}
+
+
+}  // namespace opengl
+TLANG_NAMESPACE_END
diff --git a/taichi/backends/struct_opengl.h b/taichi/backends/struct_opengl.h
new file mode 100644
index 0000000000000..60c0e7e05da70
--- /dev/null
+++ b/taichi/backends/struct_opengl.h
@@ -0,0 +1,40 @@
+// Codegen for the hierarchical data structure
+#pragma once
+
+#include <taichi/platform/opengl/opengl_kernel_util.h>
+#include <taichi/platform/opengl/opengl_data_types.h>
+#include <taichi/snode.h>
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "base.h"
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+
+class OpenglStructCompiler {
+ public:
+  using CompiledResult = opengl::StructCompiledResult;
+
+  CompiledResult run(SNode &node);
+
+ private:
+  void collect_snodes(SNode &snode);
+  void generate_types(const SNode &snode);
+  size_t compute_snode_size(const SNode &sn);
+
+  template <typename... Args>
+  void emit(std::string f, Args &&... args)
+  {
+    src_code_ += fmt::format(f, std::forward<Args>(args)...) + '\n';
+  }
+
+  std::vector<SNode *> snodes_;
+  std::string src_code_;
+};
+
+}  // namespace opengl
+TLANG_NAMESPACE_END
diff --git a/taichi/platform/opengl/opengl_api.cpp b/taichi/platform/opengl/opengl_api.cpp
new file mode 100644
index 0000000000000..12f3f78a0407e
--- /dev/null
+++ b/taichi/platform/opengl/opengl_api.cpp
@@ -0,0 +1,289 @@
+#include "opengl_api.h"
+
+#ifdef TI_WITH_OPENGL
+
+#include <GL/glew.h>
+#include <GLFW/glfw3.h>
+#endif
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+
+#ifdef TI_WITH_OPENGL
+void glapi_set_uniform(GLuint loc, float value)
+{
+  glUniform1f(loc, value);
+}
+
+static std::string add_line_markers(std::string x)
+{
+  std::string marker;
+  size_t pos = 0, npos;
+  int line = 0;
+  while (1) {
+    npos = x.find_first_of('\n', pos);
+    marker = fmt::format("{:3d} ", ++line);
+    if (npos == std::string::npos)
+      break;
+    x.insert(pos, marker);
+    pos = npos + 1 + marker.size();
+  }
+  return x;
+}
+
+struct GLShader
+{
+  GLuint id_;
+
+  GLShader(GLuint type = GL_COMPUTE_SHADER)
+  {
+    id_ = glCreateShader(type);
+  }
+
+  GLShader(std::string source, GLuint type = GL_COMPUTE_SHADER)
+    : GLShader(type)
+  {
+    this->compile(source);
+  }
+
+  ~GLShader()
+  {
+    glDeleteShader(id_);
+  }
+
+  GLShader &compile(const std::string &source)
+  {
+    const GLchar *source_cstr = source.c_str();
+    glShaderSource(id_, 1, &source_cstr, nullptr);
+
+    glCompileShader(id_);
+    GLint status = GL_TRUE;
+    glGetShaderiv(id_, GL_COMPILE_STATUS, &status);
+    if (status != GL_TRUE) {
+      GLsizei logLength;
+      glGetShaderiv(id_, GL_INFO_LOG_LENGTH, &logLength);
+      auto log = std::vector<GLchar>(logLength + 1);
+      glGetShaderInfoLog(id_, logLength, &logLength, log.data());
+      log[logLength] = 0;
+      TI_ERROR("[glsl] error while compiling shader:\n{}\n{}",
+          add_line_markers(source), log.data());
+    }
+    return *this;
+  }
+};
+
+struct GLProgram
+{
+  GLuint id_;
+
+  GLProgram()
+  {
+    id_ = glCreateProgram();
+  }
+
+  GLProgram(GLShader &shader)
+    : GLProgram()
+  {
+    this->attach(shader);
+  }
+
+  ~GLProgram()
+  {
+    glDeleteProgram(id_);
+  }
+
+  GLProgram &attach(GLShader &shader)
+  {
+    glAttachShader(id_, shader.id_);
+    return *this;
+  }
+
+  GLProgram &link()
+  {
+    glLinkProgram(id_);
+    GLint status = GL_TRUE;
+    glGetProgramiv(id_, GL_LINK_STATUS, &status);
+    if (status != GL_TRUE) {
+      GLsizei logLength;
+      glGetProgramiv(id_, GL_INFO_LOG_LENGTH, &logLength);
+      auto log = std::vector<GLchar>(logLength + 1);
+      glGetProgramInfoLog(id_, logLength, &logLength, log.data());
+      log[logLength] = 0;
+      TI_ERROR("[glsl] error while linking program:\n{}", log.data());
+    }
+    return *this;
+  }
+
+  GLProgram &use()
+  {
+    glUseProgram(id_);
+    return *this;
+  }
+
+  template <typename T>
+  void set_uniform(std::string name, T value)
+  {
+    GLuint loc = glGetUniformLocation(id_, name.c_str());
+    glapi_set_uniform(loc, value);
+  }
+};
+
+
+// https://blog.csdn.net/ylbs110/article/details/52074826
+// https://www.khronos.org/opengl/wiki/Shader_Storage_Buffer_Object
+// This is Shader Storage Buffer, we use it to share data between CPU & GPU
+struct GLSSBO
+{
+  GLuint id_;
+
+  GLSSBO()
+  {
+    glGenBuffers(1, &id_);
+  }
+
+  ~GLSSBO()
+  {
+    glDeleteBuffers(1, &id_);
+  }
+
+  /***
+   GL_{frequency}_{nature}:
+
+
+   STREAM
+       The data store contents will be modified once and used at most a few times.
+
+   STATIC
+       The data store contents will be modified once and used many times.
+
+   DYNAMIC
+       The data store contents will be modified repeatedly and used many times.
+
+
+   DRAW
+       The data store contents are modified by the application, and used as the source
+       for GL drawing and image specification commands.
+
+   READ
+       The data store contents are modified by reading data from the GL, and used to
+       return that data when queried by the application.
+
+   COPY
+       The data store contents are modified by reading data from the GL, and used as the
+       source for GL drawing and image specification commands.
+   ***/
+
+  GLSSBO &bind_data(void *data, size_t size, GLuint usage = GL_STATIC_READ)
+  {
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, size, data, usage);
+    return *this;
+  }
+
+  GLSSBO &bind_index(size_t index)
+  {
+    // SSBO index, is `layout(std430, binding = <index>)` in shader.
+    // We use only one SSBO though...
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, id_);
+    return *this;
+  }
+
+  void *map(size_t offset, size_t length, GLbitfield access = GL_MAP_READ_BIT)
+  {
+    // map GPU memory to CPU address space, offset within SSBO data
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
+    return glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, length, access);
+  }
+
+  void *map(GLbitfield access = GL_MAP_READ_BIT)
+  {
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
+    return glMapBuffer(GL_SHADER_STORAGE_BUFFER, access);
+  }
+};
+
+void initialize_opengl()
+{
+  static bool gl_inited = false;
+  if (gl_inited)
+    return;
+  TI_WARN("OpenGL backend currently WIP, MAY NOT WORK");
+  gl_inited = true;
+
+  glfwInit();
+  // Compute Shader requires OpenGL 4.3+ (or OpenGL ES 3.1+)
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+  // GLEW cannot load GL without a context
+  // And the best way to make context is by creating a window
+  // Then hide it immediately, LOL
+  GLFWwindow *window = glfwCreateWindow(1, 1, "Make GLEW Happy", nullptr, nullptr);
+  if (!window) {
+    const char *desc = nullptr;
+    GLint status = glfwGetError(&desc);
+    if (!desc) desc = "Unknown Error";
+    TI_ERROR("[glsl] cannot create GLFW window: error {}: {}", status, desc);
+  }
+  glfwHideWindow(window);
+  glfwMakeContextCurrent(window);
+  GLint status = glewInit();
+  if (status != GLEW_OK) {
+    TI_ERROR("[glsl] cannot initialize GLEW: {}", glewGetErrorString(status));
+  }
+  const char *gl_version = (const char *)glGetString(GL_VERSION);
+  if (!gl_version) {
+    TI_WARN("[glsl] cannot get OpenGL version");
+  } else {
+    TI_INFO("[glsl] OpenGL {}", gl_version);
+  }
+}
+
+void launch_glsl_kernel(std::string source, std::vector<IOV> iov)
+{
+  GLShader shader(source);
+  GLProgram program(shader);
+  program.link();
+  program.use();
+
+  std::vector<GLSSBO> ssbo(iov.size());
+  for (int i = 0; i < ssbo.size(); i++) {
+    ssbo[i].bind_index(i);
+    ssbo[i].bind_data(iov[i].base, iov[i].size, GL_DYNAMIC_READ); // input
+  }
+
+  // https://www.khronos.org/opengl/wiki/Compute_Shader
+  // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
+  // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf
+  //
+  // `glDispatchCompute(X, Y, Z)`   - the X*Y*Z  == `Blocks`   in CUDA
+  // `layout(local_size_x = X) in;` - the X      == `Threads`  in CUDA
+  //
+  glDispatchCompute(1, 1, 1);
+  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); // TODO(archibate): move to Program::synchroize()
+
+  for (int i = 0; i < ssbo.size(); i++) {
+    void *p = ssbo[i].map(0, iov[i].size); // output
+    std::memcpy(iov[i].base, p, iov[i].size);
+    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+  }
+}
+
+bool is_opengl_api_available()
+{
+  return true;
+}
+
+#else
+void launch_glsl_kernel(std::string source, std::vector<IOV> iov)
+{
+  TI_NOT_IMPLEMENTED
+}
+
+bool is_opengl_api_available()
+{
+  return false;
+}
+#endif
+
+}
+TLANG_NAMESPACE_END
diff --git a/taichi/platform/opengl/opengl_api.h b/taichi/platform/opengl/opengl_api.h
new file mode 100644
index 0000000000000..0eb23347fc3bb
--- /dev/null
+++ b/taichi/platform/opengl/opengl_api.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <taichi/common.h>
+#include <taichi/common/util.h>
+
+#include <string>
+#include <vector>
+
+#include "opengl_kernel_util.h"
+
+TLANG_NAMESPACE_BEGIN
+
+namespace opengl {
+
+void initialize_opengl();
+bool is_opengl_api_available();
+void launch_glsl_kernel(std::string source, std::vector<IOV> iov);
+
+}  // namespace opengl
+
+TLANG_NAMESPACE_END
diff --git a/taichi/platform/opengl/opengl_data_types.h b/taichi/platform/opengl/opengl_data_types.h
new file mode 100644
index 0000000000000..8df36e4f0a736
--- /dev/null
+++ b/taichi/platform/opengl/opengl_data_types.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <taichi/tlang_util.h>
+#include <string>
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+
+inline std::string opengl_data_type_name(DataType dt)
+{
+  // https://www.khronos.org/opengl/wiki/Data_Type_(GLSL)
+  switch (dt) {
+    case DataType::f32:
+      return "float";
+    case DataType::f64:
+      return "double";
+    case DataType::i32:
+      return "int";
+    default:
+      TI_NOT_IMPLEMENTED;
+      break;
+  }
+  return "";
+}
+
+inline bool is_opengl_binary_op_infix(BinaryOpType type)
+{
+  return !((type == BinaryOpType::min) || (type == BinaryOpType::max) ||
+           (type == BinaryOpType::atan2) || (type == BinaryOpType::pow));
+}
+
+}  // namespace opengl
+TLANG_NAMESPACE_END
diff --git a/taichi/platform/opengl/opengl_kernel.cpp b/taichi/platform/opengl/opengl_kernel.cpp
new file mode 100644
index 0000000000000..36adc6141b6c4
--- /dev/null
+++ b/taichi/platform/opengl/opengl_kernel.cpp
@@ -0,0 +1,33 @@
+#include "opengl_kernel.h"
+#include "opengl_api.h"
+
+#include <taichi/backends/base.h>
+#include <taichi/backends/kernel.h>
+
+TLANG_NAMESPACE_BEGIN
+namespace opengl {
+
+SSBO::SSBO(size_t data_size_)
+  : data_(data_size_), data_size(data_size_)
+{}
+
+void SSBO::load_arguments_from(Context &ctx)
+{
+  uint64_t *data_i = (uint64_t *)data();
+  for (int i = 0; i < taichi_max_num_args; i++) {
+    uint64_t value = ctx.get_arg<uint64_t>(i);
+    data_i[i] = value;
+  }
+}
+
+void SSBO::save_returns_to(Context &ctx)
+{
+  uint64_t *data_i = (uint64_t *)data();
+  for (int i = 0; i < taichi_max_num_args; i++) {
+    uint64_t value = data_i[i];
+    ctx.set_arg<uint64_t>(i, value);
+  }
+}
+
+}  // namespace opengl
+TLANG_NAMESPACE_END
diff --git a/taichi/platform/opengl/opengl_kernel.h b/taichi/platform/opengl/opengl_kernel.h
new file mode 100644
index 0000000000000..d95449b0bbdc4
--- /dev/null
+++ b/taichi/platform/opengl/opengl_kernel.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <taichi/common.h>
+#include <taichi/constants.h>
+#include <taichi/tlang_util.h>
+
+#include <string>
+#include <vector>
+
+#include "opengl_kernel_util.h"
+
+
+TLANG_NAMESPACE_BEGIN
+
+namespace opengl {
+
+struct SSBO
+{
+  std::vector<uint8_t> data_;
+  const size_t data_size;
+
+  SSBO(size_t data_size);
+
+  void load_arguments_from(Context &ctx);
+  void save_returns_to(Context &ctx);
+  inline void *data()
+  {
+    return (void *)data_.data();
+  }
+
+  inline operator IOV()
+  {
+    return IOV{data(), data_size};
+  }
+};
+
+}  // namespace opengl
+
+TLANG_NAMESPACE_END
diff --git a/taichi/platform/opengl/opengl_kernel_util.h b/taichi/platform/opengl/opengl_kernel_util.h
new file mode 100644
index 0000000000000..7c703683c4423
--- /dev/null
+++ b/taichi/platform/opengl/opengl_kernel_util.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include <taichi/statements.h>
+
+TLANG_NAMESPACE_BEGIN
+
+class SNode;
+
+namespace opengl {
+
+struct StructCompiledResult {
+  // Source code of the SNode data structures compiled to GLSL
+  std::string source_code;
+  // Root buffer size in bytes.
+  size_t root_size;
+};
+
+struct IOV
+{
+  void *base;
+  size_t size;
+};
+
+}  // namespace opengl
+
+TLANG_NAMESPACE_END
diff --git a/taichi/profiler.cpp b/taichi/profiler.cpp
index e7352dcea443f..e822119cd7021 100644
--- a/taichi/profiler.cpp
+++ b/taichi/profiler.cpp
@@ -149,7 +149,7 @@ class CUDAProfiler : public ProfilerBase {
 }  // namespace
 
 std::unique_ptr<ProfilerBase> make_profiler(Arch arch) {
-  if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal) {
+  if (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::metal || arch == Arch::opengl) {
     return std::make_unique<DefaultProfiler>(arch);
   } else if (arch == Arch::cuda) {
     return std::make_unique<CUDAProfiler>();
diff --git a/taichi/program.cpp b/taichi/program.cpp
index 89def61830131..f1dce2960855c 100644
--- a/taichi/program.cpp
+++ b/taichi/program.cpp
@@ -4,12 +4,15 @@
 
 #include <taichi/common/task.h>
 #include <taichi/platform/metal/metal_api.h>
+#include <taichi/platform/opengl/opengl_api.h>
 
 #include "backends/codegen_cuda.h"
 #include "backends/codegen_metal.h"
+#include "backends/codegen_opengl.h"
 #include "backends/codegen_x86.h"
 #include "backends/struct.h"
 #include "backends/struct_metal.h"
+#include "backends/struct_opengl.h"
 #include "unified_allocator.h"
 #include "snode.h"
 
@@ -39,7 +42,7 @@ Program::Program(Arch arch) {
 #if !defined(TI_WITH_CUDA)
   if (arch == Arch::cuda) {
     TI_WARN("Taichi is not compiled with CUDA.");
-    TI_WARN("Falling back to x86_64");
+    TI_WARN("Falling back to x64");
     arch = Arch::x64;
   }
 #else
@@ -47,14 +50,20 @@ Program::Program(Arch arch) {
     cuda_context = std::make_unique<CUDAContext>();
     if (!cuda_context->detected()) {
       TI_WARN("No CUDA device detected.");
-      TI_WARN("Falling back to x86_64");
+      TI_WARN("Falling back to x64");
       arch = Arch::x64;
     }
   }
 #endif
   if (arch == Arch::metal) {
     if (!metal::is_metal_api_available()) {
-      TI_WARN("No Metal API detected, falling back to x86_64");
+      TI_WARN("No Metal API detected, falling back to x64");
+      arch = Arch::x64;
+    }
+  }
+  if (arch == Arch::opengl) {
+    if (!opengl::is_opengl_api_available()) {
+      TI_WARN("No OpenGL API detected, falling back to x64");
       arch = Arch::x64;
     }
   }
@@ -96,6 +105,9 @@ FunctionType Program::compile(Kernel &kernel) {
   } else if (kernel.arch == Arch::metal) {
     metal::MetalCodeGen codegen(kernel.name, &metal_struct_compiled_.value());
     ret = codegen.compile(*this, kernel, metal_runtime_.get());
+  } else if (kernel.arch == Arch::opengl) {
+    opengl::OpenglCodeGen codegen(kernel.name, &opengl_struct_compiled_.value());
+    ret = codegen.compile(*this, kernel);
   } else {
     TI_NOT_IMPLEMENTED;
   }
@@ -173,7 +185,7 @@ void Program::initialize_runtime_system(StructCompiler *scomp) {
 }
 
 void Program::materialize_layout() {
-  // always use arch=x86_64 since this is for host accessors
+  // always use arch=x64 since this is for host accessors
   // TODO: arch may also be arm etc.
   std::unique_ptr<StructCompiler> scomp = StructCompiler::make(this, Arch::x64);
   scomp->run(*snode_root, true);
@@ -183,6 +195,7 @@ void Program::materialize_layout() {
     initialize_runtime_system(scomp.get());
   }
 
+  TI_INFO("materialize_layout called");
   if (config.arch == Arch::cuda && config.use_llvm) {
     initialize_device_llvm_context();
     // llvm_context_device->get_init_module();
@@ -204,6 +217,11 @@ void Program::materialize_layout() {
       params.profiler = profiler.get();
       metal_runtime_ = std::make_unique<metal::MetalRuntime>(std::move(params));
     }
+  } else if (config.arch == Arch::opengl) {
+    opengl::OpenglStructCompiler scomp;
+    opengl_struct_compiled_ = scomp.run(*snode_root);
+    TI_INFO("OpenGL root buffer size: {} B", opengl_struct_compiled_->root_size);
+    opengl::initialize_opengl();
   }
 }
 
@@ -311,6 +329,15 @@ void Program::initialize_device_llvm_context() {
   }
 }
 
+Arch Program::get_snode_accessor_arch()
+{
+  if (config.arch == Arch::opengl) {
+    return Arch::opengl;
+  } else {
+    return get_host_arch();
+  }
+}
+
 Kernel &Program::get_snode_reader(SNode *snode) {
   TI_ASSERT(snode->type == SNodeType::place);
   auto kernel_name = fmt::format("snode_reader_{}", snode->id);
@@ -323,7 +350,7 @@ Kernel &Program::get_snode_reader(SNode *snode) {
         snode->num_active_indices, load_if_ptr((snode->expr)[indices]));
     current_ast_builder().insert(std::move(ret));
   });
-  ker.set_arch(get_host_arch());
+  ker.set_arch(get_snode_accessor_arch());
   ker.name = kernel_name;
   ker.is_accessor = true;
   for (int i = 0; i < snode->num_active_indices; i++)
@@ -344,7 +371,7 @@ Kernel &Program::get_snode_writer(SNode *snode) {
     (snode->expr)[indices] =
         Expr::make<ArgLoadExpression>(snode->num_active_indices);
   });
-  ker.set_arch(get_host_arch());
+  ker.set_arch(get_snode_accessor_arch());
   ker.name = kernel_name;
   ker.is_accessor = true;
   for (int i = 0; i < snode->num_active_indices; i++)
diff --git a/taichi/program.h b/taichi/program.h
index abf2433172050..982f0c72b1df8 100644
--- a/taichi/program.h
+++ b/taichi/program.h
@@ -17,6 +17,7 @@
 #include <optional>
 #include <taichi/platform/metal/metal_kernel_util.h>
 #include <taichi/platform/metal/metal_runtime.h>
+#include <taichi/platform/opengl/opengl_kernel_util.h>
 
 #if defined(TI_PLATFORM_UNIX)
 #include <dlfcn.h>
@@ -158,6 +159,8 @@ class Program {
     return Arch::x64;
   }
 
+  Arch get_snode_accessor_arch();
+
   float64 get_total_compilation_time() {
     return total_compilation_time;
   }
@@ -168,6 +171,7 @@ class Program {
 
  private:
   std::optional<metal::StructCompiledResult> metal_struct_compiled_;
+  std::optional<opengl::StructCompiledResult> opengl_struct_compiled_;
   std::unique_ptr<metal::MetalRuntime> metal_runtime_;
 };
 
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index 18ae9bbf28b9f..d378421dd3bf1 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -13,6 +13,7 @@
 #include <taichi/system/memory.h>
 #include <taichi/system/unit_dll.h>
 #include <taichi/platform/metal/metal_api.h>
+#include <taichi/platform/opengl/opengl_api.h>
 #if defined(TI_WITH_CUDA)
 #include <cuda_runtime_api.h>
 #endif
@@ -159,6 +160,7 @@ void export_misc(py::module &m) {
   });
   m.def("with_cuda", with_cuda);
   m.def("with_metal", taichi::Tlang::metal::is_metal_api_available);
+  m.def("with_opengl", taichi::Tlang::opengl::is_opengl_api_available);
 }
 
 TI_NAMESPACE_END
diff --git a/test_abs.py b/test_abs.py
new file mode 100644
index 0000000000000..a8c534b4c6a5f
--- /dev/null
+++ b/test_abs.py
@@ -0,0 +1,35 @@
+import taichi as ti
+
+@ti.all_archs
+def test_abs():
+  x = ti.var(ti.f32)
+  y = ti.var(ti.f32)
+
+  N = 16
+
+  ti.root.dense(ti.i, N).place(x)
+  ti.root.dense(ti.i, N).place(y)
+  ti.root.lazy_grad()
+
+  @ti.kernel
+  def func():
+    for i in range(N):
+      x[i] = ti.abs(y[i])
+
+  for i in range(N):
+    y[i] = i - 10
+    x.grad[i] = 1
+
+  func()
+  func.grad()
+
+  def sgn(x):
+    if x > 0:
+      return 1
+    if x < 0:
+      return -1
+    return 0
+
+  for i in range(N):
+    assert x[i] == abs(y[i])
+    assert y.grad[i] == sgn(y[i])