diff --git a/taichi/backends/codegen_opengl.cpp b/taichi/backends/codegen_opengl.cpp
index a27984e865e3c7..414b61b03699cb 100644
--- a/taichi/backends/codegen_opengl.cpp
+++ b/taichi/backends/codegen_opengl.cpp
@@ -3,12 +3,59 @@
 #include <taichi/platform/opengl/opengl_data_types.h>
 
 #include <string>
+#include <cstdlib>
 #include <taichi/ir.h>
 
 TLANG_NAMESPACE_BEGIN
 namespace opengl {
 namespace {
 
+// Host-side staging buffer whose bytes are handed to the GLSL kernel as one SSBO.
+struct SSBO
+{
+  void *data;              // malloc'ed storage, released in the destructor
+  const size_t data_size;  // size of `data` in bytes
+
+  // Allocates `data_size` bytes of uninitialized host memory.
+  explicit SSBO(size_t data_size)
+    : data(nullptr), data_size(data_size)
+  {
+    TI_INFO("[glsl] Allocating {} B SSBO", data_size);
+    data = std::malloc(data_size);
+    TI_ASSERT(data != nullptr || data_size == 0);
+  }
+
+  // The buffer is owned through a raw pointer, so a copy would double-free it.
+  SSBO(const SSBO &) = delete;
+  SSBO &operator=(const SSBO &) = delete;
+
+  // Packs every kernel argument slot of `ctx` into the buffer as a uint64_t.
+  void load_arguments_from(Context &ctx)
+  {
+    TI_ASSERT(data_size >= taichi_max_num_args * sizeof(uint64_t));
+    uint64_t *data_i = (uint64_t *)data;
+    for (int i = 0; i < taichi_max_num_args; i++)
+      data_i[i] = ctx.get_arg<uint64_t>(i);
+  }
+
+  // Writes every uint64_t slot of the buffer back into the argument slots of `ctx`.
+  void save_returns_to(Context &ctx)
+  {
+    TI_ASSERT(data_size >= taichi_max_num_args * sizeof(uint64_t));
+    const uint64_t *data_i = (const uint64_t *)data;
+    for (int i = 0; i < taichi_max_num_args; i++)
+      ctx.set_arg<uint64_t>(i, data_i[i]);
+  }
+
+  // Overwrites the whole buffer with `data_size` bytes read from `data_r`.
+  void update(void *data_r) { std::memcpy(data, data_r, data_size); }
+
+  // Non-owning (pointer, size) view passed to launch_glsl_kernel().
+  operator IOV() { return IOV{data, data_size}; }
+
+  ~SSBO() { std::free(data); }
+};
+
 class KernelGen : public IRVisitor
 {
   Kernel *kernel;
@@ -285,7 +332,7 @@ class KernelGen : public IRVisitor
 
   void visit(OffloadedStmt *stmt) override
   {
-    TI_ASSERT(is_top_level_);
+    TI_ASSERT(is_top_level_); // TODO(archibate): remove for nested kernel (?)
     is_top_level_ = false;
     using Type = OffloadedStmt::TaskType;
     if (stmt->task_type == Type::serial) {
@@ -307,6 +354,11 @@ class KernelGen : public IRVisitor
     return kernel_src_code_;
   }
 
+  SSBO *create_root_ssbo()  // returns a `new`-allocated SSBO of struct_compiled_->root_size bytes; ownership passes to the caller
+  {
+    return new SSBO(struct_compiled_->root_size);
+  }
+
   void run(const SNode &root_snode)
   {
     //TI_INFO("ntm:: {}", root_snode.node_type_name);
@@ -437,39 +489,22 @@ void OpenglCodeGen::lower()
   }
 }
 
-void load_data(Context &ctx, void *data)
-{
-  int *data_ = (int *)data;
-  for (int i = 0; i < taichi_max_num_args; i++) {
-    int value = ctx.get_arg<int>(i);
-    data_[i] = value;
-  }
-}
-
-void save_data(Context &ctx, void *data)
-{
-  int *data_ = (int *)data;
-  for (int i = 0; i < taichi_max_num_args; i++) {
-    int value = data_[i];
-    ctx.set_arg<int>(i, value);
-  }
-}
-
 FunctionType OpenglCodeGen::gen(void)
 {
   KernelGen codegen(kernel_, kernel_name_, struct_compiled_);
   codegen.run(*prog_->snode_root);
+  SSBO *root_sb = codegen.create_root_ssbo();  // NOTE(review): never deleted here; must outlive every call of the returned lambda
   const std::string kernel_source_code = codegen.kernel_source_code();
-  //TI_INFO("\n{}", kernel_source_code);
-
-  return [kernel_source_code](Context &ctx) {
-    void *data, *data_r;
-    size_t data_size = 1024; // ...
-    data = malloc(data_size);
-    load_data(ctx, data);
-    data_r = launch_glsl_kernel(kernel_source_code, data, data_size);
-    free(data);
-    save_data(ctx, data_r);
+
+  return [kernel_source_code, root_sb](Context &ctx) {
+    // TODO(archibate): find out where get_arg<int> stored, and just new SSBO(ctx)
+    SSBO arg_sb(taichi_max_num_args * sizeof(uint64_t));  // on the stack, so it is freed on every return instead of leaking per launch
+    arg_sb.load_arguments_from(ctx);
+    std::vector<IOV> iov = {arg_sb, *root_sb};  // buffer 0 = arguments, buffer 1 = root
+    std::vector<void *> res = launch_glsl_kernel(kernel_source_code, iov);
+    arg_sb.update(res[0]);  // copy the mapped argument buffer back to host memory before unmapping
+    arg_sb.save_returns_to(ctx);
+    unmap_all_ssbo();
   };
 }
 
diff --git a/taichi/platform/opengl/opengl_api.cpp b/taichi/platform/opengl/opengl_api.cpp
index 53628053f64c06..b5893a90b715d3 100644
--- a/taichi/platform/opengl/opengl_api.cpp
+++ b/taichi/platform/opengl/opengl_api.cpp
@@ -129,6 +129,7 @@ struct GLProgram
   }
 };
 
+
 // https://blog.csdn.net/ylbs110/article/details/52074826
 // https://www.khronos.org/opengl/wiki/Shader_Storage_Buffer_Object
 // This is Shader Storage Buffer, we use it to share data between CPU & GPU
@@ -191,8 +192,15 @@ struct GLSSBO
   void *map(size_t offset, size_t length, GLbitfield access = GL_MAP_READ_BIT)
   {
     // map GPU memory to CPU address space, offset within SSBO data
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);  // glMapBufferRange acts on whichever buffer is bound to this target, so bind ours first
     return glMapBufferRange(GL_SHADER_STORAGE_BUFFER, offset, length, access);
   }
+
+  void *map(GLenum access = GL_READ_ONLY)  // maps the whole buffer; glMapBuffer wants GL_READ_ONLY/GL_WRITE_ONLY/GL_READ_WRITE, not GL_MAP_*_BIT flags
+  {
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);  // glMapBuffer acts on the buffer bound to this target
+    return glMapBuffer(GL_SHADER_STORAGE_BUFFER, access);
+  }
 };
 
 void initialize_opengl()
@@ -227,7 +235,7 @@ void initialize_opengl()
     }
 }
 
-void *launch_glsl_kernel(std::string source, void *data, size_t data_size)
+std::vector<void *> launch_glsl_kernel(std::string source, std::vector<IOV> iov)
 {
   static bool gl_inited = false;
   if (!gl_inited) {
@@ -241,9 +249,11 @@ void *launch_glsl_kernel(std::string source, void *data, size_t data_size)
   program.link();
   program.use();
 
-  GLSSBO ssbo;
-  ssbo.bind_index(0);
-  ssbo.bind_data(data, data_size, GL_DYNAMIC_READ); // input
+  std::vector<GLSSBO> ssbo(iov.size());
+  for (int i = 0; i < ssbo.size(); i++) {
+    ssbo[i].bind_index(i);
+    ssbo[i].bind_data(iov[i].base, iov[i].size, GL_DYNAMIC_READ); // input
+  }
 
   // https://www.khronos.org/opengl/wiki/Compute_Shader
   // https://community.arm.com/developer/tools-software/graphics/b/blog/posts/get-started-with-compute-shaders
@@ -253,10 +263,18 @@ void *launch_glsl_kernel(std::string source, void *data, size_t data_size)
   // `layout(local_size_x = X) in;` - the X      == `Threads`  in CUDA
   //
   glDispatchCompute(1, 1, 1);
-  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); // TODO(archibate): move to Program::synchronize()
 
-  void *data_r = ssbo.map(0, data_size); // output
-  return data_r;
+  std::vector<void *> maps(ssbo.size());
+  for (int i = 0; i < ssbo.size(); i++) {
+    maps[i] = ssbo[i].map(0, iov[i].size); // output
+  }
+  return maps;
+}
+
+void unmap_all_ssbo()  // releases the mapping of the buffer currently bound to GL_SHADER_STORAGE_BUFFER
+{
+  glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);  // NOTE(review): unmaps a single buffer (the one bound last), despite the "all" in the name - confirm
+}
 
 bool is_opengl_api_available()
diff --git a/taichi/platform/opengl/opengl_api.h b/taichi/platform/opengl/opengl_api.h
index f8589d59689834..a896c0e9accf28 100644
--- a/taichi/platform/opengl/opengl_api.h
+++ b/taichi/platform/opengl/opengl_api.h
@@ -10,8 +10,15 @@ TLANG_NAMESPACE_BEGIN
 
 namespace opengl {
 
+struct IOV  // one host memory span handed to launch_glsl_kernel()
+{
+  void *base;   // start of the host buffer
+  size_t size;  // length of the buffer in bytes
+};
+
 bool is_opengl_api_available();
-void *launch_glsl_kernel(std::string source, void *data, size_t data_size);
+std::vector<void *> launch_glsl_kernel(std::string source, std::vector<IOV> iov);
+void unmap_all_ssbo();
 
 }  // namespace opengl