galeselee · galeselee · Dec 31, 2022 · Dec 30, 2022 · Dec 30, 2022 · Dec 30, 2022
diff --git a/.github/workflows/scripts/aot-demo.sh b/.github/workflows/scripts/aot-demo.sh
@@ -4,7 +4,7 @@ set -ex
 export TI_SKIP_VERSION_CHECK=ON
 export TI_CI=1
 
-export TAICHI_AOT_DEMO_URL=https://github.com/taichi-dev/taichi-aot-demo
+export TAICHI_AOT_DEMO_URL=https://github.com/bobcao3/taichi-aot-demo
 export TAICHI_AOT_DEMO_BRANCH=master
 
 export TAICHI_UNITY2_URL=https://github.com/taichi-dev/taichi-unity2

diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -291,7 +291,7 @@ jobs:
           . .github/workflows/scripts/common-utils.sh
 
           ci-docker-run-amdgpu --name taichi-build \
-            registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
+            registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \
             /home/dev/taichi/.github/workflows/scripts/build.py
 
         env:
@@ -302,6 +302,7 @@ jobs:
             -DTI_WITH_VULKAN:BOOL=OFF
             -DTI_WITH_OPENGL:BOOL=OFF
             -DTI_BUILD_TESTS:BOOL=ON
+            -DTI_WITH_AMDGPU:BOOL=ON
 
       - name: Test
         id: test
@@ -310,7 +311,7 @@ jobs:
           . .github/workflows/scripts/common-utils.sh
 
           ci-docker-run-amdgpu --name taichi-test \
-             registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
+             registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \
              /home/dev/taichi/.github/workflows/scripts/unix_test.sh
         env:
           PY: '3.8'

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -181,6 +181,10 @@ if (TI_WITH_CUDA)
     set(CUDA_ARCH "cuda")
 endif()
 
+if (TI_WITH_AMDGPU)
+    set(AMDGPU_ARCH "amdgpu")
+endif()
+
 if (TI_WITH_DX12)
     set(DX12_ARCH "dx12")
 endif()

diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
@@ -3,6 +3,7 @@ option(TI_WITH_LLVM "Build with LLVM backends" ON)
 option(TI_WITH_METAL "Build with the Metal backend" ON)
 option(TI_WITH_CUDA "Build with the CUDA backend" ON)
 option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF)
+option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF)
 option(TI_WITH_OPENGL "Build with the OpenGL backend" ON)
 option(TI_WITH_CC "Build with the C backend" ON)
 option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF)
@@ -34,6 +35,10 @@ if(ANDROID)
     set(TI_WITH_DX12 OFF)
 endif()
 
+if (TI_WITH_AMDGPU AND TI_WITH_CUDA)
+    message(WARNING "Compiling CUDA and AMDGPU backends simultaneously")
+endif()
+
 if(UNIX AND NOT APPLE)
     # Handy helper for Linux
     # https://stackoverflow.com/a/32259072/12003165
@@ -53,13 +58,21 @@ if (APPLE)
         set(TI_WITH_CC OFF)
         message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.")
     endif()
+    if (TI_WITH_AMDGPU)
+        set(TI_WITH_AMDGPU OFF)
+        message(WARNING "AMDGPU backend not supported on OS X. Setting TI_WITH_AMDGPU to OFF.")
+    endif()
 endif()
 
 if (WIN32)
     if (TI_WITH_CC)
         set(TI_WITH_CC OFF)
         message(WARNING "C backend not supported on Windows. Setting TI_WITH_CC to OFF.")
     endif()
+    if (TI_WITH_AMDGPU)
+        set(TI_WITH_AMDGPU OFF)
+        message(WARNING "AMDGPU backend not supported on Windows. Setting TI_WITH_AMDGPU to OFF.")
+    endif()
 endif()
 
 if(TI_WITH_VULKAN)
@@ -108,6 +121,12 @@ if (TI_WITH_CUDA)
   list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CUDA_RUNTIME_SOURCE})
 endif()
 
+if (TI_WITH_AMDGPU)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
+  # taichi/runtime/amdgpu/runtime.cpp is not globbed yet, so this may append nothing.
+  list(APPEND TAICHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
+endif()
+
 if (TI_WITH_DX12)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_DX12")
 endif()
@@ -215,6 +234,12 @@ if(TI_WITH_LLVM)
         target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE cuda_rhi)
     endif()
 
+    if (TI_WITH_AMDGPU)
+        llvm_map_components_to_libnames(llvm_amdgpu_libs AMDGPU)
+        add_subdirectory(taichi/rhi/amdgpu)
+        target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_rhi)
+    endif()
+
     if (TI_WITH_DX12)
         llvm_map_components_to_libnames(llvm_directx_libs DirectX)
 

diff --git a/cpp_examples/rhi_examples/sample_2_triangle.cpp b/cpp_examples/rhi_examples/sample_2_triangle.cpp
@@ -73,6 +73,12 @@ class SampleApp : public App {
       device->unmap(*vertex_buffer);
     }
 
+    // Define the raster state
+    {
+      raster_resources = device->create_raster_resources_unique();
+      raster_resources->vertex_buffer(vertex_buffer->get_ptr(0), 0);
+    }
+
     TI_INFO("App Init Done");
   }
 
@@ -94,10 +100,7 @@ class SampleApp : public App {
 
     // Bind our triangle pipeline
     cmdlist->bind_pipeline(pipeline.get());
-    // Get the binder and bind our vertex buffer
-    auto resource_binder = pipeline->resource_binder();
-    resource_binder->vertex_buffer(vertex_buffer->get_ptr(0), 0);
-    cmdlist->bind_resources(resource_binder);
+    cmdlist->bind_raster_resources(raster_resources.get());
     // Render the triangle
     cmdlist->draw(3, 0);
     // End rendering
@@ -110,9 +113,10 @@ class SampleApp : public App {
   }
 
  public:
-  std::unique_ptr<Pipeline> pipeline;
+  std::unique_ptr<Pipeline> pipeline{nullptr};
+  std::unique_ptr<RasterResources> raster_resources{nullptr};
 
-  std::unique_ptr<DeviceAllocationGuard> vertex_buffer;
+  std::unique_ptr<DeviceAllocationGuard> vertex_buffer{nullptr};
 };
 
 int main() {

diff --git a/taichi/analysis/gather_statement_usages.cpp b/taichi/analysis/gather_statement_usages.cpp
@@ -0,0 +1,68 @@
+#include <utility>
+
+#include "taichi/ir/ir.h"
+#include "taichi/ir/statements.h"
+#include "taichi/ir/transforms.h"
+#include "taichi/ir/visitors.h"
+
+namespace taichi::lang {
+
+// Analysis visitor that records, for every statement used as an operand, the
+// list of <user statement, operand index> pairs that reference it.
+class GatherStatementUsages : public BasicStmtVisitor {
+ private:
+  using BasicStmtVisitor::visit;
+
+  // maps a stmt to all its usages <stmt, operand>
+  std::unordered_map<Stmt *, std::vector<std::pair<Stmt *, int>>> stmt_usages_;
+
+ public:
+  explicit GatherStatementUsages() {
+    invoke_default_visitor = true;
+  }
+
+  // Registers `stmt` as a user of each of its non-null operands.
+  void default_visit(Stmt *stmt) {
+    auto ops = stmt->get_operands();
+    // The usage map stores operand indices as int, so convert the size once
+    // instead of comparing a signed index against an unsigned size.
+    const int num_ops = static_cast<int>(ops.size());
+    for (int i = 0; i < num_ops; i++) {
+      auto &op = ops[i];
+      if (op != nullptr) {
+        stmt_usages_[op].push_back({stmt, i});
+      }
+    }
+  }
+
+  void visit(Stmt *stmt) override {
+    default_visit(stmt);
+  }
+
+  void preprocess_container_stmt(Stmt *stmt) override {
+    default_visit(stmt);
+  }
+
+  // Runs the visitor over `node` and returns the gathered usage map.
+  static std::unordered_map<Stmt *, std::vector<std::pair<Stmt *, int>>> run(
+      IRNode *node) {
+    GatherStatementUsages pass;
+    node->accept(&pass);
+    // `pass` is about to be destroyed; move the map out rather than copying
+    // it (returning a member of a local is not implicitly moved).
+    return std::move(pass.stmt_usages_);
+  }
+};
+
+namespace irpass::analysis {
+
+// Public entry point: maps each statement under `root` that is used as an
+// operand to its <user statement, operand index> pairs.
+std::unordered_map<Stmt *, std::vector<std::pair<Stmt *, int>>>
+gather_statement_usages(IRNode *root) {
+  return GatherStatementUsages::run(root);
+}
+
+}  // namespace irpass::analysis
+
+}  // namespace taichi::lang
diff --git a/taichi/cache/metal/cache_manager.cpp b/taichi/cache/metal/cache_manager.cpp
@@ -59,13 +59,15 @@ CacheManager::CacheManager(Params &&init_params)
   if (config_.mode == MemAndDiskCache) {
     const auto filepath = join_path(config_.cache_path, kMetadataFilename);
     const auto lock_path = join_path(config_.cache_path, kMetadataLockName);
-    if (lock_with_file(lock_path)) {
-      auto _ = make_unlocker(lock_path);
-      offline_cache::load_metadata_with_checking(cached_data_, filepath);
-    } else {
-      TI_WARN(
-          "Lock {} failed. You can run 'ti cache clean -p {}' and try again.",
-          lock_path, config_.cache_path);
+    if (path_exists(filepath)) {
+      if (lock_with_file(lock_path)) {
+        auto _ = make_unlocker(lock_path);
+        offline_cache::load_metadata_with_checking(cached_data_, filepath);
+      } else {
+        TI_WARN(
+            "Lock {} failed. You can run 'ti cache clean -p {}' and try again.",
+            lock_path, config_.cache_path);
+      }
     }
   }
 }

diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp
@@ -91,7 +91,7 @@ class TaskCodegen : public IRVisitor {
 
   void fill_snode_to_root() {
     for (int root = 0; root < compiled_structs_.size(); ++root) {
-      for (auto [node_id, node] : compiled_structs_[root].snode_descriptors) {
+      for (auto &[node_id, node] : compiled_structs_[root].snode_descriptors) {
         snode_to_root_[node_id] = root;
       }
     }
@@ -108,9 +108,6 @@ class TaskCodegen : public IRVisitor {
     kernel_function_ = ir_->new_function();  // void main();
     ir_->debug_name(spv::OpName, kernel_function_, "main");
 
-    compile_args_struct();
-    compile_ret_struct();
-
     if (task_ir_->task_type == OffloadedTaskType::serial) {
       generate_serial_kernel(task_ir_);
     } else if (task_ir_->task_type == OffloadedTaskType::range_for) {
@@ -1749,22 +1746,21 @@ class TaskCodegen : public IRVisitor {
     std::vector<spirv::Value> buffers;
     if (caps_->get(DeviceCapability::spirv_version) > 0x10300) {
       buffers = shared_array_binds_;
-      std::unordered_set<BufferInfo, BufferInfoHasher> unique_bufs;
       // One buffer can be bound to different bind points but has to be unique
       // in OpEntryPoint interface declarations.
       // From Spec: before SPIR-V version 1.4, duplication of these interface id
       // is tolerated. Starting with version 1.4, an interface id must not
       // appear more than once.
+      std::unordered_set<spirv::Value, spirv::ValueHasher> entry_point_values;
       for (const auto &bb : task_attribs_.buffer_binds) {
-        if (unique_bufs.count(bb.buffer) == 0) {
-          for (auto &it : buffer_value_map_) {
-            if (it.first.first == bb.buffer) {
-              buffers.push_back(it.second);
-            }
+        for (auto &it : buffer_value_map_) {
+          if (it.first.first == bb.buffer) {
+            entry_point_values.insert(it.second);
           }
-          unique_bufs.insert(bb.buffer);
         }
       }
+      buffers.insert(buffers.end(), entry_point_values.begin(),
+                     entry_point_values.end());
     }
     ir_->commit_kernel_function(kernel_function_, "main", buffers,
                                 group_size);  // kernel entry
@@ -2248,12 +2244,16 @@ class TaskCodegen : public IRVisitor {
     }
 
     if (buffer.type == BufferType::Args) {
+      compile_args_struct();
+
       buffer_binding_map_[key] = 0;
       buffer_value_map_[key] = args_buffer_value_;
       return args_buffer_value_;
     }
 
     if (buffer.type == BufferType::Rets) {
+      compile_ret_struct();
+
       buffer_binding_map_[key] = 1;
       buffer_value_map_[key] = ret_buffer_value_;
       return ret_buffer_value_;
@@ -2537,7 +2537,7 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs,
 
     size_t last_size;
     bool success = true;
-    do {
+    {
       last_size = optimized_spv.size();
       bool result = false;
       TI_ERROR_IF(
@@ -2546,9 +2546,8 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs,
           "SPIRV optimization failed");
       if (result) {
         success = false;
-        break;
       }
-    } while (last_size != optimized_spv.size());
+    }
 
     TI_TRACE("SPIRV-Tools-opt: binary size, before={}, after={}",
              task_res.spirv_code.size(), optimized_spv.size());

diff --git a/taichi/codegen/spirv/spirv_ir_builder.cpp b/taichi/codegen/spirv/spirv_ir_builder.cpp
@@ -835,11 +835,7 @@ Value IRBuilder::fetch_texel(Value texture_var,
   // OpImageFetch requires operand with OpImageType
   // We have to extract the underlying OpImage from OpSampledImage here
   SType image_type = get_underlying_image_type(f32_type(), args.size());
-  Value image_val = new_value(image_type, ValueKind::kNormal);
-
-  ib_.begin(spv::OpImage)
-      .add_seq(image_type, image_val, sampled_image)
-      .commit(&function_);
+  Value image_val = make_value(spv::OpImage, image_type, sampled_image);
 
   Value uv;
   if (args.size() == 1) {

diff --git a/taichi/codegen/spirv/spirv_ir_builder.h b/taichi/codegen/spirv/spirv_ir_builder.h
@@ -86,6 +86,16 @@ struct Value {
   SType stype;
   // Additional flags about the value
   ValueKind flag{ValueKind::kNormal};
+
+  bool operator==(const Value &rhs) const {
+    return id == rhs.id;
+  }
+};
+
+struct ValueHasher {
+  size_t operator()(const spirv::Value &v) const {
+    return std::hash<uint32_t>()(v.id);
+  }
 };
 
 // Represent the SPIRV Label

diff --git a/taichi/ir/analysis.h b/taichi/ir/analysis.h
@@ -95,6 +95,8 @@ bool definitely_same_address(Stmt *var1, Stmt *var2);
 
 std::unordered_set<Stmt *> detect_fors_with_break(IRNode *root);
 std::unordered_set<Stmt *> detect_loops_with_continue(IRNode *root);
+std::unordered_map<Stmt *, std::vector<std::pair<Stmt *, int>>>
+gather_statement_usages(IRNode *root);
 std::unordered_set<Stmt *> gather_immutable_local_vars(IRNode *root);
 std::unordered_set<SNode *> gather_deactivations(IRNode *root);
 std::pair<std::unordered_set<SNode *>, std::unordered_set<SNode *>>

diff --git a/taichi/ir/ir.cpp b/taichi/ir/ir.cpp
@@ -4,7 +4,7 @@
 #include <thread>
 #include <unordered_map>
 
-// #include "taichi/ir/analysis.h"
+#include "taichi/ir/analysis.h"
 #include "taichi/ir/statements.h"
 #include "taichi/ir/transforms.h"
 
@@ -496,4 +496,16 @@ void DelayedIRModifier::mark_as_modified() {
   modified_ = true;
 }
 
+// Snapshot all statement usages under `root` with a single tree visit.
+ImmediateIRModifier::ImmediateIRModifier(IRNode *root)
+    : stmt_usages_(irpass::analysis::gather_statement_usages(root)) {}
+
+void ImmediateIRModifier::replace_usages_with(Stmt *old_stmt, Stmt *new_stmt) {
+  // Rewire each recorded (user, operand index) of old_stmt; no-op if unseen.
+  if (auto entry = stmt_usages_.find(old_stmt); entry != stmt_usages_.end()) {
+    for (auto &[user, idx] : entry->second)
+      user->set_operand(idx, new_stmt);
+  }
+}
+
 }  // namespace taichi::lang
diff --git a/taichi/ir/ir.h b/taichi/ir/ir.h
@@ -609,6 +609,19 @@ class DelayedIRModifier {
   void mark_as_modified();
 };
 
+// ImmediateIRModifier aims at replacing Stmt::replace_usages_with, which visits
+// the whole tree for a single replacement. Meant to be created once per pass,
+// it gathers every statement's usages in one tree visit at construction; each
+// replacement then only rewrites the recorded usages (which are not refreshed).
+class ImmediateIRModifier {
+ private:
+  std::unordered_map<Stmt *, std::vector<std::pair<Stmt *, int>>> stmt_usages_;
+
+ public:
+  explicit ImmediateIRModifier(IRNode *root);
+  void replace_usages_with(Stmt *old_stmt, Stmt *new_stmt);
+};
+
 template <typename T>
 inline void StmtFieldManager::operator()(const char *key, T &&value) {
   using decay_T = typename std::decay<T>::type;