From f02eb6edf5d1610700a62a5167fd3e47a555961f Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 11:42:40 +0800
Subject: [PATCH 01/26] add platform and update TaichiCore.cmake

---
 cmake/TaichiCore.cmake                   |  5 +++--
 taichi/platform/amdgpu/detect_amdgpu.cpp | 17 +++++++++++++++++
 taichi/platform/amdgpu/detect_amdgpu.h   |  5 +++++
 3 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 taichi/platform/amdgpu/detect_amdgpu.cpp
 create mode 100644 taichi/platform/amdgpu/detect_amdgpu.h

diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index 6b94c0dbb9e51..bff51aeb1fd1a 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -104,7 +104,8 @@ file(GLOB TAICHI_CORE_SOURCE
     "taichi/system/*"
     "taichi/transforms/*"
     "taichi/aot/*.cpp" "taichi/aot/*.h"
-    "taichi/platform/cuda/*" "taichi/platform/mac/*" "taichi/platform/windows/*"
+    "taichi/platform/cuda/*" "taichi/platform/amdgpu/*"
+    "taichi/platform/mac/*" "taichi/platform/windows/*"
     "taichi/codegen/*.cpp" "taichi/codegen/*.h"
     "taichi/runtime/*.h" "taichi/runtime/*.cpp"
     "taichi/rhi/*.h" "taichi/rhi/*.cpp"
@@ -127,7 +128,7 @@ endif()
 
 if (TI_WITH_AMDGPU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
-# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
+  file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
   list(APPEND TAIHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
 endif()
 
diff --git a/taichi/platform/amdgpu/detect_amdgpu.cpp b/taichi/platform/amdgpu/detect_amdgpu.cpp
new file mode 100644
index 0000000000000..f8ce7e5b6927b
--- /dev/null
+++ b/taichi/platform/amdgpu/detect_amdgpu.cpp
@@ -0,0 +1,17 @@
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_driver.h"
+#endif
+
+namespace taichi {
+
+bool is_rocm_api_available() {
+#if defined(TI_WITH_AMDGPU)
+  return lang::AMDGPUDriver::get_instance_without_context().detected();
+#else
+  return false;
+#endif
+}
+
+}  // namespace taichi
\ No newline at end of file
diff --git a/taichi/platform/amdgpu/detect_amdgpu.h b/taichi/platform/amdgpu/detect_amdgpu.h
new file mode 100644
index 0000000000000..c7638e938ce0e
--- /dev/null
+++ b/taichi/platform/amdgpu/detect_amdgpu.h
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace taichi {
+bool is_rocm_api_available();
+}  // namespace taichi

From 1a618879b16ab84aa77b52b183ede5aaa05a7e03 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 11:52:11 +0800
Subject: [PATCH 02/26] config codegen

---
 taichi/codegen/codegen.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp
index bac987fcabb06..3b21478b6f7ca 100644
--- a/taichi/codegen/codegen.cpp
+++ b/taichi/codegen/codegen.cpp
@@ -10,10 +10,13 @@
 #endif
 #if defined(TI_WITH_CUDA)
 #include "taichi/codegen/cuda/codegen_cuda.h"
-#endif
+#endi
 #if defined(TI_WITH_DX12)
 #include "taichi/codegen/dx12/codegen_dx12.h"
 #endif
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/codegen/amdgpu/codegen_amdgpu.h"
+#endif
 #include "taichi/system/timer.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
@@ -47,6 +50,12 @@ std::unique_ptr<KernelCodeGen> KernelCodeGen::create(
     return std::make_unique<KernelCodeGenDX12>(compile_config, kernel);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    return std::make_unique<KernelCodeGenAMDGPU>(compile_config, kernel);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     TI_NOT_IMPLEMENTED

From c78806450ffd6eb303f7e3bda5e75fccb19b0a37 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 15:28:40 +0800
Subject: [PATCH 03/26] fronted ir and jit_session

---
 taichi/codegen/amdgpu/codegen_amdgpu.cpp | 30 ++++++++++++++++++++----
 taichi/inc/archs.inc.h                   |  2 +-
 taichi/ir/frontend_ir.cpp                |  6 ++---
 taichi/ir/frontend_ir.h                  |  2 +-
 taichi/jit/jit_session.cpp               | 11 +++++++++
 5 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 14211c9a84c9d..5d402a32ed444 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -265,12 +265,32 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
   }
 
   void visit(GlobalLoadStmt *stmt) override {
-    if (auto get_ch = stmt->src->cast<GetChStmt>()) {
-      bool should_cache_as_read_only = current_offload->mem_access_opt.has_flag(
-          get_ch->output_snode, SNodeAccessFlag::read_only);
-      create_global_load(stmt, should_cache_as_read_only);
+    auto ptr = llvm_val[stmt->src];
+    auto ptr_type = stmt->src->ret_type->as<PointerType>();
+    if (ptr_type->is_bit_pointer()) {
+      auto val_type = ptr_type->get_pointee_type();
+      auto get_ch = stmt->src->as<GetChStmt>();
+      auto physical_type =
+          tlctx->get_data_type(get_ch->input_snode->physical_type);
+      auto [byte_ptr, bit_offset] = load_bit_ptr(ptr);
+      auto physical_value = builder->CreateLoad(physical_type, byte_ptr);
+      if (auto qit = val_type->cast<QuantIntType>()) {
+        llvm_val[stmt] = extract_quant_int(physical_value, bit_offset, qit);
+      } else if (auto qfxt = val_type->cast<QuantFixedType>()) {
+        qit = qfxt->get_digits_type()->as<QuantIntType>();
+        auto digits = extract_quant_int(physical_value, bit_offset, qit);
+        llvm_val[stmt] = reconstruct_quant_fixed(digits, qfxt);
+      } else {
+        TI_ASSERT(val_type->is<QuantFloatType>());
+        TI_ASSERT(get_ch->input_snode->dt->is<BitStructType>());
+        llvm_val[stmt] = extract_quant_float(
+            physical_value, get_ch->input_snode->dt->as<BitStructType>(),
+            get_ch->output_snode->id_in_bit_struct);
+      }
     } else {
-      create_global_load(stmt, false);
+      // Byte pointer case.
+      llvm_val[stmt] =
+          builder->CreateLoad(tlctx->get_data_type(stmt->ret_type), ptr);
     }
   }
 
diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h
index 979646b0a646b..51ff8772830df 100644
--- a/taichi/inc/archs.inc.h
+++ b/taichi/inc/archs.inc.h
@@ -14,6 +14,6 @@ PER_ARCH(opengl)  // OpenGL Compute Shaders
 PER_ARCH(dx11)    // Microsoft DirectX 11, WIP
 PER_ARCH(dx12)    // Microsoft DirectX 12, WIP
 PER_ARCH(opencl)  // OpenCL, N/A
-PER_ARCH(amdgpu)  // AMD GPU, WIP
+PER_ARCH(amdgpu)  // AMD GPU
 PER_ARCH(vulkan)  // Vulkan
 PER_ARCH(gles)    // OpenGL ES
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
index a576418c24a7c..ee1c9573b7186 100644
--- a/taichi/ir/frontend_ir.cpp
+++ b/taichi/ir/frontend_ir.cpp
@@ -84,7 +84,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) {
   strictly_serialized = config.strictly_serialized;
   mem_access_opt = config.mem_access_opt;
   block_dim = config.block_dim;
-  if (arch == Arch::cuda) {
+  if (arch == Arch::cuda || arch == Arch::amdgpu) {
     num_cpu_threads = 1;
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {  // cpu
@@ -1284,8 +1284,8 @@ void ASTBuilder::insert_for(const Expr &s,
 
 Expr ASTBuilder::insert_thread_idx_expr() {
   auto loop = stack_.size() ? stack_.back()->parent_stmt : nullptr;
-  TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_),
-              "ti.thread_idx() is only available in cuda or cpu context.");
+  TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu,
+              "ti.thread_idx() is only available in cuda or cpu or amdgpu context.");
   if (loop != nullptr) {
     auto i = stack_.size() - 1;
     while (!(loop->is<FrontendForStmt>())) {
diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
index 76cc244f057ea..92e5c427d37be 100644
--- a/taichi/ir/frontend_ir.h
+++ b/taichi/ir/frontend_ir.h
@@ -1029,7 +1029,7 @@ class ASTBuilder {
   }
 
   void block_dim(int v) {
-    if (arch_ == Arch::cuda || arch_ == Arch::vulkan) {
+    if (arch_ == Arch::cuda || arch_ == Arch::vulkan || arch_ == Arch::amdgpu) {
       TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
     } else {
       TI_ASSERT(bit::is_power_of_two(v));
diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp
index af7e47e595265..903f37a85e707 100644
--- a/taichi/jit/jit_session.cpp
+++ b/taichi/jit/jit_session.cpp
@@ -16,6 +16,11 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     TaichiLLVMContext *tlctx,
     CompileConfig *config,
     Arch arch);
+
+std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
+    TaichiLLVMContext *tlctx,
+    CompileConfig *config,
+    Arch arch);
 #endif
 
 JITSession::JITSession(TaichiLLVMContext *tlctx, CompileConfig *config)
@@ -41,6 +46,12 @@ std::unique_ptr<JITSession> JITSession::create(TaichiLLVMContext *tlctx,
 #else
     TI_NOT_IMPLEMENTED
 #endif
+  } else if (arch == Arch::amdgpu) {
+#ifdef TI_WITH_AMDGPU
+    return create_llvm_jit_session_amdgpu(tlctx, config, arch);
+#else
+    TI_NOT_IMPLEMENTED
+#endif 
   }
 #else
   TI_ERROR("Llvm disabled");

From a218addfca46bdb238325482994660f7b2fbe0ad Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 15:35:28 +0800
Subject: [PATCH 04/26] compile config

---
 taichi/program/compile_config.cpp | 2 +-
 taichi/program/compile_config.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index 76071a5e2e8dd..5f77d617e32bc 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -58,7 +58,7 @@ CompileConfig::CompileConfig() {
   print_kernel_nvptx = false;
   print_kernel_llvm_ir_optimized = false;
 
-  // CUDA backend options:
+  // CUDA/AMDGPU backend options:
   device_memory_GB = 1;  // by default, preallocate 1 GB GPU memory
   device_memory_fraction = 0.0;
 
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 6b3525a700636..0ac55642735b0 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -65,7 +65,7 @@ struct CompileConfig {
   bool print_kernel_llvm_ir_optimized;
   bool print_kernel_nvptx;
 
-  // CUDA backend options:
+  // CUDA/AMDGPU backend options:
   float64 device_memory_GB;
   float64 device_memory_fraction;
 

From c44c679978f404b41b29fd3a62f9e427c4ef6c5f Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 15:37:51 +0800
Subject: [PATCH 05/26] kernel

---
 taichi/program/kernel.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index 49d6d58059b3b..a5dc00be197fa 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -10,6 +10,10 @@
 #include "taichi/program/program.h"
 #include "taichi/util/action_recorder.h"
 
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_driver.h"
+#endif
+
 #ifdef TI_WITH_LLVM
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #endif
@@ -68,7 +72,7 @@ void Kernel::operator()(const CompileConfig &compile_config,
   compiled_(ctx_builder.get_context());
 
   const auto arch = compile_config.arch;
-  if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda)) {
+  if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) {
     program->check_runtime_error();
   }
 }

From 77a691fd18891540b574505693be0be6df650f15 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 15:40:05 +0800
Subject: [PATCH 06/26] program

---
 taichi/program/program.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 52a629c69371a..911569df00654 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -17,6 +17,11 @@
 #include "taichi/ir/frontend_ir.h"
 #include "taichi/program/snode_expr_utils.h"
 #include "taichi/math/arithmetic.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
+#endif
+
 #ifdef TI_WITH_LLVM
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/llvm/struct_llvm.h"
@@ -437,7 +442,7 @@ Ndarray *Program::create_ndarray(const DataType type,
   auto arr = std::make_unique<Ndarray>(this, type, shape, layout);
   if (zero_fill) {
     Arch arch = this_thread_config().arch;
-    if (arch_is_cpu(arch) || arch == Arch::cuda) {
+    if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) {
       fill_ndarray_fast_u32(arr.get(), /*data=*/0);
     } else if (arch != Arch::dx12) {
       // Device api support for dx12 backend are not complete yet
@@ -496,7 +501,8 @@ Texture *Program::create_texture(const DataType type,
 intptr_t Program::get_ndarray_data_ptr_as_int(const Ndarray *ndarray) {
   uint64_t *data_ptr{nullptr};
   if (arch_is_cpu(this_thread_config().arch) ||
-      this_thread_config().arch == Arch::cuda) {
+      this_thread_config().arch == Arch::cuda ||
+      this_thread_config().arch == Arch::amdgpu) {
     // For the LLVM backends, device allocation is a physical pointer.
     data_ptr =
         program_impl_->get_ndarray_alloc_info_ptr(ndarray->ndarray_alloc_);

From 8d6c0c6057af2c557ec74fea568a33a87cb6e98e Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 15:47:57 +0800
Subject: [PATCH 07/26] cmake and arch

---
 taichi/rhi/arch.cpp               | 2 +-
 taichi/rhi/interop/CMakeLists.txt | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/taichi/rhi/arch.cpp b/taichi/rhi/arch.cpp
index a441b9345f08f..7e5beead84bb9 100644
--- a/taichi/rhi/arch.cpp
+++ b/taichi/rhi/arch.cpp
@@ -49,7 +49,7 @@ bool arch_is_cuda(Arch arch) {
 
 bool arch_uses_llvm(Arch arch) {
   return (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::cuda ||
-          arch == Arch::dx12 || arch == Arch::wasm);
+          arch == Arch::dx12 || arch == Arch::wasm || arch == Arch::amdgpu);
 }
 
 bool arch_is_gpu(Arch arch) {
diff --git a/taichi/rhi/interop/CMakeLists.txt b/taichi/rhi/interop/CMakeLists.txt
index 9f73e9761f80b..b88bca8b0fe1e 100644
--- a/taichi/rhi/interop/CMakeLists.txt
+++ b/taichi/rhi/interop/CMakeLists.txt
@@ -12,6 +12,10 @@ if (TI_WITH_CUDA)
   target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_CUDA)
 endif()
 
+if (TI_WITH_AMDGPU)
+  target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_AMDGPU)
+endif()
+
 if (TI_WITH_VULKAN)
   target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_VULKAN)
 endif()

From c1aa69bb6bfb0762d5b2c7d8105ff75c47916c0b Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 16:25:03 +0800
Subject: [PATCH 08/26] executor

---
 taichi/runtime/llvm/llvm_context.cpp          |  13 ++
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 130 +++++++++++++++++-
 taichi/runtime/llvm/llvm_runtime_executor.h   |   5 +
 3 files changed, 141 insertions(+), 7 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 6494bd72994ce..85c2a17ff4cc8 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -46,6 +46,7 @@
 #include "llvm_context.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/codegen_utils.h"
+
 #ifdef TI_WITH_AMDGPU
 #include "taichi/runtime/llvm/llvm_context_pass.h"
 #endif
@@ -96,6 +97,16 @@ TaichiLLVMContext::TaichiLLVMContext(CompileConfig *config, Arch arch)
     LLVMInitializeDirectXTargetMC();
     LLVMInitializeDirectXTargetInfo();
     LLVMInitializeDirectXAsmPrinter();
+#endif
+  } else if (arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUAsmPrinter();
+    LLVMInitializeAMDGPUAsmParser();
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
 #if defined(TI_WITH_CUDA)
@@ -987,6 +998,8 @@ void TaichiLLVMContext::add_struct_for_func(llvm::Module *module,
     auto *new_alloca = builder.CreateAlloca(new_type);
     new_alloca->setAlignment(Align(8));
     TI_ASSERT(alloca->hasOneUse());
+    // TODO
+    // For AMDGPU user_back is addrspace cast
     auto *gep = llvm::cast<llvm::GetElementPtrInst>(alloca->user_back());
     TI_ASSERT(gep->getPointerOperand() == alloca);
     std::vector<Value *> indices(gep->idx_begin(), gep->idx_end());
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index f4323e25d61e0..b68e3e5772659 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -12,6 +12,7 @@
 #endif
 
 #if defined(TI_WITH_AMDGPU)
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
 #include "taichi/rhi/amdgpu/amdgpu_driver.h"
 #include "taichi/rhi/amdgpu/amdgpu_device.h"
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
@@ -52,6 +53,21 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
     if (config.arch != Arch::cuda) {
       TI_WARN("Falling back to {}.", arch_name(host_arch()));
     }
+  } else if (config.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    if (!is_rocm_api_available()) {
+      TI_WARN("No AMDGPU ROCm API detected.");
+      config.arch = host_arch();
+    } else if (!AMDGPUContext::get_instance().detected()) {
+      TI_WARN("No AMDGPU device detected.");
+      config.arch = host_arch();
+    } else {
+      // AMDGPU runtime created successfully
+    }
+#else
+    TI_WARN("Taichi is not compiled with AMDGPU.");
+    config.arch = host_arch();
+#endif
   }
 
   snode_tree_buffer_manager_ = std::make_unique<SNodeTreeBufferManager>(this);
@@ -95,6 +111,25 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
 #if defined(TI_WITH_DX12)
     // FIXME: set value based on DX12.
     config.max_block_dim = 1024;
+#endif
+  } else if (config.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    int num_workgroups{1};
+    AMDGPUDriver::get_instance().device_get_attribute(
+      &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+    int query_max_block_dim{1024};
+    AMDGPUDriver::get_instance().device_get_attribute(
+      &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
+    // magic number 32
+    // I didn't find the relevant parameter to limit the max block num per CU
+    // So ....
+    int query_max_block_per_cu{32};
+    if (config.max_block_dim == 0) {
+      config.max_block_dim = query_max_block_dim;
+    }
+    if (config.saturating_grid_dim == 0) {
+      config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2;
+    }
 #endif
   }
 
@@ -204,6 +239,12 @@ void LlvmRuntimeExecutor::synchronize() {
     CUDADriver::get_instance().stream_synchronize(nullptr);
 #else
     TI_ERROR("No CUDA support");
+#endif
+  } else if (config_ -> arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().stream_synchronize(nullptr);
+#else
+    TI_ERROR("No AMDGPU support");
 #endif
   }
   fflush(stdout);
@@ -220,6 +261,13 @@ uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
                                                      sizeof(uint64));
 #else
     TI_NOT_IMPLEMENTED;
+#endif
+  } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)    
+    AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
+                                                       sizeof(uint64));
+#else
+    TI_NOT_IMPLEMENTED;
 #endif
   } else {
     ret = result_buffer[i];
@@ -372,8 +420,8 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     uint64 *result_buffer) {
   TaichiLLVMContext *tlctx = nullptr;
   if (config_->arch == Arch::cuda) {
-#if defined(TI_WITH_CUDA)
-    tlctx = llvm_context_device_.get();
+#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
+   tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
 #endif
@@ -400,6 +448,12 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size);
+#else
+    TI_NOT_IMPLEMENTED;
 #endif
   } else {
     std::memset(root_buffer, 0, rounded_size);
@@ -412,6 +466,12 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     alloc = cuda_device()->import_memory(root_buffer, rounded_size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    alloc = amdgpu_device()->import_memory(root_buffer, rounded_size);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     alloc = cpu_device()->import_memory(root_buffer, rounded_size);
@@ -465,6 +525,13 @@ cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() {
   return static_cast<cuda::CudaDevice *>(device_.get());
 }
 
+amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() {
+  if (config_->arch != Arch::amdgpu) {
+    TI_ERROR("arch is not amdgpu");
+  }
+  return static_cast<amdgpu::AmdgpuDevice *>(device_.get());
+}
+
 cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() {
   TI_ERROR_IF(!arch_is_cpu(config_->arch), "arch is not cpu");
   return static_cast<cpu::CpuDevice *>(device_.get());
@@ -507,6 +574,12 @@ void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
     CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().memset((void *)ptr, data, size);
+#else
+    TI_NOT_IMPLEMENTED;
 #endif
   } else {
     std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data);
@@ -520,6 +593,12 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
     return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr;
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_->arch ==Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr;
@@ -528,11 +607,17 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
 
 void LlvmRuntimeExecutor::finalize() {
   profiler_ = nullptr;
-#if defined(TI_WITH_CUDA)
   if (preallocated_device_buffer_ != nullptr) {
-    cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
-  }
+    if (config_->arch == Arch::cuda)
+#if defined(TI_WITH_CUDA)
+       cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
+#endif
+    } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
+    }
+  }
 }
 
 void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
@@ -570,6 +655,37 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
     tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().malloc(
+        (void **)result_buffer_ptr,
+        sizeof(uint64) * taichi_result_buffer_entries);
+    const auto total_mem = runtime_mem_info_->get_total_memory();
+    if (config_->device_memory_fraction == 0) {
+      TI_ASSERT(config_->device_memory_GB > 0);
+      prealloc_size = std::size_t(config_->device_memory_GB * (1UL << 30));
+    } else {
+      prealloc_size = std::size_t(config_->device_memory_fraction * total_mem);
+    }
+    TI_ASSERT(prealloc_size <= total_mem);
+
+    TI_TRACE("Allocating device memory {:.2f} GB",
+             1.0 * prealloc_size / (1UL << 30));
+
+    Device::AllocParams preallocated_device_buffer_alloc_params;
+    preallocated_device_buffer_alloc_params.size = prealloc_size;
+    preallocated_device_buffer_alloc_ =
+        amdgpu_device()->allocate_memory(preallocated_device_buffer_alloc_params);
+    amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info =
+        amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_);
+    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;
+
+    AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
+                                      prealloc_size);
+    tlctx = llvm_context_device_.get();
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     *result_buffer_ptr = (uint64 *)memory_pool->allocate(
@@ -586,8 +702,8 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
   // Number of random states. One per CPU/CUDA thread.
   int num_rand_states = 0;
 
-  if (config_->arch == Arch::cuda) {
-#if defined(TI_WITH_CUDA)
+  if (config_->arch == Arch::cuda || config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
     // It is important to make sure that every CUDA thread has its own random
     // state so that we do not need expensive per-state locks.
     num_rand_states = config_->saturating_grid_dim * config_->max_block_dim;
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
index 7bf1178397981..e3f81ffbf8134 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.h
+++ b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -25,6 +25,10 @@ namespace cuda {
 class CudaDevice;
 }  // namespace cuda
 
+namespace amdgpu {
+class AmdgpuDevice;
+}  // namespace amdgpu
+
 namespace cpu {
 class CpuDevice;
 }  // namespace cpu
@@ -120,6 +124,7 @@ class LlvmRuntimeExecutor {
   /* -------------------------- */
   cuda::CudaDevice *cuda_device();
   cpu::CpuDevice *cpu_device();
+  amdgpu::AmdgpuDevice *amdgpu_device();
 
   void initialize_host();
 

From b8b160f31e1182150615f9f688584f2050ddbf8e Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 16:30:20 +0800
Subject: [PATCH 09/26] llvm program

---
 taichi/runtime/llvm/runtime_module/CMakeLists.txt  |  2 +-
 taichi/runtime/program_impls/llvm/CMakeLists.txt   |  4 ++++
 taichi/runtime/program_impls/llvm/llvm_program.cpp | 10 ++++++++++
 taichi/runtime/program_impls/llvm/llvm_program.h   |  4 ++++
 4 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/taichi/runtime/llvm/runtime_module/CMakeLists.txt b/taichi/runtime/llvm/runtime_module/CMakeLists.txt
index 0edac71b82c7c..92303abcc675b 100644
--- a/taichi/runtime/llvm/runtime_module/CMakeLists.txt
+++ b/taichi/runtime/llvm/runtime_module/CMakeLists.txt
@@ -14,6 +14,6 @@ function(COMPILE_LLVM_RUNTIME rtm_arch)
 endfunction()
 
 # Build llvm-runtime for host arch and cuda (if available)
-foreach(arch IN LISTS HOST_ARCH CUDA_ARCH DX12_ARCH)
+foreach(arch IN LISTS HOST_ARCH CUDA_ARCH DX12_ARCH AMDGPU_ARCH)
   compile_llvm_runtime(${arch})
 endforeach()
diff --git a/taichi/runtime/program_impls/llvm/CMakeLists.txt b/taichi/runtime/program_impls/llvm/CMakeLists.txt
index ab4a73a9d30bb..0b63c2d618ae2 100644
--- a/taichi/runtime/program_impls/llvm/CMakeLists.txt
+++ b/taichi/runtime/program_impls/llvm/CMakeLists.txt
@@ -20,3 +20,7 @@ target_link_libraries(llvm_program_impl PRIVATE cpu_runtime)
 if (TI_WITH_CUDA)
     target_link_libraries(llvm_program_impl PRIVATE cuda_runtime)
 endif()
+
+if (TI_WITH_AMDGPU)
+    target_link_libraries(llvm_program_impl PRIVATE amdgpu_runtime)
+endif()
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.cpp b/taichi/runtime/program_impls/llvm/llvm_program.cpp
index e30a6a5442253..08e65efe536e7 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.cpp
+++ b/taichi/runtime/program_impls/llvm/llvm_program.cpp
@@ -15,6 +15,10 @@
 #include "taichi/codegen/cuda/codegen_cuda.h"
 #endif
 
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/codegen/amdgpu/codegen_amdgpu.h"
+#endif
+
 #if defined(TI_WITH_DX12)
 #include "taichi/runtime/dx12/aot_module_builder_impl.h"
 #include "taichi/codegen/dx12/codegen_dx12.h"
@@ -55,6 +59,12 @@ std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
         runtime_exec_->llvm_context_device_.get()->new_module("struct");
     struct_compiler = std::make_unique<StructCompilerLLVM>(
         Arch::dx12, this, std::move(device_module), tree->id());
+  } else if (config->arch == Arch::amdgpu) {
+    TI_ASSERT(config->arch == Arch::amdgpu);
+    auto device_module = clone_struct_compiler_initial_context(
+        has_multiple_snode_trees, runtime_exec_->llvm_context_device_.get());
+    struct_compiler = std::make_unique<StructCompilerLLVM>(
+        Arch::amdgpu, this, std::move(device_module), tree->id());
   } else {
     TI_ASSERT(config->arch == Arch::cuda);
     auto device_module =
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
index 353c6bd751943..6fc60b9d4a637 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.h
+++ b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -26,6 +26,10 @@ namespace cuda {
 class CudaDevice;
 }  // namespace cuda
 
+namespace amdgpu {
+class AmdgpuDevice;
+}  // namespace amdgpu
+
 namespace cpu {
 class CpuDevice;
 }  // namespace cpu

From 1bc8ab71fbe25b11b96cd69206b2c30c0ddb5d97 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 16:37:50 +0800
Subject: [PATCH 10/26] arch

---
 c_api/include/taichi/taichi_core.h        |  2 ++
 python/taichi/lang/misc.py                | 12 +++++++++---
 taichi/system/memory_pool.h               |  2 ++
 taichi/transforms/compile_to_offloads.cpp |  2 +-
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/c_api/include/taichi/taichi_core.h b/c_api/include/taichi/taichi_core.h
index 299a4d830bf29..aaf88e68d30b6 100644
--- a/c_api/include/taichi/taichi_core.h
+++ b/c_api/include/taichi/taichi_core.h
@@ -369,6 +369,8 @@ typedef enum TiArch {
   TI_ARCH_OPENGL = 6,
   // OpenGL ES GPU backend.
   TI_ARCH_GLES = 7,
+  // AMDGPU backend
+  TI_ARCH_AMDGPU = 8,
   TI_ARCH_MAX_ENUM = 0xffffffff,
 } TiArch;
 
diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index 324ef88866886..9c65fab9e3baf 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -119,6 +119,11 @@
 """
 # ----------------------
 
+amdgpu = _ti_core.amdgpu
+"""The AMDGPU backend.
+"""
+# ----------------------
+
 metal = _ti_core.metal
 """The Apple Metal backend.
 """
@@ -159,9 +164,9 @@
 """
 # ----------------------
 
-gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles]
+gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles, amdgpu]
 """A list of GPU backends supported on the current system.
-Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles'.
+Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles', 'amdgpu'.
 
 When this is used, Taichi automatically picks the matching GPU backend. If no
 GPU is detected, Taichi falls back to the CPU backend.
@@ -731,6 +736,7 @@ def is_arch_supported(arch):
 
     arch_table = {
         cuda: _ti_core.with_cuda,
+        amdgpu: _ti_core.with_amdgpu,
         metal: _ti_core.with_metal,
         opengl: functools.partial(_ti_core.with_opengl, False),
         gles: functools.partial(_ti_core.with_opengl, True),
@@ -778,7 +784,7 @@ def get_compute_stream_device_time_elapsed_us() -> float:
 __all__ = [
     'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl',
     'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc',
-    'cpu', 'cuda', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
+    'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
     'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local',
     'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset',
     'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
diff --git a/taichi/system/memory_pool.h b/taichi/system/memory_pool.h
index 06dce540d3ad5..63fa6c0dcd8b4 100644
--- a/taichi/system/memory_pool.h
+++ b/taichi/system/memory_pool.h
@@ -27,6 +27,7 @@ class TI_DLL_EXPORT MemoryPool {
 
   MemRequestQueue *queue;
   void *cuda_stream{nullptr};
+  void *amdgpu_stream{nullptr};
 
   // In the future we wish to move the MemoryPool inside each Device
   // so that the memory allocated from each Device can be used as-is.
@@ -50,6 +51,7 @@ class TI_DLL_EXPORT MemoryPool {
 
  private:
   static constexpr bool use_cuda_stream = false;
+  static constexpr bool use_amdgpu_stream = false;
   Arch arch_;
   Device *device_;
 };
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index af253a1269fc5..e0d122ff79026 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -92,7 +92,7 @@ void compile_to_offloads(IRNode *ir,
 
   // TODO: strictly enforce bit vectorization for x86 cpu and CUDA now
   //       create a separate CompileConfig flag for the new pass
-  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) {
+  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda || config.arch == Arch::amdgpu) {
     irpass::bit_loop_vectorize(ir);
     irpass::type_check(ir, config);
     print("Bit Loop Vectorized");

From 3ba6bb36e753e6b2affdd2509aff51aabe158b1f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 17 Jan 2023 08:39:29 +0000
Subject: [PATCH 11/26] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 python/taichi/lang/misc.py                    |  8 +++---
 taichi/ir/frontend_ir.cpp                     |  5 ++--
 taichi/jit/jit_session.cpp                    |  2 +-
 taichi/platform/amdgpu/detect_amdgpu.cpp      |  2 +-
 taichi/program/kernel.cpp                     |  3 ++-
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 26 +++++++++----------
 taichi/transforms/compile_to_offloads.cpp     |  3 ++-
 7 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index 9c65fab9e3baf..cbb911949341d 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -784,8 +784,8 @@ def get_compute_stream_device_time_elapsed_us() -> float:
 __all__ = [
     'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl',
     'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc',
-    'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
-    'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local',
-    'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset',
-    'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
+    'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan',
+    'extension', 'loop_config', 'global_thread_idx', 'assume_in_range',
+    'block_local', 'cache_read_only', 'init', 'mesh_local', 'no_activate',
+    'reset', 'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
 ]
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
index ee1c9573b7186..ce3ef2dd1aecb 100644
--- a/taichi/ir/frontend_ir.cpp
+++ b/taichi/ir/frontend_ir.cpp
@@ -1284,8 +1284,9 @@ void ASTBuilder::insert_for(const Expr &s,
 
 Expr ASTBuilder::insert_thread_idx_expr() {
   auto loop = stack_.size() ? stack_.back()->parent_stmt : nullptr;
-  TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu,
-              "ti.thread_idx() is only available in cuda or cpu or amdgpu context.");
+  TI_ERROR_IF(
+      arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu,
+      "ti.thread_idx() is only available in cuda or cpu or amdgpu context.");
   if (loop != nullptr) {
     auto i = stack_.size() - 1;
     while (!(loop->is<FrontendForStmt>())) {
diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp
index 903f37a85e707..3ae369a00203d 100644
--- a/taichi/jit/jit_session.cpp
+++ b/taichi/jit/jit_session.cpp
@@ -51,7 +51,7 @@ std::unique_ptr<JITSession> JITSession::create(TaichiLLVMContext *tlctx,
     return create_llvm_jit_session_amdgpu(tlctx, config, arch);
 #else
     TI_NOT_IMPLEMENTED
-#endif 
+#endif
   }
 #else
   TI_ERROR("Llvm disabled");
diff --git a/taichi/platform/amdgpu/detect_amdgpu.cpp b/taichi/platform/amdgpu/detect_amdgpu.cpp
index f8ce7e5b6927b..8a5523f60055a 100644
--- a/taichi/platform/amdgpu/detect_amdgpu.cpp
+++ b/taichi/platform/amdgpu/detect_amdgpu.cpp
@@ -14,4 +14,4 @@ bool is_rocm_api_available() {
 #endif
 }
 
-}  // namespace taichi
\ No newline at end of file
+}  // namespace taichi
diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index a5dc00be197fa..e8249afb8e5e5 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -72,7 +72,8 @@ void Kernel::operator()(const CompileConfig &compile_config,
   compiled_(ctx_builder.get_context());
 
   const auto arch = compile_config.arch;
-  if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) {
+  if (compile_config.debug &&
+      (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) {
     program->check_runtime_error();
   }
 }
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index b68e3e5772659..82cd3ba7bd888 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -116,10 +116,10 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
 #if defined(TI_WITH_AMDGPU)
     int num_workgroups{1};
     AMDGPUDriver::get_instance().device_get_attribute(
-      &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+        &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
     int query_max_block_dim{1024};
     AMDGPUDriver::get_instance().device_get_attribute(
-      &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
+        &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
     // magic number 32
     // I didn't find the relevant parameter to limit the max block num per CU
     // So ....
@@ -240,7 +240,7 @@ void LlvmRuntimeExecutor::synchronize() {
 #else
     TI_ERROR("No CUDA support");
 #endif
-  } else if (config_ -> arch == Arch::amdgpu) {
+  } else if (config_->arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().stream_synchronize(nullptr);
 #else
@@ -263,7 +263,7 @@ uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
     TI_NOT_IMPLEMENTED;
 #endif
   } else if (config_->arch == Arch::amdgpu) {
-#if defined(TI_WITH_AMDGPU)    
+#if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                        sizeof(uint64));
 #else
@@ -421,7 +421,7 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
   TaichiLLVMContext *tlctx = nullptr;
   if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
-   tlctx = llvm_context_device_.get();
+    tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
 #endif
@@ -594,7 +594,7 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
 #else
     TI_NOT_IMPLEMENTED
 #endif
-  } else if (config_->arch ==Arch::amdgpu) {
+  } else if (config_->arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
 #else
@@ -610,15 +610,15 @@ void LlvmRuntimeExecutor::finalize() {
   if (preallocated_device_buffer_ != nullptr) {
     if (config_->arch == Arch::cuda)
 #if defined(TI_WITH_CUDA)
-       cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
+      cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
-    } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_->arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
-      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
+    amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
-    }
   }
 }
+}
 
 void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                               KernelProfilerBase *profiler,
@@ -675,14 +675,14 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 
     Device::AllocParams preallocated_device_buffer_alloc_params;
     preallocated_device_buffer_alloc_params.size = prealloc_size;
-    preallocated_device_buffer_alloc_ =
-        amdgpu_device()->allocate_memory(preallocated_device_buffer_alloc_params);
+    preallocated_device_buffer_alloc_ = amdgpu_device()->allocate_memory(
+        preallocated_device_buffer_alloc_params);
     amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info =
         amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_);
     preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;
 
     AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
-                                      prealloc_size);
+                                        prealloc_size);
     tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index e0d122ff79026..ca49f80aa306f 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -92,7 +92,8 @@ void compile_to_offloads(IRNode *ir,
 
   // TODO: strictly enforce bit vectorization for x86 cpu and CUDA now
   //       create a separate CompileConfig flag for the new pass
-  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda || config.arch == Arch::amdgpu) {
+  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda ||
+      config.arch == Arch::amdgpu) {
     irpass::bit_loop_vectorize(ir);
     irpass::type_check(ir, config);
     print("Bit Loop Vectorized");

From 74dcea82c4e474aab95dd64eec4a116ab0626407 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 18:21:10 +0800
Subject: [PATCH 12/26] add misc

---
 python/taichi/lang/misc.py                         | 1 +
 taichi/codegen/codegen.cpp                         | 2 +-
 taichi/python/export_misc.cpp                      | 6 ++++++
 taichi/runtime/llvm/llvm_runtime_executor.cpp      | 5 ++---
 taichi/runtime/program_impls/llvm/llvm_program.cpp | 4 ++--
 taichi/transforms/ast_to_ir.cpp                    | 2 +-
 6 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index cbb911949341d..9ced08097bc52 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -748,6 +748,7 @@ def is_arch_supported(arch):
         cpu: lambda: True,
     }
     with_arch = arch_table.get(arch, lambda: False)
+    print(with_arch())
     try:
         return with_arch()
     except Exception as e:
diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp
index 3b21478b6f7ca..89fb53ddd79cf 100644
--- a/taichi/codegen/codegen.cpp
+++ b/taichi/codegen/codegen.cpp
@@ -10,7 +10,7 @@
 #endif
 #if defined(TI_WITH_CUDA)
 #include "taichi/codegen/cuda/codegen_cuda.h"
-#endi
+#endif
 #if defined(TI_WITH_DX12)
 #include "taichi/codegen/dx12/codegen_dx12.h"
 #endif
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index 3a6c2d90360cd..65bc95777b781 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -24,6 +24,11 @@
 #include "taichi/rhi/cuda/cuda_driver.h"
 #endif
 
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_driver.h"
+#endif
+
 #ifdef TI_WITH_VULKAN
 #include "taichi/rhi/vulkan/vulkan_loader.h"
 #endif
@@ -144,6 +149,7 @@ void export_misc(py::module &m) {
   m.def("pop_python_print_buffer", []() { return py_cout.pop_content(); });
   m.def("toggle_python_print_buffer", [](bool opt) { py_cout.enabled = opt; });
   m.def("with_cuda", is_cuda_api_available);
+  m.def("with_amdgpu", is_rocm_api_available);
 #ifdef TI_WITH_METAL
   m.def("with_metal", taichi::lang::metal::is_metal_api_available);
 #else
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 82cd3ba7bd888..339206d94599d 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -11,10 +11,10 @@
 #include "taichi/rhi/cuda/cuda_context.h"
 #endif
 
-#if defined(TI_WITH_AMDGPU)
 #include "taichi/platform/amdgpu/detect_amdgpu.h"
 #include "taichi/rhi/amdgpu/amdgpu_driver.h"
 #include "taichi/rhi/amdgpu/amdgpu_device.h"
+#if defined(TI_WITH_AMDGPU)
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
 #endif
 
@@ -618,7 +618,6 @@ void LlvmRuntimeExecutor::finalize() {
 #endif
   }
 }
-}
 
 void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                               KernelProfilerBase *profiler,
@@ -661,7 +660,7 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
     AMDGPUDriver::get_instance().malloc(
         (void **)result_buffer_ptr,
         sizeof(uint64) * taichi_result_buffer_entries);
-    const auto total_mem = runtime_mem_info_->get_total_memory();
+    const auto total_mem = AMDGPUContext::get_instance().get_total_memory();
     if (config_->device_memory_fraction == 0) {
       TI_ASSERT(config_->device_memory_GB > 0);
       prealloc_size = std::size_t(config_->device_memory_GB * (1UL << 30));
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.cpp b/taichi/runtime/program_impls/llvm/llvm_program.cpp
index 08e65efe536e7..16e820e725827 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.cpp
+++ b/taichi/runtime/program_impls/llvm/llvm_program.cpp
@@ -61,8 +61,8 @@ std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
         Arch::dx12, this, std::move(device_module), tree->id());
   } else if (config->arch == Arch::amdgpu) {
     TI_ASSERT(config->arch == Arch::amdgpu);
-    auto device_module = clone_struct_compiler_initial_context(
-        has_multiple_snode_trees, runtime_exec_->llvm_context_device_.get());
+    auto device_module =
+        runtime_exec_->llvm_context_device_.get()->new_module("struct");
     struct_compiler = std::make_unique<StructCompilerLLVM>(
         Arch::amdgpu, this, std::move(device_module), tree->id());
   } else {
diff --git a/taichi/transforms/ast_to_ir.cpp b/taichi/transforms/ast_to_ir.cpp
index da40e8d665d82..d7cce13a19661 100644
--- a/taichi/transforms/ast_to_ir.cpp
+++ b/taichi/transforms/ast_to_ir.cpp
@@ -7,7 +7,7 @@ namespace irpass {
 
 static bool supports_lowering(Arch arch) {
   return arch_is_cpu(arch) || (arch == Arch::cuda) || (arch == Arch::dx12) ||
-         (arch == Arch::metal);
+         (arch == Arch::metal) || (arch == Arch::amdgpu);
 }
 
 void ast_to_ir(const CompileConfig &config,

From b829caf6eb84ac3d0b56f1d045a5049b1883d8d3 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 18:34:49 +0800
Subject: [PATCH 13/26] add taichi kernel anno

---
 taichi/codegen/amdgpu/codegen_amdgpu.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 5d402a32ed444..9ff56bfbd9f86 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -442,6 +442,11 @@ FunctionType AMDGPUModuleToFunctionConverter::convert(
     LLVMCompiledKernel data) const {
   auto &mod = data.module;
   auto &tasks = data.tasks;
+  for (const auto &task : tasks) {
+    llvm::Function *func = mod->getFunction(task.name);
+    TI_ASSERT(func);
+    func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+  }
   auto jit = tlctx_->jit.get();
   auto amdgpu_module =
       jit->add_module(std::move(mod), executor_->get_config()->gpu_max_reg);

From 9303e250ed6f3afe855e55916ccc8c2bb4730631 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 18:39:24 +0800
Subject: [PATCH 14/26] fix typo

---
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 339206d94599d..b0a3c4cf672bf 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -608,14 +608,16 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
 void LlvmRuntimeExecutor::finalize() {
   profiler_ = nullptr;
   if (preallocated_device_buffer_ != nullptr) {
-    if (config_->arch == Arch::cuda)
+    if (config_->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
       cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+    }
+    else if (config_->arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
-    amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
+      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
+    }
   }
 }
 

From ace56f3bac20e3ffd5949132e338d293aaa05c1f Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 18:48:17 +0800
Subject: [PATCH 15/26] del print

---
 python/taichi/lang/misc.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index 9ced08097bc52..cbb911949341d 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -748,7 +748,6 @@ def is_arch_supported(arch):
         cpu: lambda: True,
     }
     with_arch = arch_table.get(arch, lambda: False)
-    print(with_arch())
     try:
         return with_arch()
     except Exception as e:

From 3c10e6598a970a5bbe34040cb50ed7be03fb6ed7 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 17 Jan 2023 18:52:16 +0800
Subject: [PATCH 16/26] update test_api

---
 tests/python/test_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index 9aaa9d5328c0a..c695dd8b0e186 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -63,7 +63,7 @@ def _get_expected_matrix_apis():
     'TaichiAssertionError', 'TaichiCompilationError', 'TaichiNameError',
     'TaichiRuntimeError', 'TaichiRuntimeTypeError', 'TaichiSyntaxError',
     'TaichiTypeError', 'Texture', 'Vector', 'VectorNdarray', 'WARN', 'abs',
-    'acos', 'activate', 'ad', 'algorithms', 'aot', 'append', 'arm64', 'asin',
+    'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64', 'asin',
     'assume_in_range', 'atan2', 'atomic_add', 'atomic_and', 'atomic_max',
     'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', 'axes', 'bit_cast',
     'bit_shr', 'block_local', 'cache_read_only', 'cast', 'cc', 'ceil', 'cos',

From 742a86b8035f20a7a8abccf2e634156de854623c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 17 Jan 2023 10:54:31 +0000
Subject: [PATCH 17/26] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 taichi/runtime/llvm/llvm_runtime_executor.cpp |  3 +-
 tests/python/test_api.py                      | 44 +++++++++----------
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index b0a3c4cf672bf..6600dabb5bdae 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -612,8 +612,7 @@ void LlvmRuntimeExecutor::finalize() {
 #if defined(TI_WITH_CUDA)
       cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
-    }
-    else if (config_->arch == Arch::amdgpu) {
+    } else if (config_->arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
       amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index c695dd8b0e186..688ad62c37357 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -63,28 +63,28 @@ def _get_expected_matrix_apis():
     'TaichiAssertionError', 'TaichiCompilationError', 'TaichiNameError',
     'TaichiRuntimeError', 'TaichiRuntimeTypeError', 'TaichiSyntaxError',
     'TaichiTypeError', 'Texture', 'Vector', 'VectorNdarray', 'WARN', 'abs',
-    'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64', 'asin',
-    'assume_in_range', 'atan2', 'atomic_add', 'atomic_and', 'atomic_max',
-    'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', 'axes', 'bit_cast',
-    'bit_shr', 'block_local', 'cache_read_only', 'cast', 'cc', 'ceil', 'cos',
-    'cpu', 'cuda', 'data_oriented', 'dataclass', 'deactivate',
-    'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp', 'experimental',
-    'extension', 'f16', 'f32', 'f64', 'field', 'float16', 'float32', 'float64',
-    'floor', 'func', 'get_addr', 'get_compute_stream_device_time_elapsed_us',
-    'gles', 'global_thread_idx', 'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i',
-    'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il',
-    'init', 'int16', 'int32', 'int64', 'int8', 'is_active',
-    'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l',
-    'lang', 'length', 'linalg', 'log', 'loop_config', 'math', 'max',
-    'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange',
-    'no_activate', 'one', 'opengl', 'polar_decompose', 'pow', 'profiler',
-    'randn', 'random', 'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset',
-    'rgb_to_hex', 'root', 'round', 'rsqrt', 'select', 'set_logging_level',
-    'simt', 'sin', 'solve', 'sparse_matrix_builder', 'sqrt', 'static',
-    'static_assert', 'static_print', 'stop_grad', 'svd', 'swizzle_generator',
-    'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools', 'types', 'u16',
-    'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan',
-    'wasm', 'x64', 'x86_64', 'zero'
+    'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64',
+    'asin', 'assume_in_range', 'atan2', 'atomic_add', 'atomic_and',
+    'atomic_max', 'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor',
+    'axes', 'bit_cast', 'bit_shr', 'block_local', 'cache_read_only', 'cast',
+    'cc', 'ceil', 'cos', 'cpu', 'cuda', 'data_oriented', 'dataclass',
+    'deactivate', 'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp',
+    'experimental', 'extension', 'f16', 'f32', 'f64', 'field', 'float16',
+    'float32', 'float64', 'floor', 'func', 'get_addr',
+    'get_compute_stream_device_time_elapsed_us', 'gles', 'global_thread_idx',
+    'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8',
+    'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init', 'int16', 'int32',
+    'int64', 'int8', 'is_active', 'is_logging_effective', 'j', 'jk', 'jkl',
+    'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length', 'linalg', 'log',
+    'loop_config', 'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal',
+    'min', 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl',
+    'polar_decompose', 'pow', 'profiler', 'randn', 'random', 'raw_div',
+    'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round',
+    'rsqrt', 'select', 'set_logging_level', 'simt', 'sin', 'solve',
+    'sparse_matrix_builder', 'sqrt', 'static', 'static_assert', 'static_print',
+    'stop_grad', 'svd', 'swizzle_generator', 'sym_eig', 'sync', 'tan', 'tanh',
+    'template', 'tools', 'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16',
+    'uint32', 'uint64', 'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero'
 ]
 user_api[ti.ad] = [
     'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',

From 661bc22e0c17da7b3c9d7ca3fe7483cd506023a5 Mon Sep 17 00:00:00 2001
From: Zeyuli <li_zeyu@pku.edu.cn>
Date: Thu, 19 Jan 2023 13:52:08 +0800
Subject: [PATCH 18/26] add mark kernel

---
 taichi/codegen/amdgpu/codegen_amdgpu.cpp | 5 -----
 taichi/codegen/llvm/codegen_llvm.cpp     | 6 ++++++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 9ff56bfbd9f86..5d402a32ed444 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -442,11 +442,6 @@ FunctionType AMDGPUModuleToFunctionConverter::convert(
     LLVMCompiledKernel data) const {
   auto &mod = data.module;
   auto &tasks = data.tasks;
-  for (const auto &task : tasks) {
-    llvm::Function *func = mod->getFunction(task.name);
-    TI_ASSERT(func);
-    func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-  }
   auto jit = tlctx_->jit.get();
   auto amdgpu_module =
       jit->add_module(std::move(mod), executor_->get_config()->gpu_max_reg);
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 67e93955624a9..d80bd154a035e 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2639,6 +2639,12 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
       TI_ASSERT(func);
       tlctx->mark_function_as_cuda_kernel(func, task.block_dim);
     }
+  } else if (config.arch == Arch::amdgpu) {
+    for (const auto &task : offloaded_tasks) {
+      llvm::Function *func = mod->getFunction(task.name);
+      TI_ASSERT(func);
+      func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+    }
   }
 
   return {std::move(offloaded_tasks), std::move(module),

From aa49452c754c237922503283fcdd5a493b1d7508 Mon Sep 17 00:00:00 2001
From: Zeyuli <li_zeyu@pku.edu.cn>
Date: Thu, 19 Jan 2023 14:02:27 +0800
Subject: [PATCH 19/26] fix typo

---
 taichi/codegen/llvm/codegen_llvm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index d80bd154a035e..2bf0b8ec998bc 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2641,7 +2641,7 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
     }
   } else if (config.arch == Arch::amdgpu) {
     for (const auto &task : offloaded_tasks) {
-      llvm::Function *func = mod->getFunction(task.name);
+      llvm::Function *func = module->getFunction(task.name);
       TI_ASSERT(func);
       func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
     }

From d540255bd21134a9933c437c9bdd11bc0dd02f88 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 20 Jan 2023 07:20:40 +0000
Subject: [PATCH 20/26] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/python/test_api.py | 44 ++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index c1544305b3b17..0c2df0dd1496e 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -62,28 +62,28 @@ def _get_expected_matrix_apis():
     'TaichiAssertionError', 'TaichiCompilationError', 'TaichiNameError',
     'TaichiRuntimeError', 'TaichiRuntimeTypeError', 'TaichiSyntaxError',
     'TaichiTypeError', 'Texture', 'Vector', 'VectorNdarray', 'WARN', 'abs',
-    'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64', 'asin',
-    'assume_in_range', 'atan2', 'atomic_add', 'atomic_and', 'atomic_max',
-    'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', 'axes', 'bit_cast',
-    'bit_shr', 'block_local', 'cache_read_only', 'cast', 'cc', 'ceil', 'cos',
-    'cpu', 'cuda', 'data_oriented', 'dataclass', 'deactivate',
-    'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp', 'experimental',
-    'extension', 'f16', 'f32', 'f64', 'field', 'float16', 'float32', 'float64',
-    'floor', 'func', 'get_addr', 'get_compute_stream_device_time_elapsed_us',
-    'gles', 'global_thread_idx', 'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i',
-    'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il',
-    'init', 'int16', 'int32', 'int64', 'int8', 'is_active',
-    'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l',
-    'lang', 'length', 'linalg', 'log', 'loop_config', 'math', 'max',
-    'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange',
-    'no_activate', 'one', 'opengl', 'polar_decompose', 'pow', 'profiler',
-    'randn', 'random', 'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset',
-    'rgb_to_hex', 'root', 'round', 'rsqrt', 'select', 'set_logging_level',
-    'simt', 'sin', 'solve', 'sparse_matrix_builder', 'sqrt', 'static',
-    'static_assert', 'static_print', 'stop_grad', 'svd', 'sym_eig', 'sync',
-    'tan', 'tanh', 'template', 'tools', 'types', 'u16', 'u32', 'u64', 'u8',
-    'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan', 'wasm', 'x64',
-    'x86_64', 'zero'
+    'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64',
+    'asin', 'assume_in_range', 'atan2', 'atomic_add', 'atomic_and',
+    'atomic_max', 'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor',
+    'axes', 'bit_cast', 'bit_shr', 'block_local', 'cache_read_only', 'cast',
+    'cc', 'ceil', 'cos', 'cpu', 'cuda', 'data_oriented', 'dataclass',
+    'deactivate', 'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp',
+    'experimental', 'extension', 'f16', 'f32', 'f64', 'field', 'float16',
+    'float32', 'float64', 'floor', 'func', 'get_addr',
+    'get_compute_stream_device_time_elapsed_us', 'gles', 'global_thread_idx',
+    'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8',
+    'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init', 'int16', 'int32',
+    'int64', 'int8', 'is_active', 'is_logging_effective', 'j', 'jk', 'jkl',
+    'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length', 'linalg', 'log',
+    'loop_config', 'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal',
+    'min', 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl',
+    'polar_decompose', 'pow', 'profiler', 'randn', 'random', 'raw_div',
+    'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round',
+    'rsqrt', 'select', 'set_logging_level', 'simt', 'sin', 'solve',
+    'sparse_matrix_builder', 'sqrt', 'static', 'static_assert', 'static_print',
+    'stop_grad', 'svd', 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools',
+    'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64',
+    'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero'
 ]
 user_api[ti.ad] = [
     'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',

From 1146f9f75a99cd0865774a27b24731834b31fc96 Mon Sep 17 00:00:00 2001
From: Zeyuli <li_zeyu@pku.edu.cn>
Date: Fri, 20 Jan 2023 15:25:01 +0800
Subject: [PATCH 21/26] del redundant header files

---
 external/assets                               | 2 +-
 taichi/program/kernel.cpp                     | 4 ----
 taichi/program/program.cpp                    | 4 ----
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 8 +++++++-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/external/assets b/external/assets
index 2905391325512..150b16ad12ad5 160000
--- a/external/assets
+++ b/external/assets
@@ -1 +1 @@
-Subproject commit 2905391325512f58adb3f8684bafc06ef29f8e47
+Subproject commit 150b16ad12ad58a9a93b8988ded913e632a4df4f
diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index 0dff01fd363ff..754b990b3fb04 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -10,10 +10,6 @@
 #include "taichi/program/program.h"
 #include "taichi/util/action_recorder.h"
 
-#if defined(TI_WITH_AMDGPU)
-#include "taichi/rhi/amdgpu/amdgpu_driver.h"
-#endif
-
 #ifdef TI_WITH_LLVM
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #endif
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index aded7d81329dc..485449878081b 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -18,10 +18,6 @@
 #include "taichi/program/snode_expr_utils.h"
 #include "taichi/math/arithmetic.h"
 
-#if defined(TI_WITH_AMDGPU)
-#include "taichi/platform/amdgpu/detect_amdgpu.h"
-#endif
-
 #ifdef TI_WITH_LLVM
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/llvm/struct_llvm.h"
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 6600dabb5bdae..2ea3e69ab1f2c 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -420,11 +420,17 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     uint64 *result_buffer) {
   TaichiLLVMContext *tlctx = nullptr;
   if (config_->arch == Arch::cuda) {
-#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
+#if defined(TI_WITH_CUDA)
     tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
 #endif
+  } else if (config_->arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    tlctx = llvm_context_device_.get();
+#else
+    TI_NOT_IMPLEMENTED
+#endif 
   } else {
     tlctx = llvm_context_host_.get();
   }

From ce356f16a98c580415ae15699ed57236943b6d9d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 20 Jan 2023 07:26:17 +0000
Subject: [PATCH 22/26] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 2ea3e69ab1f2c..b39fd7a350d2f 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -430,7 +430,7 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
-#endif 
+#endif
   } else {
     tlctx = llvm_context_host_.get();
   }

From 18683c3240183a76867f0db27eb38e45b97d7602 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Sun, 29 Jan 2023 11:00:08 +0800
Subject: [PATCH 23/26] pack AMDGPU_KERNEL into mark_function_as_amdgpu_kernel

---
 taichi/codegen/llvm/codegen_llvm.cpp | 2 +-
 taichi/runtime/llvm/llvm_context.cpp | 4 ++++
 taichi/runtime/llvm/llvm_context.h   | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 9c07aeea84c3e..7e8c0b0e0c1c7 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2636,7 +2636,7 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
     for (const auto &task : offloaded_tasks) {
       llvm::Function *func = module->getFunction(task.name);
       TI_ASSERT(func);
-      func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+      tlctx->mark_function_as_amdgpu_kernel(func);
     }
   }
 
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 7d1c6f2c74bd6..41c7d86d7aff1 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -809,6 +809,10 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func,
   }
 }
 
+void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) {
+  func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+}
+
 void TaichiLLVMContext::eliminate_unused_functions(
     llvm::Module *module,
     std::function<bool(const std::string &)> export_indicator) {
diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h
index 866c80e7882e4..8f1ab6daa0a67 100644
--- a/taichi/runtime/llvm/llvm_context.h
+++ b/taichi/runtime/llvm/llvm_context.h
@@ -125,6 +125,8 @@ class TaichiLLVMContext {
 
   void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0);
 
+  void mark_function_as_amdgpu_kernel(llvm::Function *func);
+
   void fetch_this_thread_struct_module();
   llvm::Module *get_this_thread_runtime_module();
   llvm::Function *get_runtime_function(const std::string &name);

From f2fb0355113841cf0a86f746b87f9e28288ba96b Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Thu, 2 Feb 2023 16:09:43 +0800
Subject: [PATCH 24/26] fix config_ isn't a pointer

---
 external/assets                               |  2 +-
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 30 +++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/external/assets b/external/assets
index 150b16ad12ad5..2905391325512 160000
--- a/external/assets
+++ b/external/assets
@@ -1 +1 @@
-Subproject commit 150b16ad12ad58a9a93b8988ded913e632a4df4f
+Subproject commit 2905391325512f58adb3f8684bafc06ef29f8e47
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 773b14a1f9405..2fc59983dac9d 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -195,7 +195,7 @@ void LlvmRuntimeExecutor::synchronize() {
 #else
     TI_ERROR("No CUDA support");
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().stream_synchronize(nullptr);
 #else
@@ -217,7 +217,7 @@ uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
 #else
     TI_NOT_IMPLEMENTED;
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                        sizeof(uint64));
@@ -387,7 +387,7 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
 #else
     TI_NOT_IMPLEMENTED
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size);
 #else
@@ -405,7 +405,7 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
 #else
     TI_NOT_IMPLEMENTED
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     alloc = amdgpu_device()->import_memory(root_buffer, rounded_size);
 #else
@@ -464,7 +464,7 @@ cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() {
 }
 
 amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() {
-  if (config_->arch != Arch::amdgpu) {
+  if (config_.arch != Arch::amdgpu) {
     TI_ERROR("arch is not amdgpu");
   }
   return static_cast<amdgpu::AmdgpuDevice *>(device_.get());
@@ -506,7 +506,7 @@ void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
 #else
     TI_NOT_IMPLEMENTED
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().memset((void *)ptr, data, size);
 #else
@@ -525,7 +525,7 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
 #else
     TI_NOT_IMPLEMENTED
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
 #else
@@ -539,11 +539,11 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
 void LlvmRuntimeExecutor::finalize() {
   profiler_ = nullptr;
   if (preallocated_device_buffer_ != nullptr) {
-    if (config_->arch == Arch::cuda) {
+    if (config_.arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
       cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
-    } else if (config_->arch == Arch::amdgpu) {
+    } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
       amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
@@ -585,17 +585,17 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 #else
     TI_NOT_IMPLEMENTED
 #endif
-  } else if (config_->arch == Arch::amdgpu) {
+  } else if (config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_AMDGPU)
     AMDGPUDriver::get_instance().malloc(
         (void **)result_buffer_ptr,
         sizeof(uint64) * taichi_result_buffer_entries);
     const auto total_mem = AMDGPUContext::get_instance().get_total_memory();
-    if (config_->device_memory_fraction == 0) {
-      TI_ASSERT(config_->device_memory_GB > 0);
-      prealloc_size = std::size_t(config_->device_memory_GB * (1UL << 30));
+    if (config_.device_memory_fraction == 0) {
+      TI_ASSERT(config_.device_memory_GB > 0);
+      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
     } else {
-      prealloc_size = std::size_t(config_->device_memory_fraction * total_mem);
+      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
     }
     TI_ASSERT(prealloc_size <= total_mem);
 
@@ -630,7 +630,7 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
   // Number of random states. One per CPU/CUDA thread.
   int num_rand_states = 0;
 
-  if (config_->arch == Arch::cuda || config_->arch == Arch::amdgpu) {
+  if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) {
 #if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
     // It is important to make sure that every CUDA thread has its own random
     // state so that we do not need expensive per-state locks.

From d48dc5b7aca0c468051eb58e0b839c734cc00125 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Thu, 2 Feb 2023 16:39:15 +0800
Subject: [PATCH 25/26] update config

---
 taichi/codegen/llvm/codegen_llvm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 39b29cd75c415..b4cb8b1fd8883 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2650,7 +2650,7 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
       TI_ASSERT(func);
       tlctx->mark_function_as_cuda_kernel(func, task.block_dim);
     }
-  } else if (config.arch == Arch::amdgpu) {
+  } else if (compile_config.arch == Arch::amdgpu) {
     for (const auto &task : offloaded_tasks) {
       llvm::Function *func = module->getFunction(task.name);
       TI_ASSERT(func);

From 193f7c6d8cf344c7587ad590216344fc1adea295 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Fri, 3 Feb 2023 10:35:34 +0800
Subject: [PATCH 26/26] update jit_session

---
 taichi/jit/jit_session.cpp                    | 2 +-
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp
index 1d8b0bb4c28f6..7c0de151ce3f8 100644
--- a/taichi/jit/jit_session.cpp
+++ b/taichi/jit/jit_session.cpp
@@ -19,7 +19,7 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
 
 std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
     TaichiLLVMContext *tlctx,
-    CompileConfig *config,
+    const CompileConfig &config,
     Arch arch);
 #endif
 
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 2fc59983dac9d..644c329c7eb7a 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -612,7 +612,6 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 
     AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
                                         prealloc_size);
-    tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
 #endif