diff --git a/c_api/include/taichi/taichi_core.h b/c_api/include/taichi/taichi_core.h
index 299a4d830bf29..aaf88e68d30b6 100644
--- a/c_api/include/taichi/taichi_core.h
+++ b/c_api/include/taichi/taichi_core.h
@@ -369,6 +369,8 @@ typedef enum TiArch {
   TI_ARCH_OPENGL = 6,
   // OpenGL ES GPU backend.
   TI_ARCH_GLES = 7,
+  // AMDGPU backend.
+  TI_ARCH_AMDGPU = 8,
   TI_ARCH_MAX_ENUM = 0xffffffff,
 } TiArch;
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index b106039ac8a1c..09f27dcdbe703 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -93,7 +93,8 @@ file(GLOB TAICHI_CORE_SOURCE
         "taichi/system/*"
         "taichi/transforms/*"
         "taichi/aot/*.cpp" "taichi/aot/*.h"
-        "taichi/platform/cuda/*" "taichi/platform/mac/*" "taichi/platform/windows/*"
+        "taichi/platform/cuda/*" "taichi/platform/amdgpu/*"
+        "taichi/platform/mac/*" "taichi/platform/windows/*"
         "taichi/codegen/*.cpp" "taichi/codegen/*.h"
         "taichi/runtime/*.h" "taichi/runtime/*.cpp"
         "taichi/rhi/*.h" "taichi/rhi/*.cpp"
@@ -116,7 +117,7 @@ endif()
 
 if (TI_WITH_AMDGPU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
-# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
+  file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
   list(APPEND TAICHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
 endif()
diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index b21f533f1ab06..53ed51cceda1b 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -119,6 +119,11 @@
 """
 # ----------------------
 
+amdgpu = _ti_core.amdgpu
+"""The AMDGPU backend.
+"""
+# ----------------------
+
 metal = _ti_core.metal
 """The Apple Metal backend.
 """
@@ -159,9 +164,9 @@
 """
 # ----------------------
-gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles]
+gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles, amdgpu]
 """A list of GPU backends supported on the current system.
 
-Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles'.
+Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles', 'amdgpu'.
 
 When this is used, Taichi automatically picks the matching GPU backend. If no
 GPU is detected, Taichi falls back to the CPU backend.
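Note for reviewers: with the binding above, the new backend is reachable through the standard `ti.init` path. A minimal smoke run might look like this (illustrative only, not part of this diff; assumes a build with `TI_WITH_AMDGPU=ON` on a ROCm-capable machine):

```python
import taichi as ti

# Request the AMDGPU backend explicitly. If no ROCm API or device is
# detected, ti.init falls back to the host arch (see the TI_WARN paths
# added to llvm_runtime_executor.cpp further down).
ti.init(arch=ti.amdgpu)

x = ti.field(dtype=ti.f32, shape=16)

@ti.kernel
def fill():
    for i in x:  # the outermost loop is offloaded as an AMDGPU kernel
        x[i] = i * 2.0

fill()
print(x.to_numpy())
```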
@@ -726,6 +731,7 @@ def is_arch_supported(arch):
     arch_table = {
         cuda: _ti_core.with_cuda,
+        amdgpu: _ti_core.with_amdgpu,
         metal: _ti_core.with_metal,
         opengl: functools.partial(_ti_core.with_opengl, False),
         gles: functools.partial(_ti_core.with_opengl, True),
@@ -773,8 +779,8 @@ def get_compute_stream_device_time_elapsed_us() -> float:
 __all__ = [
     'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl',
     'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc',
-    'cpu', 'cuda', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
-    'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local',
-    'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset',
-    'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
+    'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan',
+    'extension', 'loop_config', 'global_thread_idx', 'assume_in_range',
+    'block_local', 'cache_read_only', 'init', 'mesh_local', 'no_activate',
+    'reset', 'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
 ]
diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 3923de44ed716..6d0e8b4c465c8 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -265,12 +265,32 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
   }
 
   void visit(GlobalLoadStmt *stmt) override {
-    if (auto get_ch = stmt->src->cast<GetChStmt>()) {
-      bool should_cache_as_read_only = current_offload->mem_access_opt.has_flag(
-          get_ch->output_snode, SNodeAccessFlag::read_only);
-      create_global_load(stmt, should_cache_as_read_only);
+    auto ptr = llvm_val[stmt->src];
+    auto ptr_type = stmt->src->ret_type->as<PointerType>();
+    if (ptr_type->is_bit_pointer()) {
+      auto val_type = ptr_type->get_pointee_type();
+      auto get_ch = stmt->src->as<GetChStmt>();
+      auto physical_type =
+          tlctx->get_data_type(get_ch->input_snode->physical_type);
+      auto [byte_ptr, bit_offset] = load_bit_ptr(ptr);
+      auto physical_value = builder->CreateLoad(physical_type, byte_ptr);
+      if (auto qit = val_type->cast<QuantIntType>()) {
+        llvm_val[stmt] = extract_quant_int(physical_value, bit_offset, qit);
+      } else if (auto qfxt = val_type->cast<QuantFixedType>()) {
+        qit = qfxt->get_digits_type()->as<QuantIntType>();
+        auto digits = extract_quant_int(physical_value, bit_offset, qit);
+        llvm_val[stmt] = reconstruct_quant_fixed(digits, qfxt);
+      } else {
+        TI_ASSERT(val_type->is<QuantFloatType>());
+        TI_ASSERT(get_ch->input_snode->dt->is<BitStructType>());
+        llvm_val[stmt] = extract_quant_float(
+            physical_value, get_ch->input_snode->dt->as<BitStructType>(),
+            get_ch->output_snode->id_in_bit_struct);
+      }
     } else {
-      create_global_load(stmt, false);
+      // Byte pointer case.
+      llvm_val[stmt] =
+          builder->CreateLoad(tlctx->get_data_type(stmt->ret_type), ptr);
     }
   }
diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp
index 65f8248f4d671..1bb19f198650a 100644
--- a/taichi/codegen/codegen.cpp
+++ b/taichi/codegen/codegen.cpp
@@ -14,6 +14,9 @@
 #if defined(TI_WITH_DX12)
 #include "taichi/codegen/dx12/codegen_dx12.h"
 #endif
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/codegen/amdgpu/codegen_amdgpu.h"
+#endif
 #include "taichi/system/timer.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
@@ -47,6 +50,12 @@ std::unique_ptr<KernelCodeGen> KernelCodeGen::create(
     return std::make_unique<KernelCodeGenDX12>(compile_config, kernel);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    return std::make_unique<KernelCodeGenAMDGPU>(compile_config, kernel);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     TI_NOT_IMPLEMENTED
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 45a30bbc89990..e8dbcfc516e55 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2634,6 +2634,12 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
       TI_ASSERT(func);
       tlctx->mark_function_as_cuda_kernel(func, task.block_dim);
     }
+  } else if (compile_config.arch == Arch::amdgpu) {
+    for (const auto &task : offloaded_tasks) {
+      llvm::Function *func = module->getFunction(task.name);
+      TI_ASSERT(func);
+      tlctx->mark_function_as_amdgpu_kernel(func);
+    }
   }
 
   return {std::move(offloaded_tasks), std::move(module),
diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h
index 979646b0a646b..51ff8772830df 100644
--- a/taichi/inc/archs.inc.h
+++ b/taichi/inc/archs.inc.h
@@ -14,6 +14,6 @@
 PER_ARCH(opengl)  // OpenGL Compute Shaders
 PER_ARCH(dx11)    // Microsoft DirectX 11, WIP
 PER_ARCH(dx12)    // Microsoft DirectX 12, WIP
 PER_ARCH(opencl)  // OpenCL, N/A
-PER_ARCH(amdgpu)  // AMD GPU, WIP
+PER_ARCH(amdgpu)  // AMD GPU
 PER_ARCH(vulkan)  // Vulkan
 PER_ARCH(gles)    // OpenGL ES
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
index 46bf005fdf96f..2dfb9204a3a38 100644
--- a/taichi/ir/frontend_ir.cpp
+++ b/taichi/ir/frontend_ir.cpp
@@ -84,7 +84,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) {
   strictly_serialized = config.strictly_serialized;
   mem_access_opt = config.mem_access_opt;
   block_dim = config.block_dim;
-  if (arch == Arch::cuda) {
+  if (arch == Arch::cuda || arch == Arch::amdgpu) {
     num_cpu_threads = 1;
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {  // cpu
@@ -1284,8 +1284,9 @@ void ASTBuilder::insert_for(const Expr &s,
 
 Expr ASTBuilder::insert_thread_idx_expr() {
   auto loop = stack_.size() ? stack_.back()->parent_stmt : nullptr;
-  TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_),
-              "ti.thread_idx() is only available in cuda or cpu context.");
+  TI_ERROR_IF(
+      arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu,
+      "ti.thread_idx() is only available in cuda, cpu or amdgpu contexts.");
   if (loop != nullptr) {
     auto i = stack_.size() - 1;
     while (!(loop->is<FrontendForStmt>())) {
diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
index 42e525567b2b7..405791152f86e 100644
--- a/taichi/ir/frontend_ir.h
+++ b/taichi/ir/frontend_ir.h
@@ -1029,7 +1029,7 @@ class ASTBuilder {
   }
 
   void block_dim(int v) {
-    if (arch_ == Arch::cuda || arch_ == Arch::vulkan) {
+    if (arch_ == Arch::cuda || arch_ == Arch::vulkan || arch_ == Arch::amdgpu) {
       TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
     } else {
       TI_ASSERT(bit::is_power_of_two(v));
diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp
index 73864832a8de4..7c0de151ce3f8 100644
--- a/taichi/jit/jit_session.cpp
+++ b/taichi/jit/jit_session.cpp
@@ -16,6 +16,11 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     TaichiLLVMContext *tlctx,
     const CompileConfig &config,
     Arch arch);
+
+std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
+    TaichiLLVMContext *tlctx,
+    const CompileConfig &config,
+    Arch arch);
 #endif
 
 JITSession::JITSession(TaichiLLVMContext *tlctx, const CompileConfig &config)
@@ -40,6 +45,12 @@ std::unique_ptr<JITSession> JITSession::create(TaichiLLVMContext *tlctx,
     return create_llvm_jit_session_cpu(tlctx, config, Arch::x64);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (arch == Arch::amdgpu) {
+#ifdef TI_WITH_AMDGPU
+    return create_llvm_jit_session_amdgpu(tlctx, config, arch);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   }
 #else
diff --git a/taichi/platform/amdgpu/detect_amdgpu.cpp b/taichi/platform/amdgpu/detect_amdgpu.cpp
new file mode 100644
index 0000000000000..8a5523f60055a
--- /dev/null
+++ b/taichi/platform/amdgpu/detect_amdgpu.cpp
@@ -0,0 +1,17 @@
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_driver.h"
+#endif
+
+namespace taichi {
+
+bool is_rocm_api_available() {
+#if defined(TI_WITH_AMDGPU)
+  return lang::AMDGPUDriver::get_instance_without_context().detected();
+#else
+  return false;
+#endif
+}
+
+}  // namespace taichi
diff --git a/taichi/platform/amdgpu/detect_amdgpu.h b/taichi/platform/amdgpu/detect_amdgpu.h
new file mode 100644
index 0000000000000..c7638e938ce0e
--- /dev/null
+++ b/taichi/platform/amdgpu/detect_amdgpu.h
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace taichi {
+bool is_rocm_api_available();
+}  // namespace taichi
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index 9e018b511d27c..6a6f9d2e965b1 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -57,7 +57,7 @@ CompileConfig::CompileConfig() {
   print_kernel_nvptx = false;
   print_kernel_llvm_ir_optimized = false;
 
-  // CUDA backend options:
+  // CUDA/AMDGPU backend options:
   device_memory_GB = 1;  // by default, preallocate 1 GB GPU memory
   device_memory_fraction = 0.0;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 1714edfd1c0df..99c4f6a177611 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -64,7 +64,7 @@ struct CompileConfig {
   bool print_kernel_llvm_ir_optimized;
   bool print_kernel_nvptx;
 
-  // CUDA backend options:
+  // CUDA/AMDGPU backend options:
   float64 device_memory_GB;
   float64 device_memory_fraction;
diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index 401463175f24b..12f01f0f7774f 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -66,7 +66,8 @@ void Kernel::operator()(const CompileConfig &compile_config,
   compiled_(ctx_builder.get_context());
 
   const auto arch = compile_config.arch;
-  if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda)) {
+  if (compile_config.debug &&
+      (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) {
     program->check_runtime_error();
   }
 }
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 77ce3213f5f55..63879e6e4255a 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -17,6 +17,7 @@
 #include "taichi/ir/frontend_ir.h"
 #include "taichi/program/snode_expr_utils.h"
 #include "taichi/math/arithmetic.h"
+
 #ifdef TI_WITH_LLVM
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/llvm/struct_llvm.h"
@@ -349,7 +350,7 @@ Ndarray *Program::create_ndarray(const DataType type,
   auto arr = std::make_unique<Ndarray>(this, type, shape, layout);
   if (zero_fill) {
     Arch arch = compile_config().arch;
-    if (arch_is_cpu(arch) || arch == Arch::cuda) {
+    if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) {
       fill_ndarray_fast_u32(arr.get(), /*data=*/0);
     } else if (arch != Arch::dx12) {
       // Device api support for dx12 backend are not complete yet
@@ -408,7 +409,8 @@ Texture *Program::create_texture(const DataType type,
 
 intptr_t Program::get_ndarray_data_ptr_as_int(const Ndarray *ndarray) {
   uint64_t *data_ptr{nullptr};
   if (arch_is_cpu(compile_config().arch) ||
-      compile_config().arch == Arch::cuda) {
+      compile_config().arch == Arch::cuda ||
+      compile_config().arch == Arch::amdgpu) {
     // For the LLVM backends, device allocation is a physical pointer.
     data_ptr =
         program_impl_->get_ndarray_alloc_info_ptr(ndarray->ndarray_alloc_);
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index 3a6c2d90360cd..65bc95777b781 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -24,6 +24,11 @@
 #include "taichi/rhi/cuda/cuda_driver.h"
 #endif
 
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_driver.h"
+#endif
+
 #ifdef TI_WITH_VULKAN
 #include "taichi/rhi/vulkan/vulkan_loader.h"
 #endif
@@ -144,6 +149,7 @@ void export_misc(py::module &m) {
   m.def("pop_python_print_buffer", []() { return py_cout.pop_content(); });
   m.def("toggle_python_print_buffer", [](bool opt) { py_cout.enabled = opt; });
   m.def("with_cuda", is_cuda_api_available);
+  m.def("with_amdgpu", is_rocm_api_available);
 #ifdef TI_WITH_METAL
   m.def("with_metal", taichi::lang::metal::is_metal_api_available);
 #else
diff --git a/taichi/rhi/arch.cpp b/taichi/rhi/arch.cpp
index a441b9345f08f..7e5beead84bb9 100644
--- a/taichi/rhi/arch.cpp
+++ b/taichi/rhi/arch.cpp
@@ -49,7 +49,7 @@ bool arch_is_cuda(Arch arch) {
 
 bool arch_uses_llvm(Arch arch) {
   return (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::cuda ||
-          arch == Arch::dx12 || arch == Arch::wasm);
+          arch == Arch::dx12 || arch == Arch::wasm || arch == Arch::amdgpu);
 }
 
 bool arch_is_gpu(Arch arch) {
diff --git a/taichi/rhi/interop/CMakeLists.txt b/taichi/rhi/interop/CMakeLists.txt
index 9f73e9761f80b..b88bca8b0fe1e 100644
--- a/taichi/rhi/interop/CMakeLists.txt
+++ b/taichi/rhi/interop/CMakeLists.txt
@@ -12,6 +12,10 @@ if (TI_WITH_CUDA)
   target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_CUDA)
 endif()
 
+if (TI_WITH_AMDGPU)
+  target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_AMDGPU)
+endif()
+
 if (TI_WITH_VULKAN)
   target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_VULKAN)
 endif()
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 8318e00194c68..14f103bea3e6a 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -105,6 +105,16 @@ TaichiLLVMContext::TaichiLLVMContext(const CompileConfig &config, Arch arch)
     LLVMInitializeDirectXTargetMC();
     LLVMInitializeDirectXTargetInfo();
     LLVMInitializeDirectXAsmPrinter();
+#endif
+  } else if (arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUAsmPrinter();
+    LLVMInitializeAMDGPUAsmParser();
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
 #if defined(TI_WITH_CUDA)
@@ -803,6 +813,10 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func,
   }
 }
 
+void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) {
+  func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+}
+
 void TaichiLLVMContext::eliminate_unused_functions(
     llvm::Module *module,
     std::function<bool(const std::string &)> export_indicator) {
diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h
index 76ef78812e794..8190c66cd8bfa 100644
--- a/taichi/runtime/llvm/llvm_context.h
+++ b/taichi/runtime/llvm/llvm_context.h
@@ -125,6 +125,8 @@ class TaichiLLVMContext {
 
   void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0);
 
+  void mark_function_as_amdgpu_kernel(llvm::Function *func);
+
   void fetch_this_thread_struct_module();
   llvm::Module *get_this_thread_runtime_module();
   llvm::Function *get_runtime_function(const std::string &name);
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 2e6c7e6763683..644c329c7eb7a 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -11,9 +11,10 @@
 #include "taichi/rhi/cuda/cuda_context.h"
 #endif
 
-#if defined(TI_WITH_AMDGPU)
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
 #include "taichi/rhi/amdgpu/amdgpu_driver.h"
 #include "taichi/rhi/amdgpu/amdgpu_device.h"
+#if defined(TI_WITH_AMDGPU)
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
 #endif
 
@@ -52,6 +53,21 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
     if (config.arch != Arch::cuda) {
       TI_WARN("Falling back to {}.", arch_name(host_arch()));
     }
+  } else if (config.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    if (!is_rocm_api_available()) {
+      TI_WARN("No AMDGPU ROCm API detected.");
+      config.arch = host_arch();
+    } else if (!AMDGPUContext::get_instance().detected()) {
+      TI_WARN("No AMDGPU device detected.");
+      config.arch = host_arch();
+    } else {
+      // AMDGPU runtime created successfully
+    }
+#else
+    TI_WARN("Taichi is not compiled with AMDGPU.");
+    config.arch = host_arch();
+#endif
   }
 
   if (config.kernel_profiler) {
@@ -108,6 +124,22 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
 #if defined(TI_WITH_AMDGPU)
   else if (config.arch == Arch::amdgpu) {
+    int num_workgroups{1};
+    AMDGPUDriver::get_instance().device_get_attribute(
+        &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+    int query_max_block_dim{1024};
+    AMDGPUDriver::get_instance().device_get_attribute(
+        &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
+    // HIP exposes no device attribute for the maximum number of resident
+    // blocks per CU, so 32 is used as an empirical upper bound when
+    // deriving saturating_grid_dim below.
+    int query_max_block_per_cu{32};
+    if (config.max_block_dim == 0) {
+      config.max_block_dim = query_max_block_dim;
+    }
+    if (config.saturating_grid_dim == 0) {
+      config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2;
+    }
     AMDGPUContext::get_instance().set_debug(config.debug);
     device_ = std::make_shared<amdgpu::AmdgpuDevice>();
   }
@@ -162,6 +194,12 @@ void LlvmRuntimeExecutor::synchronize() {
     CUDADriver::get_instance().stream_synchronize(nullptr);
 #else
     TI_ERROR("No CUDA support");
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().stream_synchronize(nullptr);
+#else
+    TI_ERROR("No AMDGPU support");
 #endif
   }
   fflush(stdout);
@@ -178,6 +216,13 @@ uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
                                                       sizeof(uint64));
 #else
     TI_NOT_IMPLEMENTED;
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
+                                                       sizeof(uint64));
+#else
+    TI_NOT_IMPLEMENTED;
 #endif
   } else {
     ret = result_buffer[i];
@@ -341,6 +386,12 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size);
+#else
+    TI_NOT_IMPLEMENTED;
 #endif
   } else {
     std::memset(root_buffer, 0, rounded_size);
@@ -353,6 +404,12 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     alloc = cuda_device()->import_memory(root_buffer, rounded_size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    alloc = amdgpu_device()->import_memory(root_buffer, rounded_size);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     alloc = cpu_device()->import_memory(root_buffer, rounded_size);
@@ -406,6 +463,13 @@ cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() {
   return static_cast<cuda::CudaDevice *>(device_.get());
 }
 
+amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() {
+  if (config_.arch != Arch::amdgpu) {
+    TI_ERROR("arch is not amdgpu");
+  }
+  return static_cast<amdgpu::AmdgpuDevice *>(device_.get());
+}
+
 cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() {
   TI_ERROR_IF(!arch_is_cpu(config_.arch), "arch is not cpu");
   return static_cast<cpu::CpuDevice *>(device_.get());
 }
@@ -441,6 +505,12 @@ void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
     CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().memset((void *)ptr, data, size);
+#else
+    TI_NOT_IMPLEMENTED;
 #endif
   } else {
     std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data);
@@ -454,6 +524,12 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
     return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr;
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr;
@@ -462,11 +538,17 @@
 
 void LlvmRuntimeExecutor::finalize() {
   profiler_ = nullptr;
-#if defined(TI_WITH_CUDA)
   if (preallocated_device_buffer_ != nullptr) {
-    cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
-  }
+    if (config_.arch == Arch::cuda) {
+#if defined(TI_WITH_CUDA)
+      cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
 #endif
+    } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
+#endif
+    }
+  }
 }
 
 void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
@@ -502,6 +584,36 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                              prealloc_size);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    AMDGPUDriver::get_instance().malloc(
+        (void **)result_buffer_ptr,
+        sizeof(uint64) * taichi_result_buffer_entries);
+    const auto total_mem = AMDGPUContext::get_instance().get_total_memory();
+    if (config_.device_memory_fraction == 0) {
+      TI_ASSERT(config_.device_memory_GB > 0);
+      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
+    } else {
+      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
+    }
+    TI_ASSERT(prealloc_size <= total_mem);
+
+    TI_TRACE("Allocating device memory {:.2f} GB",
+             1.0 * prealloc_size / (1UL << 30));
+
+    Device::AllocParams preallocated_device_buffer_alloc_params;
+    preallocated_device_buffer_alloc_params.size = prealloc_size;
+    preallocated_device_buffer_alloc_ = amdgpu_device()->allocate_memory(
+        preallocated_device_buffer_alloc_params);
+    amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info =
+        amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_);
+    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;
+
+    AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
+                                        prealloc_size);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     *result_buffer_ptr = (uint64 *)memory_pool->allocate(
@@ -517,8 +629,8 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 
   // Number of random states. One per CPU/CUDA thread.
   int num_rand_states = 0;
-  if (config_.arch == Arch::cuda) {
-#if defined(TI_WITH_CUDA)
+  if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) {
+#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
     // It is important to make sure that every CUDA thread has its own random
     // state so that we do not need expensive per-state locks.
     num_rand_states = config_.saturating_grid_dim * config_.max_block_dim;
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
index 58ebaf1bd68ae..5e3e7cf8afcdc 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.h
+++ b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -25,6 +25,10 @@ namespace cuda {
 class CudaDevice;
 }  // namespace cuda
 
+namespace amdgpu {
+class AmdgpuDevice;
+}  // namespace amdgpu
+
 namespace cpu {
 class CpuDevice;
 }  // namespace cpu
@@ -113,6 +117,7 @@ class LlvmRuntimeExecutor {
   /* -------------------------- */
   cuda::CudaDevice *cuda_device();
   cpu::CpuDevice *cpu_device();
+  amdgpu::AmdgpuDevice *amdgpu_device();
 
   void finalize();
diff --git a/taichi/runtime/llvm/runtime_module/CMakeLists.txt b/taichi/runtime/llvm/runtime_module/CMakeLists.txt
index 0edac71b82c7c..92303abcc675b 100644
--- a/taichi/runtime/llvm/runtime_module/CMakeLists.txt
+++ b/taichi/runtime/llvm/runtime_module/CMakeLists.txt
@@ -14,6 +14,6 @@ function(COMPILE_LLVM_RUNTIME rtm_arch)
 endfunction()
 
 # Build llvm-runtime for host arch and cuda (if available)
-foreach(arch IN LISTS HOST_ARCH CUDA_ARCH DX12_ARCH)
+foreach(arch IN LISTS HOST_ARCH CUDA_ARCH DX12_ARCH AMDGPU_ARCH)
   compile_llvm_runtime(${arch})
 endforeach()
diff --git a/taichi/runtime/program_impls/llvm/CMakeLists.txt b/taichi/runtime/program_impls/llvm/CMakeLists.txt
index ab4a73a9d30bb..0b63c2d618ae2 100644
--- a/taichi/runtime/program_impls/llvm/CMakeLists.txt
+++ b/taichi/runtime/program_impls/llvm/CMakeLists.txt
@@ -20,3 +20,7 @@ target_link_libraries(llvm_program_impl PRIVATE cpu_runtime)
 if (TI_WITH_CUDA)
   target_link_libraries(llvm_program_impl PRIVATE cuda_runtime)
 endif()
+
+if (TI_WITH_AMDGPU)
+  target_link_libraries(llvm_program_impl PRIVATE amdgpu_runtime)
+endif()
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.cpp b/taichi/runtime/program_impls/llvm/llvm_program.cpp
index 14273b4e62268..a2395954e3420 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.cpp
+++ b/taichi/runtime/program_impls/llvm/llvm_program.cpp
@@ -15,6 +15,10 @@
 #include "taichi/codegen/cuda/codegen_cuda.h"
 #endif
 
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/codegen/amdgpu/codegen_amdgpu.h"
+#endif
+
 #if defined(TI_WITH_DX12)
 #include "taichi/runtime/dx12/aot_module_builder_impl.h"
 #include "taichi/codegen/dx12/codegen_dx12.h"
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
index 600a134cc844a..75f83c1576437 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.h
+++ b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -26,6 +26,10 @@ namespace cuda {
 class CudaDevice;
 }  // namespace cuda
 
+namespace amdgpu {
+class AmdgpuDevice;
+}  // namespace amdgpu
+
 namespace cpu {
 class CpuDevice;
 }  // namespace cpu
diff --git a/taichi/system/memory_pool.h b/taichi/system/memory_pool.h
index 06dce540d3ad5..63fa6c0dcd8b4 100644
--- a/taichi/system/memory_pool.h
+++ b/taichi/system/memory_pool.h
@@ -27,6 +27,7 @@ class TI_DLL_EXPORT MemoryPool {
   MemRequestQueue *queue;
 
   void *cuda_stream{nullptr};
+  void *amdgpu_stream{nullptr};
 
   // In the future we wish to move the MemoryPool inside each Device
   // so that the memory allocated from each Device can be used as-is.
@@ -50,6 +51,7 @@ class TI_DLL_EXPORT MemoryPool {
 
  private:
   static constexpr bool use_cuda_stream = false;
+  static constexpr bool use_amdgpu_stream = false;
   Arch arch_;
   Device *device_;
 };
diff --git a/taichi/transforms/ast_to_ir.cpp b/taichi/transforms/ast_to_ir.cpp
index da40e8d665d82..d7cce13a19661 100644
--- a/taichi/transforms/ast_to_ir.cpp
+++ b/taichi/transforms/ast_to_ir.cpp
@@ -7,7 +7,7 @@ namespace irpass {
 
 static bool supports_lowering(Arch arch) {
   return arch_is_cpu(arch) || (arch == Arch::cuda) || (arch == Arch::dx12) ||
-         (arch == Arch::metal);
+         (arch == Arch::metal) || (arch == Arch::amdgpu);
 }
 
 void ast_to_ir(const CompileConfig &config,
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index cabcab019e305..33772f6314bd4 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -94,7 +94,8 @@ void compile_to_offloads(IRNode *ir,
 
   // TODO: strictly enforce bit vectorization for x86 cpu and CUDA now
   // create a separate CompileConfig flag for the new pass
-  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) {
+  if (arch_is_cpu(config.arch) || config.arch == Arch::cuda ||
+      config.arch == Arch::amdgpu) {
     irpass::bit_loop_vectorize(ir);
     irpass::type_check(ir, config);
     print("Bit Loop Vectorized");
diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index 6fdf9fec85a95..0c2df0dd1496e 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -62,28 +62,28 @@ def _get_expected_matrix_apis():
     'TaichiAssertionError', 'TaichiCompilationError', 'TaichiNameError',
     'TaichiRuntimeError', 'TaichiRuntimeTypeError', 'TaichiSyntaxError',
     'TaichiTypeError', 'Texture', 'Vector', 'VectorNdarray', 'WARN', 'abs',
-    'acos', 'activate', 'ad', 'algorithms', 'aot', 'append', 'arm64', 'asin',
-    'assume_in_range', 'atan2', 'atomic_add', 'atomic_and', 'atomic_max',
-    'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', 'axes', 'bit_cast',
-    'bit_shr', 'block_local', 'cache_read_only', 'cast', 'cc', 'ceil', 'cos',
-    'cpu', 'cuda', 'data_oriented', 'dataclass', 'deactivate',
-    'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp', 'experimental',
-    'extension', 'f16', 'f32', 'f64', 'field', 'float16', 'float32', 'float64',
-    'floor', 'func', 'get_addr', 'get_compute_stream_device_time_elapsed_us',
-    'gles', 'global_thread_idx', 'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i',
-    'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il',
-    'init', 'int16', 'int32', 'int64', 'int8', 'is_active',
-    'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l',
-    'lang', 'length', 'linalg', 'log', 'loop_config', 'math', 'max',
-    'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange',
-    'no_activate', 'one', 'opengl', 'polar_decompose', 'pow', 'profiler',
-    'randn', 'random', 'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset',
-    'rgb_to_hex', 'root', 'round', 'rsqrt', 'select', 'set_logging_level',
-    'simt', 'sin', 'solve', 'sparse_matrix_builder', 'sqrt', 'static',
-    'static_assert', 'static_print', 'stop_grad', 'svd', 'sym_eig', 'sync',
-    'tan', 'tanh', 'template', 'tools', 'types', 'u16', 'u32', 'u64', 'u8',
-    'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan', 'wasm', 'x64',
-    'x86_64', 'zero'
+    'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64',
+    'asin', 'assume_in_range', 'atan2', 'atomic_add', 'atomic_and',
+    'atomic_max', 'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor',
+    'axes', 'bit_cast', 'bit_shr', 'block_local', 'cache_read_only', 'cast',
+    'cc', 'ceil', 'cos', 'cpu', 'cuda', 'data_oriented', 'dataclass',
+    'deactivate', 'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp',
+    'experimental', 'extension', 'f16', 'f32', 'f64', 'field', 'float16',
+    'float32', 'float64', 'floor', 'func', 'get_addr',
+    'get_compute_stream_device_time_elapsed_us', 'gles', 'global_thread_idx',
+    'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8',
+    'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init', 'int16', 'int32',
+    'int64', 'int8', 'is_active', 'is_logging_effective', 'j', 'jk', 'jkl',
+    'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length', 'linalg', 'log',
+    'loop_config', 'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal',
+    'min', 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl',
+    'polar_decompose', 'pow', 'profiler', 'randn', 'random', 'raw_div',
+    'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round',
+    'rsqrt', 'select', 'set_logging_level', 'simt', 'sin', 'solve',
+    'sparse_matrix_builder', 'sqrt', 'static', 'static_assert', 'static_print',
+    'stop_grad', 'svd', 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools',
+    'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64',
+    'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero'
 ]
 user_api[ti.ad] = [
     'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',
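End-to-end, the availability check added in `misc.py` routes through the new `with_amdgpu` binding in `export_misc.cpp`, which calls `is_rocm_api_available()`. A hypothetical smoke-test sketch (not part of this diff; `is_arch_supported` and `current_cfg` are existing internals, used here only for illustration):

```python
import taichi as ti
from taichi.lang import impl, misc

# misc.is_arch_supported(ti.amdgpu) consults _ti_core.with_amdgpu, i.e.
# is_rocm_api_available() on the C++ side; it is False without ROCm.
if misc.is_arch_supported(ti.amdgpu):
    ti.init(arch=ti.amdgpu)
    assert impl.current_cfg().arch == ti.amdgpu
else:
    # On machines without ROCm, ti.init(arch=ti.amdgpu) would warn and
    # fall back to the host arch instead of failing hard.
    ti.init(arch=ti.cpu)
```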