galeselee · galeselee · Feb 3, 2023 · Feb 3, 2023 · Feb 3, 2023 · Feb 3, 2023
diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh
@@ -123,7 +123,7 @@ if [ -z "$GPU_TEST" ]; then
     fi
 elif [ ! -z "$AMDGPU_TEST" ]; then
     run-it cpu    $(nproc)
-    # run-it amdgpu 4
+    run-it amdgpu 8
 else
     run-it cuda   8
     run-it cpu    $(nproc)

diff --git a/c_api/include/taichi/taichi_core.h b/c_api/include/taichi/taichi_core.h
@@ -369,6 +369,8 @@ typedef enum TiArch {
   TI_ARCH_OPENGL = 6,
   // OpenGL ES GPU backend.
   TI_ARCH_GLES = 7,
+  // AMDGPU backend.
+  TI_ARCH_AMDGPU = 8,
   TI_ARCH_MAX_ENUM = 0xffffffff,
 } TiArch;
 

diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
@@ -93,7 +93,8 @@ file(GLOB TAICHI_CORE_SOURCE
     "taichi/system/*"
     "taichi/transforms/*"
     "taichi/aot/*.cpp" "taichi/aot/*.h"
-    "taichi/platform/cuda/*" "taichi/platform/mac/*" "taichi/platform/windows/*"
+    "taichi/platform/cuda/*" "taichi/platform/amdgpu/*"
+    "taichi/platform/mac/*" "taichi/platform/windows/*"
     "taichi/codegen/*.cpp" "taichi/codegen/*.h"
     "taichi/runtime/*.h" "taichi/runtime/*.cpp"
     "taichi/rhi/*.h" "taichi/rhi/*.cpp"
@@ -116,7 +117,8 @@ endif()
 
 if (TI_WITH_AMDGPU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
-# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
-  list(APPEND TAIHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
+  # Append the AMDGPU runtime source (if the glob finds it) to TAICHI_CORE_SOURCE.
+  file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
+  list(APPEND TAICHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
 endif()
 

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
@@ -119,6 +119,11 @@
 """
 # ----------------------
 
+amdgpu = _ti_core.amdgpu
+"""The AMDGPU backend.
+"""
+# ----------------------
+
 metal = _ti_core.metal
 """The Apple Metal backend.
 """
@@ -159,9 +164,9 @@
 """
 # ----------------------
 
-gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles]
+gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles, amdgpu]
 """A list of GPU backends supported on the current system.
-Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles'.
+Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles', 'amdgpu'.
 
 When this is used, Taichi automatically picks the matching GPU backend. If no
 GPU is detected, Taichi falls back to the CPU backend.
@@ -726,6 +731,7 @@ def is_arch_supported(arch):
 
     arch_table = {
         cuda: _ti_core.with_cuda,
+        amdgpu: _ti_core.with_amdgpu,
         metal: _ti_core.with_metal,
         opengl: functools.partial(_ti_core.with_opengl, False),
         gles: functools.partial(_ti_core.with_opengl, True),
@@ -773,8 +779,8 @@ def get_compute_stream_device_time_elapsed_us() -> float:
 __all__ = [
     'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl',
     'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc',
-    'cpu', 'cuda', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
-    'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local',
-    'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset',
-    'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
+    'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan',
+    'extension', 'loop_config', 'global_thread_idx', 'assume_in_range',
+    'block_local', 'cache_read_only', 'init', 'mesh_local', 'no_activate',
+    'reset', 'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
 ]
diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -84,7 +84,8 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
       } else {
         TI_NOT_IMPLEMENTED
       }
-    } else if (op == UnaryOpType::sgn) {
+    }  // TODO simplify the impl of sgn
+    else if (op == UnaryOpType::sgn) {
       if (input_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
         auto ashr = builder->CreateAShr(input, 31);
         auto sub = builder->CreateSub(0, input);
@@ -141,6 +142,57 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast);
+      } else if (input_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
+        auto func = builder->GetInsertBlock()->getParent();
+        auto bb_oeq_then = BasicBlock::Create(*llvm_context, "oeq_then", func);
+        auto bb_oeq_else = BasicBlock::Create(*llvm_context, "oeq_else");
+        auto bb_merge = BasicBlock::Create(*llvm_context, "merge");
+        auto bb_olt_then = BasicBlock::Create(*llvm_context, "olt_then", func);
+        auto bb_olt_else = BasicBlock::Create(*llvm_context, "olt_else");
+
+        auto alloc = builder->CreateAlloca(
+            llvm::Type::getDoubleTy(*llvm_context), (unsigned)5);
+        auto newty = llvm::PointerType::get(
+            llvm::Type::getDoubleTy(*llvm_context), (unsigned)0);
+        auto cast = builder->CreateAddrSpaceCast(alloc, newty);
+        auto fcmp_oeq = builder->CreateFCmpOEQ(
+            input,
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
+        builder->CreateCondBr(fcmp_oeq, bb_oeq_then, bb_oeq_else);
+        builder->SetInsertPoint(bb_oeq_then);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_oeq_then = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_oeq_else);
+        builder->SetInsertPoint(bb_oeq_else);
+        auto fcmp_olt = builder->CreateFCmpOLT(
+            input,
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
+        builder->CreateCondBr(fcmp_olt, bb_olt_then, bb_olt_else);
+        bb_oeq_else = builder->GetInsertBlock();
+
+        builder->SetInsertPoint(bb_olt_then);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), -1),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_olt_then = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_olt_else);
+        builder->SetInsertPoint(bb_olt_else);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_olt_else = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_merge);
+        builder->SetInsertPoint(bb_merge);
+        llvm_val[stmt] =
+            builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast);
       }
     }
     UNARY_STD(cos)
@@ -265,12 +317,32 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
   }
 
   void visit(GlobalLoadStmt *stmt) override {
-    if (auto get_ch = stmt->src->cast<GetChStmt>()) {
-      bool should_cache_as_read_only = current_offload->mem_access_opt.has_flag(
-          get_ch->output_snode, SNodeAccessFlag::read_only);
-      create_global_load(stmt, should_cache_as_read_only);
+    auto ptr = llvm_val[stmt->src];
+    auto ptr_type = stmt->src->ret_type->as<PointerType>();
+    if (ptr_type->is_bit_pointer()) {
+      auto val_type = ptr_type->get_pointee_type();
+      auto get_ch = stmt->src->as<GetChStmt>();
+      auto physical_type =
+          tlctx->get_data_type(get_ch->input_snode->physical_type);
+      auto [byte_ptr, bit_offset] = load_bit_ptr(ptr);
+      auto physical_value = builder->CreateLoad(physical_type, byte_ptr);
+      if (auto qit = val_type->cast<QuantIntType>()) {
+        llvm_val[stmt] = extract_quant_int(physical_value, bit_offset, qit);
+      } else if (auto qfxt = val_type->cast<QuantFixedType>()) {
+        qit = qfxt->get_digits_type()->as<QuantIntType>();
+        auto digits = extract_quant_int(physical_value, bit_offset, qit);
+        llvm_val[stmt] = reconstruct_quant_fixed(digits, qfxt);
+      } else {
+        TI_ASSERT(val_type->is<QuantFloatType>());
+        TI_ASSERT(get_ch->input_snode->dt->is<BitStructType>());
+        llvm_val[stmt] = extract_quant_float(
+            physical_value, get_ch->input_snode->dt->as<BitStructType>(),
+            get_ch->output_snode->id_in_bit_struct);
+      }
     } else {
-      create_global_load(stmt, false);
+      // Byte pointer case.
+      llvm_val[stmt] =
+          builder->CreateLoad(tlctx->get_data_type(stmt->ret_type), ptr);
     }
   }
 
@@ -370,7 +442,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         llvm_val[stmt] = call("__ocml_pow_f16", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
         llvm_val[stmt] = call("__ocml_pow_f32", {lhs, rhs});
-      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
+      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
         llvm_val[stmt] = call("__ocml_pow_f64", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
         auto sitofp_lhs_ =
@@ -388,7 +460,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         llvm_val[stmt] = call("__ocml_atan2_f16", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
         llvm_val[stmt] = call("__ocml_atan2_f32", {lhs, rhs});
-      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
+      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
         llvm_val[stmt] = call("__ocml_atan2_f64", {lhs, rhs});
       } else {
         TI_NOT_IMPLEMENTED

diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp
@@ -14,6 +14,9 @@
 #if defined(TI_WITH_DX12)
 #include "taichi/codegen/dx12/codegen_dx12.h"
 #endif
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/codegen/amdgpu/codegen_amdgpu.h"
+#endif
 #include "taichi/system/timer.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
@@ -47,6 +50,12 @@ std::unique_ptr<KernelCodeGen> KernelCodeGen::create(
     return std::make_unique<KernelCodeGenDX12>(compile_config, kernel);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (arch == Arch::amdgpu) {
+#if defined(TI_WITH_AMDGPU)
+    return std::make_unique<KernelCodeGenAMDGPU>(compile_config, kernel);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     TI_NOT_IMPLEMENTED

diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -2634,6 +2634,12 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
       TI_ASSERT(func);
       tlctx->mark_function_as_cuda_kernel(func, task.block_dim);
     }
+  } else if (compile_config.arch == Arch::amdgpu) {
+    for (const auto &task : offloaded_tasks) {
+      llvm::Function *func = module->getFunction(task.name);
+      TI_ASSERT(func);
+      tlctx->mark_function_as_amdgpu_kernel(func);
+    }
   }
 
   return {std::move(offloaded_tasks), std::move(module),

diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h
@@ -14,6 +14,6 @@ PER_ARCH(opengl)  // OpenGL Compute Shaders
 PER_ARCH(dx11)    // Microsoft DirectX 11, WIP
 PER_ARCH(dx12)    // Microsoft DirectX 12, WIP
 PER_ARCH(opencl)  // OpenCL, N/A
-PER_ARCH(amdgpu)  // AMD GPU, WIP
+PER_ARCH(amdgpu)  // AMD GPU
 PER_ARCH(vulkan)  // Vulkan
 PER_ARCH(gles)    // OpenGL ES
diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
@@ -84,7 +84,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) {
   strictly_serialized = config.strictly_serialized;
   mem_access_opt = config.mem_access_opt;
   block_dim = config.block_dim;
-  if (arch == Arch::cuda) {
+  if (arch == Arch::cuda || arch == Arch::amdgpu) {
     num_cpu_threads = 1;
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {  // cpu
@@ -1284,8 +1284,9 @@ void ASTBuilder::insert_for(const Expr &s,
 
 Expr ASTBuilder::insert_thread_idx_expr() {
   auto loop = stack_.size() ? stack_.back()->parent_stmt : nullptr;
-  TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_),
-              "ti.thread_idx() is only available in cuda or cpu context.");
+  TI_ERROR_IF(
+      arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu,
+      "ti.thread_idx() is only available in cuda or cpu or amdgpu context.");
   if (loop != nullptr) {
     auto i = stack_.size() - 1;
     while (!(loop->is<FrontendForStmt>())) {

diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h
@@ -1029,7 +1029,7 @@ class ASTBuilder {
   }
 
   void block_dim(int v) {
-    if (arch_ == Arch::cuda || arch_ == Arch::vulkan) {
+    if (arch_ == Arch::cuda || arch_ == Arch::vulkan || arch_ == Arch::amdgpu) {
       TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
     } else {
       TI_ASSERT(bit::is_power_of_two(v));

diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp
@@ -16,6 +16,11 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     TaichiLLVMContext *tlctx,
     const CompileConfig &config,
     Arch arch);
+
+std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
+    TaichiLLVMContext *tlctx,
+    const CompileConfig &config,
+    Arch arch);
 #endif
 
 JITSession::JITSession(TaichiLLVMContext *tlctx, const CompileConfig &config)
@@ -40,6 +45,12 @@ std::unique_ptr<JITSession> JITSession::create(TaichiLLVMContext *tlctx,
     return create_llvm_jit_session_cpu(tlctx, config, Arch::x64);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (arch == Arch::amdgpu) {
+#ifdef TI_WITH_AMDGPU
+    return create_llvm_jit_session_amdgpu(tlctx, config, arch);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   }
 #else

diff --git a/taichi/platform/amdgpu/detect_amdgpu.cpp b/taichi/platform/amdgpu/detect_amdgpu.cpp
@@ -0,0 +1,20 @@
+#include "taichi/platform/amdgpu/detect_amdgpu.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_driver.h"
+#endif
+
+namespace taichi {
+
+// Returns the detected() status reported by the AMDGPU driver instance.
+// Always returns false when Taichi is built without TI_WITH_AMDGPU.
+bool is_rocm_api_available() {
+#if !defined(TI_WITH_AMDGPU)
+  return false;
+#else
+  auto &&driver = lang::AMDGPUDriver::get_instance_without_context();
+  return driver.detected();
+#endif
+}
+
+}  // namespace taichi
diff --git a/taichi/platform/amdgpu/detect_amdgpu.h b/taichi/platform/amdgpu/detect_amdgpu.h
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace taichi {
+bool is_rocm_api_available();
+}  // namespace taichi
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
@@ -57,7 +57,7 @@ CompileConfig::CompileConfig() {
   print_kernel_nvptx = false;
   print_kernel_llvm_ir_optimized = false;
 
-  // CUDA backend options:
+  // CUDA/AMDGPU backend options:
   device_memory_GB = 1;  // by default, preallocate 1 GB GPU memory
   device_memory_fraction = 0.0;
 

diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
@@ -64,7 +64,7 @@ struct CompileConfig {
   bool print_kernel_llvm_ir_optimized;
   bool print_kernel_nvptx;
 
-  // CUDA backend options:
+  // CUDA/AMDGPU backend options:
   float64 device_memory_GB;
   float64 device_memory_fraction;
 

diff --git a/taichi/program/extension.cpp b/taichi/program/extension.cpp
@@ -19,6 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
        {Extension::sparse, Extension::quant, Extension::quant_basic,
         Extension::data64, Extension::adstack, Extension::bls,
         Extension::assertion, Extension::mesh}},
+      {Arch::amdgpu, {Extension::assertion}},
       {Arch::metal, {}},
       {Arch::opengl, {Extension::extfunc}},
       {Arch::gles, {}},

diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
@@ -66,7 +66,8 @@ void Kernel::operator()(const CompileConfig &compile_config,
   compiled_(ctx_builder.get_context());
 
   const auto arch = compile_config.arch;
-  if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda)) {
+  if (compile_config.debug &&
+      (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) {
     program->check_runtime_error();
   }
 }

diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
@@ -17,6 +17,7 @@
 #include "taichi/ir/frontend_ir.h"
 #include "taichi/program/snode_expr_utils.h"
 #include "taichi/math/arithmetic.h"
+
 #ifdef TI_WITH_LLVM
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/llvm/struct_llvm.h"
@@ -349,7 +350,7 @@ Ndarray *Program::create_ndarray(const DataType type,
   auto arr = std::make_unique<Ndarray>(this, type, shape, layout);
   if (zero_fill) {
     Arch arch = compile_config().arch;
-    if (arch_is_cpu(arch) || arch == Arch::cuda) {
+    if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) {
       fill_ndarray_fast_u32(arr.get(), /*data=*/0);
     } else if (arch != Arch::dx12) {
       // Device api support for dx12 backend are not complete yet
@@ -408,7 +409,8 @@ Texture *Program::create_texture(const DataType type,
 intptr_t Program::get_ndarray_data_ptr_as_int(const Ndarray *ndarray) {
   uint64_t *data_ptr{nullptr};
   if (arch_is_cpu(compile_config().arch) ||
-      compile_config().arch == Arch::cuda) {
+      compile_config().arch == Arch::cuda ||
+      compile_config().arch == Arch::amdgpu) {
     // For the LLVM backends, device allocation is a physical pointer.
     data_ptr =
         program_impl_->get_ndarray_alloc_info_ptr(ndarray->ndarray_alloc_);