From 1445eb8ee91948cd3876cfc37b9c4869f33f87bc Mon Sep 17 00:00:00 2001 From: Zeyu Li Date: Fri, 3 Feb 2023 11:36:51 +0800 Subject: [PATCH] [amdgpu] Part5 enable the api of amdgpu (#7202) Issue: https://github.com/taichi-dev/taichi/issues/6434 ### Brief Summary 1. enable amdgpu api in taichi(except struct for) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- c_api/include/taichi/taichi_core.h | 2 + cmake/TaichiCore.cmake | 5 +- python/taichi/lang/misc.py | 18 ++- taichi/codegen/amdgpu/codegen_amdgpu.cpp | 30 ++++- taichi/codegen/codegen.cpp | 9 ++ taichi/codegen/llvm/codegen_llvm.cpp | 6 + taichi/inc/archs.inc.h | 2 +- taichi/ir/frontend_ir.cpp | 7 +- taichi/ir/frontend_ir.h | 2 +- taichi/jit/jit_session.cpp | 11 ++ taichi/platform/amdgpu/detect_amdgpu.cpp | 17 +++ taichi/platform/amdgpu/detect_amdgpu.h | 5 + taichi/program/compile_config.cpp | 2 +- taichi/program/compile_config.h | 2 +- taichi/program/kernel.cpp | 3 +- taichi/program/program.cpp | 6 +- taichi/python/export_misc.cpp | 6 + taichi/rhi/arch.cpp | 2 +- taichi/rhi/interop/CMakeLists.txt | 4 + taichi/runtime/llvm/llvm_context.cpp | 14 ++ taichi/runtime/llvm/llvm_context.h | 2 + taichi/runtime/llvm/llvm_runtime_executor.cpp | 124 +++++++++++++++++- taichi/runtime/llvm/llvm_runtime_executor.h | 5 + .../llvm/runtime_module/CMakeLists.txt | 2 +- .../runtime/program_impls/llvm/CMakeLists.txt | 4 + .../program_impls/llvm/llvm_program.cpp | 4 + .../runtime/program_impls/llvm/llvm_program.h | 4 + taichi/system/memory_pool.h | 2 + taichi/transforms/ast_to_ir.cpp | 2 +- taichi/transforms/compile_to_offloads.cpp | 3 +- tests/python/test_api.py | 44 +++---- 31 files changed, 294 insertions(+), 55 deletions(-) create mode 100644 taichi/platform/amdgpu/detect_amdgpu.cpp create mode 100644 taichi/platform/amdgpu/detect_amdgpu.h diff --git a/c_api/include/taichi/taichi_core.h b/c_api/include/taichi/taichi_core.h index 299a4d830bf29..aaf88e68d30b6 100644 --- a/c_api/include/taichi/taichi_core.h +++ b/c_api/include/taichi/taichi_core.h @@ -369,6 +369,8 @@ typedef enum TiArch { TI_ARCH_OPENGL = 6, // OpenGL ES GPU backend. TI_ARCH_GLES = 7, + // AMDGPU backend + TI_ARCH_AMDGPU = 8, TI_ARCH_MAX_ENUM = 0xffffffff, } TiArch; diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index b106039ac8a1c..09f27dcdbe703 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -93,7 +93,8 @@ file(GLOB TAICHI_CORE_SOURCE "taichi/system/*" "taichi/transforms/*" "taichi/aot/*.cpp" "taichi/aot/*.h" - "taichi/platform/cuda/*" "taichi/platform/mac/*" "taichi/platform/windows/*" + "taichi/platform/cuda/*" "taichi/platform/amdgpu/*" + "taichi/platform/mac/*" "taichi/platform/windows/*" "taichi/codegen/*.cpp" "taichi/codegen/*.h" "taichi/runtime/*.h" "taichi/runtime/*.cpp" "taichi/rhi/*.h" "taichi/rhi/*.cpp" @@ -116,7 +117,7 @@ endif() if (TI_WITH_AMDGPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU") -# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp") + file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp") list(APPEND TAIHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE}) endif() diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index b21f533f1ab06..53ed51cceda1b 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -119,6 +119,11 @@ """ # ---------------------- +amdgpu = _ti_core.amdgpu +"""The AMDGPU backend. +""" +# ---------------------- + metal = _ti_core.metal """The Apple Metal backend. """ @@ -159,9 +164,9 @@ """ # ---------------------- -gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles] +gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles, amdgpu] """A list of GPU backends supported on the current system. -Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles'. +Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles', 'amdgpu'. When this is used, Taichi automatically picks the matching GPU backend. If no GPU is detected, Taichi falls back to the CPU backend. @@ -726,6 +731,7 @@ def is_arch_supported(arch): arch_table = { cuda: _ti_core.with_cuda, + amdgpu: _ti_core.with_amdgpu, metal: _ti_core.with_metal, opengl: functools.partial(_ti_core.with_opengl, False), gles: functools.partial(_ti_core.with_opengl, True), @@ -773,8 +779,8 @@ def get_compute_stream_device_time_elapsed_us() -> float: __all__ = [ 'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl', 'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc', - 'cpu', 'cuda', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension', - 'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local', - 'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset', - 'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us' + 'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', + 'extension', 'loop_config', 'global_thread_idx', 'assume_in_range', + 'block_local', 'cache_read_only', 'init', 'mesh_local', 'no_activate', + 'reset', 'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us' ] diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp index 3923de44ed716..6d0e8b4c465c8 100644 --- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp +++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp @@ -265,12 +265,32 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { } void visit(GlobalLoadStmt *stmt) override { - if (auto get_ch = stmt->src->cast()) { - bool should_cache_as_read_only = current_offload->mem_access_opt.has_flag( - get_ch->output_snode, SNodeAccessFlag::read_only); - create_global_load(stmt, should_cache_as_read_only); + auto ptr = llvm_val[stmt->src]; + auto ptr_type = stmt->src->ret_type->as(); + if (ptr_type->is_bit_pointer()) { + auto val_type = ptr_type->get_pointee_type(); + auto get_ch = stmt->src->as(); + auto physical_type = + tlctx->get_data_type(get_ch->input_snode->physical_type); + auto [byte_ptr, bit_offset] = load_bit_ptr(ptr); + auto physical_value = builder->CreateLoad(physical_type, byte_ptr); + if (auto qit = val_type->cast()) { + llvm_val[stmt] = extract_quant_int(physical_value, bit_offset, qit); + } else if (auto qfxt = val_type->cast()) { + qit = qfxt->get_digits_type()->as(); + auto digits = extract_quant_int(physical_value, bit_offset, qit); + llvm_val[stmt] = reconstruct_quant_fixed(digits, qfxt); + } else { + TI_ASSERT(val_type->is()); + TI_ASSERT(get_ch->input_snode->dt->is()); + llvm_val[stmt] = extract_quant_float( + physical_value, get_ch->input_snode->dt->as(), + get_ch->output_snode->id_in_bit_struct); + } } else { - create_global_load(stmt, false); + // Byte pointer case. + llvm_val[stmt] = + builder->CreateLoad(tlctx->get_data_type(stmt->ret_type), ptr); } } diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp index 65f8248f4d671..1bb19f198650a 100644 --- a/taichi/codegen/codegen.cpp +++ b/taichi/codegen/codegen.cpp @@ -14,6 +14,9 @@ #if defined(TI_WITH_DX12) #include "taichi/codegen/dx12/codegen_dx12.h" #endif +#if defined(TI_WITH_AMDGPU) +#include "taichi/codegen/amdgpu/codegen_amdgpu.h" +#endif #include "taichi/system/timer.h" #include "taichi/ir/analysis.h" #include "taichi/ir/transforms.h" @@ -47,6 +50,12 @@ std::unique_ptr KernelCodeGen::create( return std::make_unique(compile_config, kernel); #else TI_NOT_IMPLEMENTED +#endif + } else if (arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + return std::make_unique(compile_config, kernel); +#else + TI_NOT_IMPLEMENTED #endif } else { TI_NOT_IMPLEMENTED diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp index 45a30bbc89990..e8dbcfc516e55 100644 --- a/taichi/codegen/llvm/codegen_llvm.cpp +++ b/taichi/codegen/llvm/codegen_llvm.cpp @@ -2634,6 +2634,12 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() { TI_ASSERT(func); tlctx->mark_function_as_cuda_kernel(func, task.block_dim); } + } else if (compile_config.arch == Arch::amdgpu) { + for (const auto &task : offloaded_tasks) { + llvm::Function *func = module->getFunction(task.name); + TI_ASSERT(func); + tlctx->mark_function_as_amdgpu_kernel(func); + } } return {std::move(offloaded_tasks), std::move(module), diff --git a/taichi/inc/archs.inc.h b/taichi/inc/archs.inc.h index 979646b0a646b..51ff8772830df 100644 --- a/taichi/inc/archs.inc.h +++ b/taichi/inc/archs.inc.h @@ -14,6 +14,6 @@ PER_ARCH(opengl) // OpenGL Compute Shaders PER_ARCH(dx11) // Microsoft DirectX 11, WIP PER_ARCH(dx12) // Microsoft DirectX 12, WIP PER_ARCH(opencl) // OpenCL, N/A -PER_ARCH(amdgpu) // AMD GPU, WIP +PER_ARCH(amdgpu) // AMD GPU PER_ARCH(vulkan) // Vulkan PER_ARCH(gles) // OpenGL ES diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 46bf005fdf96f..2dfb9204a3a38 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -84,7 +84,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) { strictly_serialized = config.strictly_serialized; mem_access_opt = config.mem_access_opt; block_dim = config.block_dim; - if (arch == Arch::cuda) { + if (arch == Arch::cuda || arch == Arch::amdgpu) { num_cpu_threads = 1; TI_ASSERT(block_dim <= taichi_max_gpu_block_dim); } else { // cpu @@ -1284,8 +1284,9 @@ void ASTBuilder::insert_for(const Expr &s, Expr ASTBuilder::insert_thread_idx_expr() { auto loop = stack_.size() ? stack_.back()->parent_stmt : nullptr; - TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_), - "ti.thread_idx() is only available in cuda or cpu context."); + TI_ERROR_IF( + arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu, + "ti.thread_idx() is only available in cuda or cpu or amdgpu context."); if (loop != nullptr) { auto i = stack_.size() - 1; while (!(loop->is())) { diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 42e525567b2b7..405791152f86e 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -1029,7 +1029,7 @@ class ASTBuilder { } void block_dim(int v) { - if (arch_ == Arch::cuda || arch_ == Arch::vulkan) { + if (arch_ == Arch::cuda || arch_ == Arch::vulkan || arch_ == Arch::amdgpu) { TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v)); } else { TI_ASSERT(bit::is_power_of_two(v)); diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp index 73864832a8de4..7c0de151ce3f8 100644 --- a/taichi/jit/jit_session.cpp +++ b/taichi/jit/jit_session.cpp @@ -16,6 +16,11 @@ std::unique_ptr create_llvm_jit_session_cuda( TaichiLLVMContext *tlctx, const CompileConfig &config, Arch arch); + +std::unique_ptr create_llvm_jit_session_amdgpu( + TaichiLLVMContext *tlctx, + const CompileConfig &config, + Arch arch); #endif JITSession::JITSession(TaichiLLVMContext *tlctx, const CompileConfig &config) @@ -40,6 +45,12 @@ std::unique_ptr JITSession::create(TaichiLLVMContext *tlctx, return create_llvm_jit_session_cpu(tlctx, config, Arch::x64); #else TI_NOT_IMPLEMENTED +#endif + } else if (arch == Arch::amdgpu) { +#ifdef TI_WITH_AMDGPU + return create_llvm_jit_session_amdgpu(tlctx, config, arch); +#else + TI_NOT_IMPLEMENTED #endif } #else diff --git a/taichi/platform/amdgpu/detect_amdgpu.cpp b/taichi/platform/amdgpu/detect_amdgpu.cpp new file mode 100644 index 0000000000000..8a5523f60055a --- /dev/null +++ b/taichi/platform/amdgpu/detect_amdgpu.cpp @@ -0,0 +1,17 @@ +#include "taichi/platform/amdgpu/detect_amdgpu.h" + +#if defined(TI_WITH_AMDGPU) +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#endif + +namespace taichi { + +bool is_rocm_api_available() { +#if defined(TI_WITH_AMDGPU) + return lang::AMDGPUDriver::get_instance_without_context().detected(); +#else + return false; +#endif +} + +} // namespace taichi diff --git a/taichi/platform/amdgpu/detect_amdgpu.h b/taichi/platform/amdgpu/detect_amdgpu.h new file mode 100644 index 0000000000000..c7638e938ce0e --- /dev/null +++ b/taichi/platform/amdgpu/detect_amdgpu.h @@ -0,0 +1,5 @@ +#pragma once + +namespace taichi { +bool is_rocm_api_available(); +} // namespace taichi diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp index 9e018b511d27c..6a6f9d2e965b1 100644 --- a/taichi/program/compile_config.cpp +++ b/taichi/program/compile_config.cpp @@ -57,7 +57,7 @@ CompileConfig::CompileConfig() { print_kernel_nvptx = false; print_kernel_llvm_ir_optimized = false; - // CUDA backend options: + // CUDA/AMDGPU backend options: device_memory_GB = 1; // by default, preallocate 1 GB GPU memory device_memory_fraction = 0.0; diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h index 1714edfd1c0df..99c4f6a177611 100644 --- a/taichi/program/compile_config.h +++ b/taichi/program/compile_config.h @@ -64,7 +64,7 @@ struct CompileConfig { bool print_kernel_llvm_ir_optimized; bool print_kernel_nvptx; - // CUDA backend options: + // CUDA/AMDGPU backend options: float64 device_memory_GB; float64 device_memory_fraction; diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 401463175f24b..12f01f0f7774f 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -66,7 +66,8 @@ void Kernel::operator()(const CompileConfig &compile_config, compiled_(ctx_builder.get_context()); const auto arch = compile_config.arch; - if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda)) { + if (compile_config.debug && + (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) { program->check_runtime_error(); } } diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 77ce3213f5f55..63879e6e4255a 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -17,6 +17,7 @@ #include "taichi/ir/frontend_ir.h" #include "taichi/program/snode_expr_utils.h" #include "taichi/math/arithmetic.h" + #ifdef TI_WITH_LLVM #include "taichi/runtime/program_impls/llvm/llvm_program.h" #include "taichi/codegen/llvm/struct_llvm.h" @@ -349,7 +350,7 @@ Ndarray *Program::create_ndarray(const DataType type, auto arr = std::make_unique(this, type, shape, layout); if (zero_fill) { Arch arch = compile_config().arch; - if (arch_is_cpu(arch) || arch == Arch::cuda) { + if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) { fill_ndarray_fast_u32(arr.get(), /*data=*/0); } else if (arch != Arch::dx12) { // Device api support for dx12 backend are not complete yet @@ -408,7 +409,8 @@ Texture *Program::create_texture(const DataType type, intptr_t Program::get_ndarray_data_ptr_as_int(const Ndarray *ndarray) { uint64_t *data_ptr{nullptr}; if (arch_is_cpu(compile_config().arch) || - compile_config().arch == Arch::cuda) { + compile_config().arch == Arch::cuda || + compile_config().arch == Arch::amdgpu) { // For the LLVM backends, device allocation is a physical pointer. data_ptr = program_impl_->get_ndarray_alloc_info_ptr(ndarray->ndarray_alloc_); diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index 3a6c2d90360cd..65bc95777b781 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -24,6 +24,11 @@ #include "taichi/rhi/cuda/cuda_driver.h" #endif +#include "taichi/platform/amdgpu/detect_amdgpu.h" +#if defined(TI_WITH_AMDGPU) +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#endif + #ifdef TI_WITH_VULKAN #include "taichi/rhi/vulkan/vulkan_loader.h" #endif @@ -144,6 +149,7 @@ void export_misc(py::module &m) { m.def("pop_python_print_buffer", []() { return py_cout.pop_content(); }); m.def("toggle_python_print_buffer", [](bool opt) { py_cout.enabled = opt; }); m.def("with_cuda", is_cuda_api_available); + m.def("with_amdgpu", is_rocm_api_available); #ifdef TI_WITH_METAL m.def("with_metal", taichi::lang::metal::is_metal_api_available); #else diff --git a/taichi/rhi/arch.cpp b/taichi/rhi/arch.cpp index a441b9345f08f..7e5beead84bb9 100644 --- a/taichi/rhi/arch.cpp +++ b/taichi/rhi/arch.cpp @@ -49,7 +49,7 @@ bool arch_is_cuda(Arch arch) { bool arch_uses_llvm(Arch arch) { return (arch == Arch::x64 || arch == Arch::arm64 || arch == Arch::cuda || - arch == Arch::dx12 || arch == Arch::wasm); + arch == Arch::dx12 || arch == Arch::wasm || arch == Arch::amdgpu); } bool arch_is_gpu(Arch arch) { diff --git a/taichi/rhi/interop/CMakeLists.txt b/taichi/rhi/interop/CMakeLists.txt index 9f73e9761f80b..b88bca8b0fe1e 100644 --- a/taichi/rhi/interop/CMakeLists.txt +++ b/taichi/rhi/interop/CMakeLists.txt @@ -12,6 +12,10 @@ if (TI_WITH_CUDA) target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_CUDA) endif() +if (TI_WITH_AMDGPU) + target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_AMDGPU) +endif() + if (TI_WITH_VULKAN) target_compile_definitions(${INTEROP_RHI} PRIVATE -DTI_WITH_VULKAN) endif() diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 8318e00194c68..14f103bea3e6a 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -105,6 +105,16 @@ TaichiLLVMContext::TaichiLLVMContext(const CompileConfig &config, Arch arch) LLVMInitializeDirectXTargetMC(); LLVMInitializeDirectXTargetInfo(); LLVMInitializeDirectXAsmPrinter(); +#endif + } else if (arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUAsmPrinter(); + LLVMInitializeAMDGPUAsmParser(); +#else + TI_NOT_IMPLEMENTED #endif } else { #if defined(TI_WITH_CUDA) @@ -803,6 +813,10 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func, } } +void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) { + func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); +} + void TaichiLLVMContext::eliminate_unused_functions( llvm::Module *module, std::function export_indicator) { diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h index 76ef78812e794..8190c66cd8bfa 100644 --- a/taichi/runtime/llvm/llvm_context.h +++ b/taichi/runtime/llvm/llvm_context.h @@ -125,6 +125,8 @@ class TaichiLLVMContext { void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0); + void mark_function_as_amdgpu_kernel(llvm::Function *func); + void fetch_this_thread_struct_module(); llvm::Module *get_this_thread_runtime_module(); llvm::Function *get_runtime_function(const std::string &name); diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 2e6c7e6763683..644c329c7eb7a 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -11,9 +11,10 @@ #include "taichi/rhi/cuda/cuda_context.h" #endif -#if defined(TI_WITH_AMDGPU) +#include "taichi/platform/amdgpu/detect_amdgpu.h" #include "taichi/rhi/amdgpu/amdgpu_driver.h" #include "taichi/rhi/amdgpu/amdgpu_device.h" +#if defined(TI_WITH_AMDGPU) #include "taichi/rhi/amdgpu/amdgpu_context.h" #endif @@ -52,6 +53,21 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, if (config.arch != Arch::cuda) { TI_WARN("Falling back to {}.", arch_name(host_arch())); } + } else if (config.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + if (!is_rocm_api_available()) { + TI_WARN("No AMDGPU ROCm API detected."); + config.arch = host_arch(); + } else if (!AMDGPUContext::get_instance().detected()) { + TI_WARN("No AMDGPU device detected."); + config.arch = host_arch(); + } else { + // AMDGPU runtime created successfully + } +#else + TI_WARN("Taichi is not compiled with AMDGPU."); + config.arch = host_arch(); +#endif } if (config.kernel_profiler) { @@ -108,6 +124,22 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, #if defined(TI_WITH_AMDGPU) else if (config.arch == Arch::amdgpu) { + int num_workgroups{1}; + AMDGPUDriver::get_instance().device_get_attribute( + &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); + int query_max_block_dim{1024}; + AMDGPUDriver::get_instance().device_get_attribute( + &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0); + // magic number 32 + // I didn't find the relevant parameter to limit the max block num per CU + // So .... + int query_max_block_per_cu{32}; + if (config.max_block_dim == 0) { + config.max_block_dim = query_max_block_dim; + } + if (config.saturating_grid_dim == 0) { + config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2; + } AMDGPUContext::get_instance().set_debug(config.debug); device_ = std::make_shared(); } @@ -162,6 +194,12 @@ void LlvmRuntimeExecutor::synchronize() { CUDADriver::get_instance().stream_synchronize(nullptr); #else TI_ERROR("No CUDA support"); +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + AMDGPUDriver::get_instance().stream_synchronize(nullptr); +#else + TI_ERROR("No AMDGPU support"); #endif } fflush(stdout); @@ -178,6 +216,13 @@ uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) { sizeof(uint64)); #else TI_NOT_IMPLEMENTED; +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i, + sizeof(uint64)); +#else + TI_NOT_IMPLEMENTED; #endif } else { ret = result_buffer[i]; @@ -341,6 +386,12 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); #else TI_NOT_IMPLEMENTED +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size); +#else + TI_NOT_IMPLEMENTED; #endif } else { std::memset(root_buffer, 0, rounded_size); @@ -353,6 +404,12 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( alloc = cuda_device()->import_memory(root_buffer, rounded_size); #else TI_NOT_IMPLEMENTED +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + alloc = amdgpu_device()->import_memory(root_buffer, rounded_size); +#else + TI_NOT_IMPLEMENTED #endif } else { alloc = cpu_device()->import_memory(root_buffer, rounded_size); @@ -406,6 +463,13 @@ cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() { return static_cast(device_.get()); } +amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() { + if (config_.arch != Arch::amdgpu) { + TI_ERROR("arch is not amdgpu"); + } + return static_cast(device_.get()); +} + cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() { TI_ERROR_IF(!arch_is_cpu(config_.arch), "arch is not cpu"); return static_cast(device_.get()); @@ -441,6 +505,12 @@ void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc, CUDADriver::get_instance().memsetd32((void *)ptr, data, size); #else TI_NOT_IMPLEMENTED +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + AMDGPUDriver::get_instance().memset((void *)ptr, data, size); +#else + TI_NOT_IMPLEMENTED; #endif } else { std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data); @@ -454,6 +524,12 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr( return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr; #else TI_NOT_IMPLEMENTED +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr; +#else + TI_NOT_IMPLEMENTED #endif } else { return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr; @@ -462,11 +538,17 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr( void LlvmRuntimeExecutor::finalize() { profiler_ = nullptr; -#if defined(TI_WITH_CUDA) if (preallocated_device_buffer_ != nullptr) { - cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_); - } + if (config_.arch == Arch::cuda) { +#if defined(TI_WITH_CUDA) + cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_); #endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_); +#endif + } + } } void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool, @@ -502,6 +584,36 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool, prealloc_size); #else TI_NOT_IMPLEMENTED +#endif + } else if (config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + AMDGPUDriver::get_instance().malloc( + (void **)result_buffer_ptr, + sizeof(uint64) * taichi_result_buffer_entries); + const auto total_mem = AMDGPUContext::get_instance().get_total_memory(); + if (config_.device_memory_fraction == 0) { + TI_ASSERT(config_.device_memory_GB > 0); + prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); + } else { + prealloc_size = std::size_t(config_.device_memory_fraction * total_mem); + } + TI_ASSERT(prealloc_size <= total_mem); + + TI_TRACE("Allocating device memory {:.2f} GB", + 1.0 * prealloc_size / (1UL << 30)); + + Device::AllocParams preallocated_device_buffer_alloc_params; + preallocated_device_buffer_alloc_params.size = prealloc_size; + preallocated_device_buffer_alloc_ = amdgpu_device()->allocate_memory( + preallocated_device_buffer_alloc_params); + amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info = + amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_); + preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr; + + AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0, + prealloc_size); +#else + TI_NOT_IMPLEMENTED #endif } else { *result_buffer_ptr = (uint64 *)memory_pool->allocate( @@ -517,8 +629,8 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool, // Number of random states. One per CPU/CUDA thread. int num_rand_states = 0; - if (config_.arch == Arch::cuda) { -#if defined(TI_WITH_CUDA) + if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { +#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU) // It is important to make sure that every CUDA thread has its own random // state so that we do not need expensive per-state locks. num_rand_states = config_.saturating_grid_dim * config_.max_block_dim; diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h index 58ebaf1bd68ae..5e3e7cf8afcdc 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.h +++ b/taichi/runtime/llvm/llvm_runtime_executor.h @@ -25,6 +25,10 @@ namespace cuda { class CudaDevice; } // namespace cuda +namespace amdgpu { +class AmdgpuDevice; +} // namespace amdgpu + namespace cpu { class CpuDevice; } // namespace cpu @@ -113,6 +117,7 @@ class LlvmRuntimeExecutor { /* -------------------------- */ cuda::CudaDevice *cuda_device(); cpu::CpuDevice *cpu_device(); + amdgpu::AmdgpuDevice *amdgpu_device(); void finalize(); diff --git a/taichi/runtime/llvm/runtime_module/CMakeLists.txt b/taichi/runtime/llvm/runtime_module/CMakeLists.txt index 0edac71b82c7c..92303abcc675b 100644 --- a/taichi/runtime/llvm/runtime_module/CMakeLists.txt +++ b/taichi/runtime/llvm/runtime_module/CMakeLists.txt @@ -14,6 +14,6 @@ function(COMPILE_LLVM_RUNTIME rtm_arch) endfunction() # Build llvm-runtime for host arch and cuda (if available) -foreach(arch IN LISTS HOST_ARCH CUDA_ARCH DX12_ARCH) +foreach(arch IN LISTS HOST_ARCH CUDA_ARCH DX12_ARCH AMDGPU_ARCH) compile_llvm_runtime(${arch}) endforeach() diff --git a/taichi/runtime/program_impls/llvm/CMakeLists.txt b/taichi/runtime/program_impls/llvm/CMakeLists.txt index ab4a73a9d30bb..0b63c2d618ae2 100644 --- a/taichi/runtime/program_impls/llvm/CMakeLists.txt +++ b/taichi/runtime/program_impls/llvm/CMakeLists.txt @@ -20,3 +20,7 @@ target_link_libraries(llvm_program_impl PRIVATE cpu_runtime) if (TI_WITH_CUDA) target_link_libraries(llvm_program_impl PRIVATE cuda_runtime) endif() + +if (TI_WITH_AMDGPU) + target_link_libraries(llvm_program_impl PRIVATE amdgpu_runtime) +endif() diff --git a/taichi/runtime/program_impls/llvm/llvm_program.cpp b/taichi/runtime/program_impls/llvm/llvm_program.cpp index 14273b4e62268..a2395954e3420 100644 --- a/taichi/runtime/program_impls/llvm/llvm_program.cpp +++ b/taichi/runtime/program_impls/llvm/llvm_program.cpp @@ -15,6 +15,10 @@ #include "taichi/codegen/cuda/codegen_cuda.h" #endif +#if defined(TI_WITH_AMDGPU) +#include "taichi/codegen/amdgpu/codegen_amdgpu.h" +#endif + #if defined(TI_WITH_DX12) #include "taichi/runtime/dx12/aot_module_builder_impl.h" #include "taichi/codegen/dx12/codegen_dx12.h" diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h index 600a134cc844a..75f83c1576437 100644 --- a/taichi/runtime/program_impls/llvm/llvm_program.h +++ b/taichi/runtime/program_impls/llvm/llvm_program.h @@ -26,6 +26,10 @@ namespace cuda { class CudaDevice; } // namespace cuda +namespace amdgpu { +class AmdgpuDevice; +} // namespace amdgpu + namespace cpu { class CpuDevice; } // namespace cpu diff --git a/taichi/system/memory_pool.h b/taichi/system/memory_pool.h index 06dce540d3ad5..63fa6c0dcd8b4 100644 --- a/taichi/system/memory_pool.h +++ b/taichi/system/memory_pool.h @@ -27,6 +27,7 @@ class TI_DLL_EXPORT MemoryPool { MemRequestQueue *queue; void *cuda_stream{nullptr}; + void *amdgpu_stream{nullptr}; // In the future we wish to move the MemoryPool inside each Device // so that the memory allocated from each Device can be used as-is. @@ -50,6 +51,7 @@ class TI_DLL_EXPORT MemoryPool { private: static constexpr bool use_cuda_stream = false; + static constexpr bool use_amdgpu_stream = false; Arch arch_; Device *device_; }; diff --git a/taichi/transforms/ast_to_ir.cpp b/taichi/transforms/ast_to_ir.cpp index da40e8d665d82..d7cce13a19661 100644 --- a/taichi/transforms/ast_to_ir.cpp +++ b/taichi/transforms/ast_to_ir.cpp @@ -7,7 +7,7 @@ namespace irpass { static bool supports_lowering(Arch arch) { return arch_is_cpu(arch) || (arch == Arch::cuda) || (arch == Arch::dx12) || - (arch == Arch::metal); + (arch == Arch::metal) || (arch == Arch::amdgpu); } void ast_to_ir(const CompileConfig &config, diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index cabcab019e305..33772f6314bd4 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -94,7 +94,8 @@ void compile_to_offloads(IRNode *ir, // TODO: strictly enforce bit vectorization for x86 cpu and CUDA now // create a separate CompileConfig flag for the new pass - if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) { + if (arch_is_cpu(config.arch) || config.arch == Arch::cuda || + config.arch == Arch::amdgpu) { irpass::bit_loop_vectorize(ir); irpass::type_check(ir, config); print("Bit Loop Vectorized"); diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 6fdf9fec85a95..0c2df0dd1496e 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -62,28 +62,28 @@ def _get_expected_matrix_apis(): 'TaichiAssertionError', 'TaichiCompilationError', 'TaichiNameError', 'TaichiRuntimeError', 'TaichiRuntimeTypeError', 'TaichiSyntaxError', 'TaichiTypeError', 'Texture', 'Vector', 'VectorNdarray', 'WARN', 'abs', - 'acos', 'activate', 'ad', 'algorithms', 'aot', 'append', 'arm64', 'asin', - 'assume_in_range', 'atan2', 'atomic_add', 'atomic_and', 'atomic_max', - 'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', 'axes', 'bit_cast', - 'bit_shr', 'block_local', 'cache_read_only', 'cast', 'cc', 'ceil', 'cos', - 'cpu', 'cuda', 'data_oriented', 'dataclass', 'deactivate', - 'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp', 'experimental', - 'extension', 'f16', 'f32', 'f64', 'field', 'float16', 'float32', 'float64', - 'floor', 'func', 'get_addr', 'get_compute_stream_device_time_elapsed_us', - 'gles', 'global_thread_idx', 'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i', - 'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', - 'init', 'int16', 'int32', 'int64', 'int8', 'is_active', - 'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l', - 'lang', 'length', 'linalg', 'log', 'loop_config', 'math', 'max', - 'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange', - 'no_activate', 'one', 'opengl', 'polar_decompose', 'pow', 'profiler', - 'randn', 'random', 'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset', - 'rgb_to_hex', 'root', 'round', 'rsqrt', 'select', 'set_logging_level', - 'simt', 'sin', 'solve', 'sparse_matrix_builder', 'sqrt', 'static', - 'static_assert', 'static_print', 'stop_grad', 'svd', 'sym_eig', 'sync', - 'tan', 'tanh', 'template', 'tools', 'types', 'u16', 'u32', 'u64', 'u8', - 'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan', 'wasm', 'x64', - 'x86_64', 'zero' + 'acos', 'activate', 'ad', 'algorithms', 'amdgpu', 'aot', 'append', 'arm64', + 'asin', 'assume_in_range', 'atan2', 'atomic_add', 'atomic_and', + 'atomic_max', 'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', + 'axes', 'bit_cast', 'bit_shr', 'block_local', 'cache_read_only', 'cast', + 'cc', 'ceil', 'cos', 'cpu', 'cuda', 'data_oriented', 'dataclass', + 'deactivate', 'deactivate_all_snodes', 'dx11', 'dx12', 'eig', 'exp', + 'experimental', 'extension', 'f16', 'f32', 'f64', 'field', 'float16', + 'float32', 'float64', 'floor', 'func', 'get_addr', + 'get_compute_stream_device_time_elapsed_us', 'gles', 'global_thread_idx', + 'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8', + 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init', 'int16', 'int32', + 'int64', 'int8', 'is_active', 'is_logging_effective', 'j', 'jk', 'jkl', + 'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length', 'linalg', 'log', + 'loop_config', 'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal', + 'min', 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl', + 'polar_decompose', 'pow', 'profiler', 'randn', 'random', 'raw_div', + 'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round', + 'rsqrt', 'select', 'set_logging_level', 'simt', 'sin', 'solve', + 'sparse_matrix_builder', 'sqrt', 'static', 'static_assert', 'static_print', + 'stop_grad', 'svd', 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools', + 'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64', + 'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero' ] user_api[ti.ad] = [ 'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',