Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

amdgpu fix ut #17

Merged
merged 6 commits into from
Feb 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/scripts/unix_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ if [ -z "$GPU_TEST" ]; then
fi
elif [ ! -z "$AMDGPU_TEST" ]; then
run-it cpu $(nproc)
# run-it amdgpu 4
run-it amdgpu 8
else
run-it cuda 8
run-it cpu $(nproc)
Expand Down
2 changes: 2 additions & 0 deletions c_api/include/taichi/taichi_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,8 @@ typedef enum TiArch {
TI_ARCH_OPENGL = 6,
// OpenGL ES GPU backend.
TI_ARCH_GLES = 7,
// AMDGPU backend
TI_ARCH_AMDGPU = 8,
TI_ARCH_MAX_ENUM = 0xffffffff,
} TiArch;

Expand Down
5 changes: 3 additions & 2 deletions cmake/TaichiCore.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ file(GLOB TAICHI_CORE_SOURCE
"taichi/system/*"
"taichi/transforms/*"
"taichi/aot/*.cpp" "taichi/aot/*.h"
"taichi/platform/cuda/*" "taichi/platform/mac/*" "taichi/platform/windows/*"
"taichi/platform/cuda/*" "taichi/platform/amdgpu/*"
"taichi/platform/mac/*" "taichi/platform/windows/*"
"taichi/codegen/*.cpp" "taichi/codegen/*.h"
"taichi/runtime/*.h" "taichi/runtime/*.cpp"
"taichi/rhi/*.h" "taichi/rhi/*.cpp"
Expand All @@ -116,7 +117,7 @@ endif()

if (TI_WITH_AMDGPU)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
list(APPEND TAIHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
endif()

Expand Down
18 changes: 12 additions & 6 deletions python/taichi/lang/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@
"""
# ----------------------

amdgpu = _ti_core.amdgpu
"""The AMDGPU backend.
"""
# ----------------------

metal = _ti_core.metal
"""The Apple Metal backend.
"""
Expand Down Expand Up @@ -159,9 +164,9 @@
"""
# ----------------------

gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles]
gpu = [cuda, metal, vulkan, opengl, dx11, dx12, gles, amdgpu]
"""A list of GPU backends supported on the current system.
Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles'.
Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12', 'gles', 'amdgpu'.

When this is used, Taichi automatically picks the matching GPU backend. If no
GPU is detected, Taichi falls back to the CPU backend.
Expand Down Expand Up @@ -726,6 +731,7 @@ def is_arch_supported(arch):

arch_table = {
cuda: _ti_core.with_cuda,
amdgpu: _ti_core.with_amdgpu,
metal: _ti_core.with_metal,
opengl: functools.partial(_ti_core.with_opengl, False),
gles: functools.partial(_ti_core.with_opengl, True),
Expand Down Expand Up @@ -773,8 +779,8 @@ def get_compute_stream_device_time_elapsed_us() -> float:
__all__ = [
'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl',
'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc',
'cpu', 'cuda', 'gles', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local',
'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset',
'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
'cpu', 'cuda', 'amdgpu', 'gles', 'gpu', 'metal', 'opengl', 'vulkan',
'extension', 'loop_config', 'global_thread_idx', 'assume_in_range',
'block_local', 'cache_read_only', 'init', 'mesh_local', 'no_activate',
'reset', 'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
]
88 changes: 80 additions & 8 deletions taichi/codegen/amdgpu/codegen_amdgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
} else {
TI_NOT_IMPLEMENTED
}
} else if (op == UnaryOpType::sgn) {
} // TODO simplify the impl of sgn
else if (op == UnaryOpType::sgn) {
if (input_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
auto ashr = builder->CreateAShr(input, 31);
auto sub = builder->CreateSub(0, input);
Expand Down Expand Up @@ -141,6 +142,57 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
builder->SetInsertPoint(bb_merge);
llvm_val[stmt] =
builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast);
} else if (input_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
auto func = builder->GetInsertBlock()->getParent();
auto bb_oeq_then = BasicBlock::Create(*llvm_context, "oeq_then", func);
auto bb_oeq_else = BasicBlock::Create(*llvm_context, "oeq_else");
auto bb_merge = BasicBlock::Create(*llvm_context, "merge");
auto bb_olt_then = BasicBlock::Create(*llvm_context, "olt_then", func);
auto bb_olt_else = BasicBlock::Create(*llvm_context, "olt_else");

auto alloc = builder->CreateAlloca(
llvm::Type::getDoubleTy(*llvm_context), (unsigned)5);
auto newty = llvm::PointerType::get(
llvm::Type::getDoubleTy(*llvm_context), (unsigned)0);
auto cast = builder->CreateAddrSpaceCast(alloc, newty);
auto fcmp_oeq = builder->CreateFCmpOEQ(
input,
llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
builder->CreateCondBr(fcmp_oeq, bb_oeq_then, bb_oeq_else);
builder->SetInsertPoint(bb_oeq_then);
builder->CreateStore(
llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0),
cast);
builder->CreateBr(bb_merge);
bb_oeq_then = builder->GetInsertBlock();

func->getBasicBlockList().push_back(bb_oeq_else);
builder->SetInsertPoint(bb_oeq_else);
auto fcmp_olt = builder->CreateFCmpOLT(
input,
llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
builder->CreateCondBr(fcmp_olt, bb_olt_then, bb_olt_else);
bb_oeq_else = builder->GetInsertBlock();

builder->SetInsertPoint(bb_olt_then);
builder->CreateStore(
llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), -1),
cast);
builder->CreateBr(bb_merge);
bb_olt_then = builder->GetInsertBlock();

func->getBasicBlockList().push_back(bb_olt_else);
builder->SetInsertPoint(bb_olt_else);
builder->CreateStore(
llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1),
cast);
builder->CreateBr(bb_merge);
bb_olt_else = builder->GetInsertBlock();

func->getBasicBlockList().push_back(bb_merge);
builder->SetInsertPoint(bb_merge);
llvm_val[stmt] =
builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast);
}
}
UNARY_STD(cos)
Expand Down Expand Up @@ -265,12 +317,32 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
}

void visit(GlobalLoadStmt *stmt) override {
if (auto get_ch = stmt->src->cast<GetChStmt>()) {
bool should_cache_as_read_only = current_offload->mem_access_opt.has_flag(
get_ch->output_snode, SNodeAccessFlag::read_only);
create_global_load(stmt, should_cache_as_read_only);
auto ptr = llvm_val[stmt->src];
auto ptr_type = stmt->src->ret_type->as<PointerType>();
if (ptr_type->is_bit_pointer()) {
auto val_type = ptr_type->get_pointee_type();
auto get_ch = stmt->src->as<GetChStmt>();
auto physical_type =
tlctx->get_data_type(get_ch->input_snode->physical_type);
auto [byte_ptr, bit_offset] = load_bit_ptr(ptr);
auto physical_value = builder->CreateLoad(physical_type, byte_ptr);
if (auto qit = val_type->cast<QuantIntType>()) {
llvm_val[stmt] = extract_quant_int(physical_value, bit_offset, qit);
} else if (auto qfxt = val_type->cast<QuantFixedType>()) {
qit = qfxt->get_digits_type()->as<QuantIntType>();
auto digits = extract_quant_int(physical_value, bit_offset, qit);
llvm_val[stmt] = reconstruct_quant_fixed(digits, qfxt);
} else {
TI_ASSERT(val_type->is<QuantFloatType>());
TI_ASSERT(get_ch->input_snode->dt->is<BitStructType>());
llvm_val[stmt] = extract_quant_float(
physical_value, get_ch->input_snode->dt->as<BitStructType>(),
get_ch->output_snode->id_in_bit_struct);
}
} else {
create_global_load(stmt, false);
// Byte pointer case.
llvm_val[stmt] =
builder->CreateLoad(tlctx->get_data_type(stmt->ret_type), ptr);
}
}

Expand Down Expand Up @@ -370,7 +442,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
llvm_val[stmt] = call("__ocml_pow_f16", {lhs, rhs});
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
llvm_val[stmt] = call("__ocml_pow_f32", {lhs, rhs});
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
llvm_val[stmt] = call("__ocml_pow_f64", {lhs, rhs});
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
auto sitofp_lhs_ =
Expand All @@ -388,7 +460,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
llvm_val[stmt] = call("__ocml_atan2_f16", {lhs, rhs});
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
llvm_val[stmt] = call("__ocml_atan2_f32", {lhs, rhs});
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
} else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
llvm_val[stmt] = call("__ocml_atan2_f64", {lhs, rhs});
} else {
TI_NOT_IMPLEMENTED
Expand Down
9 changes: 9 additions & 0 deletions taichi/codegen/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
#if defined(TI_WITH_DX12)
#include "taichi/codegen/dx12/codegen_dx12.h"
#endif
#if defined(TI_WITH_AMDGPU)
#include "taichi/codegen/amdgpu/codegen_amdgpu.h"
#endif
#include "taichi/system/timer.h"
#include "taichi/ir/analysis.h"
#include "taichi/ir/transforms.h"
Expand Down Expand Up @@ -47,6 +50,12 @@ std::unique_ptr<KernelCodeGen> KernelCodeGen::create(
return std::make_unique<KernelCodeGenDX12>(compile_config, kernel);
#else
TI_NOT_IMPLEMENTED
#endif
} else if (arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
return std::make_unique<KernelCodeGenAMDGPU>(compile_config, kernel);
#else
TI_NOT_IMPLEMENTED
#endif
} else {
TI_NOT_IMPLEMENTED
Expand Down
6 changes: 6 additions & 0 deletions taichi/codegen/llvm/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2634,6 +2634,12 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
TI_ASSERT(func);
tlctx->mark_function_as_cuda_kernel(func, task.block_dim);
}
} else if (compile_config.arch == Arch::amdgpu) {
for (const auto &task : offloaded_tasks) {
llvm::Function *func = module->getFunction(task.name);
TI_ASSERT(func);
tlctx->mark_function_as_amdgpu_kernel(func);
}
}

return {std::move(offloaded_tasks), std::move(module),
Expand Down
2 changes: 1 addition & 1 deletion taichi/inc/archs.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ PER_ARCH(opengl) // OpenGL Compute Shaders
PER_ARCH(dx11) // Microsoft DirectX 11, WIP
PER_ARCH(dx12) // Microsoft DirectX 12, WIP
PER_ARCH(opencl) // OpenCL, N/A
PER_ARCH(amdgpu) // AMD GPU, WIP
PER_ARCH(amdgpu) // AMD GPU
PER_ARCH(vulkan) // Vulkan
PER_ARCH(gles) // OpenGL ES
7 changes: 4 additions & 3 deletions taichi/ir/frontend_ir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) {
strictly_serialized = config.strictly_serialized;
mem_access_opt = config.mem_access_opt;
block_dim = config.block_dim;
if (arch == Arch::cuda) {
if (arch == Arch::cuda || arch == Arch::amdgpu) {
num_cpu_threads = 1;
TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
} else { // cpu
Expand Down Expand Up @@ -1284,8 +1284,9 @@ void ASTBuilder::insert_for(const Expr &s,

Expr ASTBuilder::insert_thread_idx_expr() {
auto loop = stack_.size() ? stack_.back()->parent_stmt : nullptr;
TI_ERROR_IF(arch_ != Arch::cuda && !arch_is_cpu(arch_),
"ti.thread_idx() is only available in cuda or cpu context.");
TI_ERROR_IF(
arch_ != Arch::cuda && !arch_is_cpu(arch_) && arch_ != Arch::amdgpu,
"ti.thread_idx() is only available in cuda or cpu or amdgpu context.");
if (loop != nullptr) {
auto i = stack_.size() - 1;
while (!(loop->is<FrontendForStmt>())) {
Expand Down
2 changes: 1 addition & 1 deletion taichi/ir/frontend_ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ class ASTBuilder {
}

void block_dim(int v) {
if (arch_ == Arch::cuda || arch_ == Arch::vulkan) {
if (arch_ == Arch::cuda || arch_ == Arch::vulkan || arch_ == Arch::amdgpu) {
TI_ASSERT((v % 32 == 0) || bit::is_power_of_two(v));
} else {
TI_ASSERT(bit::is_power_of_two(v));
Expand Down
11 changes: 11 additions & 0 deletions taichi/jit/jit_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
TaichiLLVMContext *tlctx,
const CompileConfig &config,
Arch arch);

std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
TaichiLLVMContext *tlctx,
const CompileConfig &config,
Arch arch);
#endif

JITSession::JITSession(TaichiLLVMContext *tlctx, const CompileConfig &config)
Expand All @@ -40,6 +45,12 @@ std::unique_ptr<JITSession> JITSession::create(TaichiLLVMContext *tlctx,
return create_llvm_jit_session_cpu(tlctx, config, Arch::x64);
#else
TI_NOT_IMPLEMENTED
#endif
} else if (arch == Arch::amdgpu) {
#ifdef TI_WITH_AMDGPU
return create_llvm_jit_session_amdgpu(tlctx, config, arch);
#else
TI_NOT_IMPLEMENTED
#endif
}
#else
Expand Down
17 changes: 17 additions & 0 deletions taichi/platform/amdgpu/detect_amdgpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "taichi/platform/amdgpu/detect_amdgpu.h"

#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#endif

namespace taichi {

bool is_rocm_api_available() {
#if defined(TI_WITH_AMDGPU)
return lang::AMDGPUDriver::get_instance_without_context().detected();
#else
return false;
#endif
}

} // namespace taichi
5 changes: 5 additions & 0 deletions taichi/platform/amdgpu/detect_amdgpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

namespace taichi {
bool is_rocm_api_available();
} // namespace taichi
2 changes: 1 addition & 1 deletion taichi/program/compile_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ CompileConfig::CompileConfig() {
print_kernel_nvptx = false;
print_kernel_llvm_ir_optimized = false;

// CUDA backend options:
// CUDA/AMDGPU backend options:
device_memory_GB = 1; // by default, preallocate 1 GB GPU memory
device_memory_fraction = 0.0;

Expand Down
2 changes: 1 addition & 1 deletion taichi/program/compile_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ struct CompileConfig {
bool print_kernel_llvm_ir_optimized;
bool print_kernel_nvptx;

// CUDA backend options:
// CUDA/AMDGPU backend options:
float64 device_memory_GB;
float64 device_memory_fraction;

Expand Down
1 change: 1 addition & 0 deletions taichi/program/extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
{Extension::sparse, Extension::quant, Extension::quant_basic,
Extension::data64, Extension::adstack, Extension::bls,
Extension::assertion, Extension::mesh}},
{Arch::amdgpu, {Extension::assertion}},
{Arch::metal, {}},
{Arch::opengl, {Extension::extfunc}},
{Arch::gles, {}},
Expand Down
3 changes: 2 additions & 1 deletion taichi/program/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ void Kernel::operator()(const CompileConfig &compile_config,
compiled_(ctx_builder.get_context());

const auto arch = compile_config.arch;
if (compile_config.debug && (arch_is_cpu(arch) || arch == Arch::cuda)) {
if (compile_config.debug &&
(arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu)) {
program->check_runtime_error();
}
}
Expand Down
6 changes: 4 additions & 2 deletions taichi/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "taichi/ir/frontend_ir.h"
#include "taichi/program/snode_expr_utils.h"
#include "taichi/math/arithmetic.h"

#ifdef TI_WITH_LLVM
#include "taichi/runtime/program_impls/llvm/llvm_program.h"
#include "taichi/codegen/llvm/struct_llvm.h"
Expand Down Expand Up @@ -349,7 +350,7 @@ Ndarray *Program::create_ndarray(const DataType type,
auto arr = std::make_unique<Ndarray>(this, type, shape, layout);
if (zero_fill) {
Arch arch = compile_config().arch;
if (arch_is_cpu(arch) || arch == Arch::cuda) {
if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) {
fill_ndarray_fast_u32(arr.get(), /*data=*/0);
} else if (arch != Arch::dx12) {
// Device api support for dx12 backend are not complete yet
Expand Down Expand Up @@ -408,7 +409,8 @@ Texture *Program::create_texture(const DataType type,
intptr_t Program::get_ndarray_data_ptr_as_int(const Ndarray *ndarray) {
uint64_t *data_ptr{nullptr};
if (arch_is_cpu(compile_config().arch) ||
compile_config().arch == Arch::cuda) {
compile_config().arch == Arch::cuda ||
compile_config().arch == Arch::amdgpu) {
// For the LLVM backends, device allocation is a physical pointer.
data_ptr =
program_impl_->get_ndarray_alloc_info_ptr(ndarray->ndarray_alloc_);
Expand Down
Loading