diff --git a/c_api/src/taichi_llvm_impl.cpp b/c_api/src/taichi_llvm_impl.cpp
index fd29a98fc0a21..0bd19fe1a6516 100644
--- a/c_api/src/taichi_llvm_impl.cpp
+++ b/c_api/src/taichi_llvm_impl.cpp
@@ -55,8 +55,7 @@ taichi::lang::Device &LlvmRuntime::get() {
 TiMemory LlvmRuntime::allocate_memory(
     const taichi::lang::Device::AllocParams &params) {
   const taichi::lang::CompileConfig &config = executor_->get_config();
-  taichi::lang::TaichiLLVMContext *tlctx =
-      executor_->get_llvm_context(config.arch);
+  taichi::lang::TaichiLLVMContext *tlctx = executor_->get_llvm_context();
   taichi::lang::LLVMRuntime *llvm_runtime = executor_->get_llvm_runtime();
   taichi::lang::LlvmDevice *llvm_device = executor_->llvm_device();
 
diff --git a/taichi/runtime/cpu/aot_module_loader_impl.cpp b/taichi/runtime/cpu/aot_module_loader_impl.cpp
index dfea684a4f2ee..8fa1b8ff34fb1 100644
--- a/taichi/runtime/cpu/aot_module_loader_impl.cpp
+++ b/taichi/runtime/cpu/aot_module_loader_impl.cpp
@@ -20,7 +20,7 @@ class AotModuleImpl : public LlvmAotModule {
       LlvmOfflineCache::KernelCacheData &&loaded) override {
     Arch arch = executor_->get_config().arch;
     TI_ASSERT(arch == Arch::x64 || arch == Arch::arm64);
-    auto *tlctx = executor_->get_llvm_context(arch);
+    auto *tlctx = executor_->get_llvm_context();
 
     CPUModuleToFunctionConverter converter{tlctx, executor_};
     return converter.convert(name, loaded.args,
diff --git a/taichi/runtime/cuda/aot_module_loader_impl.cpp b/taichi/runtime/cuda/aot_module_loader_impl.cpp
index 914f0cb03ac3d..5a7f770d9451e 100644
--- a/taichi/runtime/cuda/aot_module_loader_impl.cpp
+++ b/taichi/runtime/cuda/aot_module_loader_impl.cpp
@@ -20,7 +20,7 @@ class AotModuleImpl : public LlvmAotModule {
      LlvmOfflineCache::KernelCacheData &&loaded) override {
    Arch arch = executor_->get_config().arch;
    TI_ASSERT(arch == Arch::cuda);
-    auto *tlctx = executor_->get_llvm_context(arch);
+    auto *tlctx = executor_->get_llvm_context();
 
     CUDAModuleToFunctionConverter converter{tlctx, executor_};
     return converter.convert(name, loaded.args,
diff --git a/taichi/runtime/llvm/llvm_aot_module_loader.cpp b/taichi/runtime/llvm/llvm_aot_module_loader.cpp
index c31cc8638c256..1b0927d916492 100644
--- a/taichi/runtime/llvm/llvm_aot_module_loader.cpp
+++ b/taichi/runtime/llvm/llvm_aot_module_loader.cpp
@@ -6,7 +6,7 @@ namespace taichi::lang {
 LlvmOfflineCache::KernelCacheData LlvmAotModule::load_kernel_from_cache(
     const std::string &name) {
   TI_ASSERT(cache_reader_ != nullptr);
-  auto *tlctx = executor_->get_llvm_context(executor_->get_config().arch);
+  auto *tlctx = executor_->get_llvm_context();
   LlvmOfflineCache::KernelCacheData loaded;
   auto ok = cache_reader_->get_kernel_cache(loaded, name,
                                             *tlctx->get_this_thread_context());
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 49463cf3b8d17..eefdff7769817 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -95,6 +95,12 @@ TaichiLLVMContext::TaichiLLVMContext(const CompileConfig &config, Arch arch)
     llvm::InitializeNativeTargetAsmParser();
 #endif
   } else if (arch == Arch::dx12) {
+    // FIXME: Must initialize these before initializing Arch::dx12
+    // because it uses the jit of CPU right now.
+    llvm::InitializeNativeTarget();
+    llvm::InitializeNativeTargetAsmPrinter();
+    llvm::InitializeNativeTargetAsmParser();
+    // The dx target is used elsewhere, so we need to initialize it too.
 #if defined(TI_WITH_DX12)
     LLVMInitializeDirectXTarget();
     LLVMInitializeDirectXTargetMC();
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index d31a9eb087ed8..2e6c7e6763683 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -54,15 +54,22 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
     }
   }
 
+  if (config.kernel_profiler) {
+    profiler_ = profiler;
+  }
+
   snode_tree_buffer_manager_ = std::make_unique<SNodeTreeBufferManager>(this);
   thread_pool_ = std::make_unique<ThreadPool>(config.cpu_max_num_threads);
   preallocated_device_buffer_ = nullptr;
   llvm_runtime_ = nullptr;
-  llvm_context_host_ = std::make_unique<TaichiLLVMContext>(config, host_arch());
-  if (config.arch == Arch::cuda) {
+  if (arch_is_cpu(config.arch)) {
+    config.max_block_dim = 1024;
+    device_ = std::make_shared<cpu::CpuDevice>();
+  }
 #if defined(TI_WITH_CUDA)
+  else if (config.arch == Arch::cuda) {
     int num_SMs{1};
     CUDADriver::get_instance().device_get_attribute(
         &num_SMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, nullptr);
@@ -89,25 +96,6 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
     }
     config.saturating_grid_dim = num_SMs * query_max_block_per_sm * 2;
   }
-#endif
-  } else if (config.arch == Arch::dx12) {
-#if defined(TI_WITH_DX12)
-    // FIXME: set value based on DX12.
-    config.max_block_dim = 1024;
-#endif
-  }
-
-  if (arch_is_cpu(config.arch)) {
-    config.max_block_dim = 1024;
-    device_ = std::make_shared<cpu::CpuDevice>();
-  }
-
-  if (config.kernel_profiler) {
-    profiler_ = profiler;
-  }
-
-#if defined(TI_WITH_CUDA)
-  if (config.arch == Arch::cuda) {
     if (config.kernel_profiler) {
       CUDAContext::get_instance().set_profiler(profiler);
     } else {
@@ -115,63 +103,34 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
     }
     CUDAContext::get_instance().set_debug(config.debug);
     device_ = std::make_shared<cuda::CudaDevice>();
-
-    this->maybe_initialize_cuda_llvm_context();
   }
 #endif
 
 #if defined(TI_WITH_AMDGPU)
-  if (config.arch == Arch::amdgpu) {
+  else if (config.arch == Arch::amdgpu) {
     AMDGPUContext::get_instance().set_debug(config.debug);
     device_ = std::make_shared<amdgpu::AmdgpuDevice>();
-
-    this->maybe_initialize_amdgpu_llvm_context();
   }
 #endif
 
 #ifdef TI_WITH_DX12
-  if (config.arch == Arch::dx12) {
+  else if (config.arch == Arch::dx12) {
     // FIXME: add dx12 device.
+    // FIXME: set value based on DX12.
+    config.max_block_dim = 1024;
     device_ = std::make_shared();
-
-    llvm_context_device_ =
-        std::make_unique<TaichiLLVMContext>(config, Arch::dx12);
-    // FIXME: add dx12 JIT.
-    llvm_context_device_->init_runtime_jit_module();
   }
 #endif
-
-  this->initialize_host();
-}
-
-TaichiLLVMContext *LlvmRuntimeExecutor::get_llvm_context(Arch arch) {
-  if (arch_is_cpu(arch)) {
-    return llvm_context_host_.get();
-  } else {
-    return llvm_context_device_.get();
+  else {
+    TI_NOT_IMPLEMENTED
   }
+  llvm_context_ = std::make_unique<TaichiLLVMContext>(
+      config_, arch_is_cpu(config.arch) ? host_arch() : config.arch);
+  llvm_context_->init_runtime_jit_module();
 }
 
-void LlvmRuntimeExecutor::initialize_host() {
-  // Note this cannot be placed inside LlvmProgramImpl constructor, see doc
-  // string for init_runtime_jit_module() for more details.
-  llvm_context_host_->init_runtime_jit_module();
-}
-
-void LlvmRuntimeExecutor::maybe_initialize_cuda_llvm_context() {
-  if (config_.arch == Arch::cuda && llvm_context_device_ == nullptr) {
-    llvm_context_device_ =
-        std::make_unique<TaichiLLVMContext>(config_, Arch::cuda);
-    llvm_context_device_->init_runtime_jit_module();
-  }
-}
-
-void LlvmRuntimeExecutor::maybe_initialize_amdgpu_llvm_context() {
-  if (config_.arch == Arch::amdgpu && llvm_context_device_ == nullptr) {
-    llvm_context_device_ =
-        std::make_unique<TaichiLLVMContext>(config_, Arch::amdgpu);
-    llvm_context_device_->init_runtime_jit_module();
-  }
+TaichiLLVMContext *LlvmRuntimeExecutor::get_llvm_context() {
+  return llvm_context_.get();
 }
 
 void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
@@ -243,13 +202,7 @@ std::size_t LlvmRuntimeExecutor::get_snode_num_dynamically_allocated(
 
 void LlvmRuntimeExecutor::check_runtime_error(uint64 *result_buffer) {
   synchronize();
-  auto tlctx = llvm_context_host_.get();
-  if (llvm_context_device_) {
-    // In case there is a standalone device context (e.g. CUDA without unified
-    // memory), use the device context instead.
-    tlctx = llvm_context_device_.get();
-  }
-  auto *runtime_jit_module = tlctx->runtime_jit_module;
+  auto *runtime_jit_module = llvm_context_->runtime_jit_module;
   runtime_jit_module->call("runtime_retrieve_and_reset_error_code",
                            llvm_runtime_);
   auto error_code =
@@ -369,18 +322,7 @@ DevicePtr LlvmRuntimeExecutor::get_snode_tree_device_ptr(int tree_id) {
 void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
     const LlvmOfflineCache::FieldCacheData &field_cache_data,
     uint64 *result_buffer) {
-  TaichiLLVMContext *tlctx = nullptr;
-  if (config_.arch == Arch::cuda) {
-#if defined(TI_WITH_CUDA)
-    tlctx = llvm_context_device_.get();
-#else
-    TI_NOT_IMPLEMENTED
-#endif
-  } else {
-    tlctx = llvm_context_host_.get();
-  }
-
-  auto *const runtime_jit = tlctx->runtime_jit_module;
+  auto *const runtime_jit = llvm_context_->runtime_jit_module;
   // By the time this creator is called, "this" is already destroyed.
   // Therefore it is necessary to capture members by values.
   size_t root_size = field_cache_data.root_size;
@@ -477,18 +419,11 @@ LlvmDevice *LlvmRuntimeExecutor::llvm_device() {
 DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
     std::size_t alloc_size,
     uint64 *result_buffer) {
-  TaichiLLVMContext *tlctx = nullptr;
-  if (llvm_context_device_) {
-    tlctx = llvm_context_device_.get();
-  } else {
-    tlctx = llvm_context_host_.get();
-  }
-
   return llvm_device()->allocate_memory_runtime(
       {{alloc_size, /*host_write=*/false, /*host_read=*/false,
        /*export_sharing=*/false, AllocUsage::Storage},
       config_.ndarray_use_cached_allocator,
-      tlctx->runtime_jit_module,
+      llvm_context_->runtime_jit_module,
       get_llvm_runtime(),
       result_buffer});
 }
@@ -538,7 +473,6 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                               KernelProfilerBase *profiler,
                                               uint64 **result_buffer_ptr) {
   std::size_t prealloc_size = 0;
-  TaichiLLVMContext *tlctx = nullptr;
   if (config_.arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
     CUDADriver::get_instance().malloc(
@@ -566,16 +500,14 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 
     CUDADriver::get_instance().memset(preallocated_device_buffer_, 0,
                                       prealloc_size);
-    tlctx = llvm_context_device_.get();
 #else
     TI_NOT_IMPLEMENTED
 #endif
   } else {
     *result_buffer_ptr = (uint64 *)memory_pool->allocate(
         sizeof(uint64) * taichi_result_buffer_entries, 8);
-    tlctx = llvm_context_host_.get();
   }
-  auto *const runtime_jit = tlctx->runtime_jit_module;
+  auto *const runtime_jit = llvm_context_->runtime_jit_module;
 
   // Starting random state for the program calculated using the random seed.
   // The seed is multiplied by 1048391 so that two programs with different seeds
@@ -656,7 +588,7 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 }
 
 void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {
-  get_llvm_context(config_.arch)->delete_snode_tree(snode_tree->id());
+  get_llvm_context()->delete_snode_tree(snode_tree->id());
   snode_tree_buffer_manager_->destroy(snode_tree);
 }
 
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
index a886cdd84a995..58ebaf1bd68ae 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.h
+++ b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -59,7 +59,7 @@ class LlvmRuntimeExecutor {
     return config_;
   }
 
-  TaichiLLVMContext *get_llvm_context(Arch arch);
+  TaichiLLVMContext *get_llvm_context();
 
   LLVMRuntime *get_llvm_runtime();
 
@@ -101,14 +101,7 @@ class LlvmRuntimeExecutor {
                   Args &&...args) {
     TI_ASSERT(arch_uses_llvm(config_.arch));
 
-    TaichiLLVMContext *tlctx = nullptr;
-    if (llvm_context_device_) {
-      tlctx = llvm_context_device_.get();
-    } else {
-      tlctx = llvm_context_host_.get();
-    }
-
-    auto runtime = tlctx->runtime_jit_module;
+    auto runtime = llvm_context_->runtime_jit_module;
     runtime->call("runtime_" + key, llvm_runtime_,
                   std::forward<Args>(args)...);
     return taichi_union_cast_with_different_sizes<T>(fetch_result_uint64(
@@ -121,17 +114,6 @@ class LlvmRuntimeExecutor {
   cuda::CudaDevice *cuda_device();
   cpu::CpuDevice *cpu_device();
 
-  void initialize_host();
-
-  /**
-   * Initializes Program#llvm_context_device, if this has not been done.
-   *
-   * Not thread safe.
-   */
-  void maybe_initialize_cuda_llvm_context();
-
-  void maybe_initialize_amdgpu_llvm_context();
-
   void finalize();
 
   uint64 fetch_result_uint64(int i, uint64 *result_buffer);
 
@@ -146,8 +128,7 @@ class LlvmRuntimeExecutor {
   //
   // TaichiLLVMContext is a thread-safe class with llvm::Module for compilation
   // and JITSession/JITModule for runtime loading & execution
-  std::unique_ptr<TaichiLLVMContext> llvm_context_host_{nullptr};
-  std::unique_ptr<TaichiLLVMContext> llvm_context_device_{nullptr};
+  std::unique_ptr<TaichiLLVMContext> llvm_context_{nullptr};
 
   void *llvm_runtime_{nullptr};
   std::unique_ptr<ThreadPool> thread_pool_{nullptr};
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.cpp b/taichi/runtime/program_impls/llvm/llvm_program.cpp
index e39e79c52c5f6..14273b4e62268 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.cpp
+++ b/taichi/runtime/program_impls/llvm/llvm_program.cpp
@@ -45,23 +45,10 @@ std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
     SNodeTree *tree) {
   auto *const root = tree->root();
   std::unique_ptr<StructCompiler> struct_compiler{nullptr};
-  if (arch_is_cpu(config->arch)) {
-    auto host_module =
-        runtime_exec_->llvm_context_host_.get()->new_module("struct");
-    struct_compiler = std::make_unique<StructCompilerLLVM>(
-        host_arch(), this, std::move(host_module), tree->id());
-  } else if (config->arch == Arch::dx12) {
-    auto device_module =
-        runtime_exec_->llvm_context_device_.get()->new_module("struct");
-    struct_compiler = std::make_unique<StructCompilerLLVM>(
-        Arch::dx12, this, std::move(device_module), tree->id());
-  } else {
-    TI_ASSERT(config->arch == Arch::cuda);
-    auto device_module =
-        runtime_exec_->llvm_context_device_.get()->new_module("struct");
-    struct_compiler = std::make_unique<StructCompilerLLVM>(
-        Arch::cuda, this, std::move(device_module), tree->id());
-  }
+  auto module = runtime_exec_->llvm_context_.get()->new_module("struct");
+  struct_compiler = std::make_unique<StructCompilerLLVM>(
+      arch_is_cpu(config->arch) ? host_arch() : config->arch, this,
+      std::move(module), tree->id());
   struct_compiler->run(*root);
   ++num_snode_trees_processed_;
   return struct_compiler;
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
index 353c6bd751943..cb498b90b13bd 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.h
+++ b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -150,18 +150,6 @@ class LlvmProgramImpl : public ProgramImpl {
         result_buffer);
   }
 
-  void initialize_host() {
-    runtime_exec_->initialize_host();
-  }
-
-  void maybe_initialize_cuda_llvm_context() {
-    runtime_exec_->maybe_initialize_cuda_llvm_context();
-  }
-
-  void maybe_initialize_amdgpu_llvm_context() {
-    runtime_exec_->maybe_initialize_amdgpu_llvm_context();
-  }
-
   uint64 fetch_result_uint64(int i, uint64 *result_buffer) override {
     return runtime_exec_->fetch_result_uint64(i, result_buffer);
   }
@@ -185,7 +173,7 @@ class LlvmProgramImpl : public ProgramImpl {
   }
 
   TaichiLLVMContext *get_llvm_context(Arch arch) {
-    return runtime_exec_->get_llvm_context(arch);
+    return runtime_exec_->get_llvm_context();
   }
 
   void synchronize() override {
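The net effect of the diff above is that LlvmRuntimeExecutor owns exactly one TaichiLLVMContext (created once in the constructor for host_arch() on CPU arches, otherwise for config.arch), so get_llvm_context() no longer takes an Arch and call sites read llvm_context_->runtime_jit_module directly instead of choosing between a host and a device context. A minimal, self-contained sketch of that ownership pattern follows; the types here (Arch, TaichiLLVMContext, the constructor argument) are simplified stand-ins for illustration, not the real Taichi declarations:

    #include <memory>

    enum class Arch { x64, arm64, cuda, amdgpu, dx12 };

    bool arch_is_cpu(Arch a) { return a == Arch::x64 || a == Arch::arm64; }
    Arch host_arch() { return Arch::x64; }  // placeholder for real host detection

    // Stand-in for taichi::lang::TaichiLLVMContext.
    struct TaichiLLVMContext {
      explicit TaichiLLVMContext(Arch arch) : arch(arch) {}
      Arch arch;
    };

    class LlvmRuntimeExecutor {
     public:
      explicit LlvmRuntimeExecutor(Arch config_arch)
          // One context serves both compilation and the runtime JIT module,
          // keyed off the configured arch exactly as in the patched constructor.
          : llvm_context_(std::make_unique<TaichiLLVMContext>(
                arch_is_cpu(config_arch) ? host_arch() : config_arch)) {}

      // No Arch parameter anymore: every caller gets the same context.
      TaichiLLVMContext *get_llvm_context() { return llvm_context_.get(); }

     private:
      std::unique_ptr<TaichiLLVMContext> llvm_context_{nullptr};
    };

    int main() {
      LlvmRuntimeExecutor exec(Arch::cuda);
      return exec.get_llvm_context()->arch == Arch::cuda ? 0 : 1;
    }

Because there is only one context, the host/device selection logic removed in check_runtime_error(), runtime_query(), and allocate_memory_ndarray() has no replacement; the single member is the sole source of the runtime JIT module.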