From 524b6fbd26fe2a0b2d3ce5ff375a326615775db7 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 21 Mar 2023 18:03:40 +0800 Subject: [PATCH] [lang] Removed cpu_device(), cuda_device(), and amdgpu_device() from LlvmRuntimeExecutor (#7544) Issue: #7300 ### Brief Summary --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/rhi/amdgpu/amdgpu_device.h | 2 +- taichi/rhi/cpu/cpu_device.h | 2 +- taichi/rhi/cuda/cuda_device.h | 2 +- taichi/rhi/llvm/llvm_device.h | 11 +++ taichi/runtime/llvm/llvm_runtime_executor.cpp | 83 ++++++------------- taichi/runtime/llvm/llvm_runtime_executor.h | 4 - .../runtime/program_impls/llvm/llvm_program.h | 8 -- 7 files changed, 41 insertions(+), 71 deletions(-) diff --git a/taichi/rhi/amdgpu/amdgpu_device.h b/taichi/rhi/amdgpu/amdgpu_device.h index ef2f257bb1225..0c540d1d700d2 100644 --- a/taichi/rhi/amdgpu/amdgpu_device.h +++ b/taichi/rhi/amdgpu/amdgpu_device.h @@ -96,7 +96,7 @@ class AmdgpuDevice : public LlvmDevice { void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) override; - DeviceAllocation import_memory(void *ptr, size_t size); + DeviceAllocation import_memory(void *ptr, size_t size) override; Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED}; diff --git a/taichi/rhi/cpu/cpu_device.h b/taichi/rhi/cpu/cpu_device.h index c0aecbc401e83..c3a03a9a6017e 100644 --- a/taichi/rhi/cpu/cpu_device.h +++ b/taichi/rhi/cpu/cpu_device.h @@ -106,7 +106,7 @@ class CpuDevice : public LlvmDevice { void unmap(DevicePtr ptr) final{TI_NOT_IMPLEMENTED}; void unmap(DeviceAllocation alloc) final; - DeviceAllocation import_memory(void *ptr, size_t size); + DeviceAllocation import_memory(void *ptr, size_t size) override; void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) override; diff --git a/taichi/rhi/cuda/cuda_device.h b/taichi/rhi/cuda/cuda_device.h index c1e0185af879b..b9a1213ead94c 100644 --- a/taichi/rhi/cuda/cuda_device.h +++ 
b/taichi/rhi/cuda/cuda_device.h @@ -123,7 +123,7 @@ class CudaDevice : public LlvmDevice { void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) override; - DeviceAllocation import_memory(void *ptr, size_t size); + DeviceAllocation import_memory(void *ptr, size_t size) override; Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED}; diff --git a/taichi/rhi/llvm/llvm_device.h b/taichi/rhi/llvm/llvm_device.h index 5995a2d36dbb0..5fa9535b64724 100644 --- a/taichi/rhi/llvm/llvm_device.h +++ b/taichi/rhi/llvm/llvm_device.h @@ -20,6 +20,17 @@ class LlvmDevice : public Device { TI_NOT_IMPLEMENTED } + template <typename DEVICE> + DEVICE *as() { + auto *device = dynamic_cast<DEVICE *>(this); + TI_ASSERT(device != nullptr); + return device; + } + + virtual DeviceAllocation import_memory(void *ptr, size_t size) { + TI_NOT_IMPLEMENTED + } + virtual DeviceAllocation allocate_memory_runtime( const LlvmRuntimeAllocParams &params) { TI_NOT_IMPLEMENTED; diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 98502570b337e..0025070933660 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -415,23 +415,8 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( std::memset(root_buffer, 0, rounded_size); } - DeviceAllocation alloc{kDeviceNullAllocation}; - - if (config_.arch == Arch::cuda) { -#if defined(TI_WITH_CUDA) - alloc = cuda_device()->import_memory(root_buffer, rounded_size); -#else - TI_NOT_IMPLEMENTED -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(TI_WITH_AMDGPU) - alloc = amdgpu_device()->import_memory(root_buffer, rounded_size); -#else - TI_NOT_IMPLEMENTED -#endif - } else { - alloc = cpu_device()->import_memory(root_buffer, rounded_size); - } + DeviceAllocation alloc = + llvm_device()->import_memory(root_buffer, rounded_size); snode_tree_allocs_[tree_id] = alloc; @@ -474,25 +459,6 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( } } 
-cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() { - if (config_.arch != Arch::cuda) { - TI_ERROR("arch is not cuda"); - } - return static_cast<cuda::CudaDevice *>(device_.get()); -} - -amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() { - if (config_.arch != Arch::amdgpu) { - TI_ERROR("arch is not amdgpu"); - } - return static_cast<amdgpu::AmdgpuDevice *>(device_.get()); -} - -cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() { - TI_ERROR_IF(!arch_is_cpu(config_.arch), "arch is not cpu"); - return static_cast<cpu::CpuDevice *>(device_.get()); -} - LlvmDevice *LlvmRuntimeExecutor::llvm_device() { TI_ASSERT(dynamic_cast<LlvmDevice *>(device_.get())); return static_cast<LlvmDevice *>(device_.get()); @@ -511,7 +477,7 @@ DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray( } void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) { - cuda_device()->dealloc_memory(handle); + llvm_device()->dealloc_memory(handle); } void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc, @@ -539,32 +505,35 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr( const DeviceAllocation &alloc) { if (config_.arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr; + return (uint64_t *)llvm_device() + ->as<cuda::CudaDevice>() + ->get_alloc_info(alloc) + .ptr; #else TI_NOT_IMPLEMENTED #endif } else if (config_.arch == Arch::amdgpu) { #if defined(TI_WITH_AMDGPU) - return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr; + return (uint64_t *)llvm_device() + ->as<amdgpu::AmdgpuDevice>() + ->get_alloc_info(alloc) + .ptr; #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED; #endif - } else { - return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr; } + + return (uint64_t *)llvm_device() + ->as<cpu::CpuDevice>() + ->get_alloc_info(alloc) + .ptr; } void LlvmRuntimeExecutor::finalize() { profiler_ = nullptr; if (preallocated_device_buffer_ != nullptr) { - if (config_.arch == Arch::cuda) { -#if defined(TI_WITH_CUDA) - cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_); -#endif - } else if (config_.arch == 
Arch::amdgpu) { -#if defined(TI_WITH_AMDGPU) - amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_); -#endif + if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { + llvm_device()->dealloc_memory(preallocated_device_buffer_alloc_); } } finalized_ = true; @@ -603,11 +572,12 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool, Device::AllocParams preallocated_device_buffer_alloc_params; preallocated_device_buffer_alloc_params.size = prealloc_size; RhiResult res = - cuda_device()->allocate_memory(preallocated_device_buffer_alloc_params, + llvm_device()->allocate_memory(preallocated_device_buffer_alloc_params, &preallocated_device_buffer_alloc_); TI_ASSERT(res == RhiResult::success); cuda::CudaDevice::AllocInfo preallocated_device_buffer_alloc_info = - cuda_device()->get_alloc_info(preallocated_device_buffer_alloc_); + llvm_device()->as<cuda::CudaDevice>()->get_alloc_info( + preallocated_device_buffer_alloc_); preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr; CUDADriver::get_instance().memset(preallocated_device_buffer_, 0, @@ -636,12 +606,13 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool, Device::AllocParams preallocated_device_buffer_alloc_params; preallocated_device_buffer_alloc_params.size = prealloc_size; - RhiResult res = amdgpu_device()->allocate_memory( - preallocated_device_buffer_alloc_params, - &preallocated_device_buffer_alloc_); + RhiResult res = + llvm_device()->allocate_memory(preallocated_device_buffer_alloc_params, + &preallocated_device_buffer_alloc_); TI_ASSERT(res == RhiResult::success); amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info = - amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_); + llvm_device()->as<amdgpu::AmdgpuDevice>()->get_alloc_info( + preallocated_device_buffer_alloc_); preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr; AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0, diff --git 
a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h index 501c8428cc98a..a25e06c3efcc2 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.h +++ b/taichi/runtime/llvm/llvm_runtime_executor.h @@ -122,10 +122,6 @@ class LlvmRuntimeExecutor { /* -------------------------- */ /* ------ Member Access ----- */ /* -------------------------- */ - cuda::CudaDevice *cuda_device(); - cpu::CpuDevice *cpu_device(); - amdgpu::AmdgpuDevice *amdgpu_device(); - void finalize(); uint64 fetch_result_uint64(int i, uint64 *result_buffer); diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h index 05fe9d71cfb3d..678c64f01c484 100644 --- a/taichi/runtime/program_impls/llvm/llvm_program.h +++ b/taichi/runtime/program_impls/llvm/llvm_program.h @@ -254,14 +254,6 @@ class LlvmProgramImpl : public ProgramImpl { return runtime_exec_->get_snode_tree_device_ptr(tree_id); } - cuda::CudaDevice *cuda_device() { - return runtime_exec_->cuda_device(); - } - - cpu::CpuDevice *cpu_device() { - return runtime_exec_->cpu_device(); - } - LlvmDevice *llvm_device() { return runtime_exec_->llvm_device(); }