From 398dea9a329bb22b5425ae501a42c68e70fe9c99 Mon Sep 17 00:00:00 2001
From: Zhanlue Yang
Date: Mon, 8 May 2023 16:52:51 +0800
Subject: [PATCH] [Lang] Remove deprecated compile option
 ndarray_use_cached_allocator (#7937)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue: #

### Brief Summary

### 🤖 Generated by Copilot at b10a055

Removed the `use_cached` flag and related parameters from the classes and
functions that handle memory allocation for the different devices. The flag
was redundant, since all devices now use a caching allocator by default;
removing it simplifies the memory-management logic.

### Walkthrough

### 🤖 Generated by Copilot at b10a055

* Remove the `ndarray_use_cached_allocator` flag and simplify the memory
  allocation logic for the different devices
* Eliminate the `ndarray_use_cached_allocator` field from the `CompileConfig`
  struct and its Python binding
  ([link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-84b498f7923f43bc9e292dffb7b3d9a0cb7da1c16ddd0062af184a1b7b777976L45),
  [link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-604e33db5b7c4f4b819098dee1f2634a36c0974822c1538aec0d57d28ca00dd9L40),
  [link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-af631a0c71978fe591e17005f01f7c06bc30ae36c65df306bbb3b08ade770167L196-L197))
* Remove the `use_cached` branch from `allocate_memory_runtime` in the
  `CudaDevice` and `AmdgpuDevice` classes, making the cached path
  unconditional
  ([link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-e0843d34a17298595cd26f5b47b1933a0d18c5c795b0c91b73ec8541a2cdd3f9L59-R59),
  [link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-e0843d34a17298595cd26f5b47b1933a0d18c5c795b0c91b73ec8541a2cdd3f9L65-R67),
  [link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-7919f5d7e33aafc72f27ed93febc58a0ac77c220ae718bf78ef134dad3790654L54-R54),
  [link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-7919f5d7e33aafc72f27ed93febc58a0ac77c220ae718bf78ef134dad3790654L61-R63))
* Drop the flag from the `LlvmRuntimeAllocParams` built in
  `LlvmRuntime::allocate_memory` and in
  `LlvmRuntimeExecutor::allocate_memory_ndarray`
  ([link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-10c4f36944f322074d4a7f5e6d1878ada018992f7c965f6944c60d4468f7719dL55-R61),
  [link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-b9155792159f392bd8bacd44cb1819be5239b022d707499fc364c0f93dd8c5e5L478))
* Remove the `use_cached` field from the `LlvmRuntimeAllocParams` struct in
  `LlvmDevice`
  ([link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-96f2f848955ca34e50c83bdaa4b970cd7d60ac595e924f626c4568a03a43658eL13))
* Remove the `ndarray_use_cached_allocator` parameter from the
  `test_utils.test` decorator on `test_ndarray_caching_allocator` in
  `test_ndarray.py`
  ([link](https://github.com/taichi-dev/taichi/pull/7937/files?diff=unified&w=0#diff-ca3c8d1edb25b6a7f4affbb79b2e3e74f73b3757e5d465258ce42ea9eb09fbc0L281-R281))
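For context on why caching is now the only path: a caching (pooling)
allocator recycles previously released device blocks through a size-keyed
free list instead of returning them to the driver, so the frequent
allocate/release cycles of ndarrays avoid expensive driver round trips. The
sketch below is a minimal, hypothetical illustration of that reuse pattern,
using host `malloc`/`free` as a stand-in backend; it is not Taichi's actual
`DeviceMemoryPool` implementation.

```cpp
// Minimal sketch of a size-bucketed caching allocator (hypothetical;
// NOT Taichi's DeviceMemoryPool). Released blocks are parked in a
// free list keyed by their rounded-up size and handed back on the
// next allocation of the same bucket, skipping the backend entirely.
#include <cstdlib>
#include <iostream>
#include <map>
#include <vector>

class CachingAllocator {
 public:
  void *allocate(std::size_t size) {
    size = round_up(size);
    auto &bucket = free_blocks_[size];
    if (!bucket.empty()) {  // cache hit: reuse a released block
      void *ptr = bucket.back();
      bucket.pop_back();
      return ptr;
    }
    return std::malloc(size);  // cache miss: fall through to the backend
  }

  void release(std::size_t size, void *ptr) {
    // Park the block for reuse instead of freeing it immediately.
    free_blocks_[round_up(size)].push_back(ptr);
  }

  ~CachingAllocator() {  // return everything to the backend at shutdown
    for (auto &[size, bucket] : free_blocks_)
      for (void *ptr : bucket)
        std::free(ptr);
  }

 private:
  static std::size_t round_up(std::size_t size) {
    constexpr std::size_t kPage = 4096;  // plays the role of taichi_page_size
    return (size + kPage - 1) / kPage * kPage;
  }

  std::map<std::size_t, std::vector<void *>> free_blocks_;
};

int main() {
  CachingAllocator pool;
  void *a = pool.allocate(1000);  // backend allocation, rounded up to 4096
  pool.release(1000, a);
  void *b = pool.allocate(2048);  // same bucket: the cached block is reused
  std::cout << (a == b) << "\n";  // prints 1
  pool.release(2048, b);
}
```

With the flag gone, the uncached fallback (`allocate_llvm_runtime_memory_jit`)
is no longer reachable from the device call sites, which is why both backends
in the diff below can hard-code `info.use_cached = true`.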
---
 c_api/src/taichi_llvm_impl.cpp                | 7 +++----
 taichi/program/compile_config.cpp             | 1 -
 taichi/program/compile_config.h               | 1 -
 taichi/python/export_lang.cpp                 | 2 --
 taichi/rhi/amdgpu/amdgpu_device.cpp           | 6 ++----
 taichi/rhi/cuda/cuda_device.cpp               | 6 ++----
 taichi/rhi/llvm/llvm_device.h                 | 1 -
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 1 -
 tests/python/test_ndarray.py                  | 2 +-
 9 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/c_api/src/taichi_llvm_impl.cpp b/c_api/src/taichi_llvm_impl.cpp
index c39bf619a7ed6..a8b2fb8d8c6d2 100644
--- a/c_api/src/taichi_llvm_impl.cpp
+++ b/c_api/src/taichi_llvm_impl.cpp
@@ -52,14 +52,13 @@ taichi::lang::Device &LlvmRuntime::get() {
 
 TiMemory LlvmRuntime::allocate_memory(
     const taichi::lang::Device::AllocParams &params) {
-  const taichi::lang::CompileConfig &config = executor_->get_config();
   taichi::lang::LLVMRuntime *llvm_runtime = executor_->get_llvm_runtime();
   taichi::lang::LlvmDevice *llvm_device = executor_->llvm_device();
 
   taichi::lang::DeviceAllocation devalloc =
-      llvm_device->allocate_memory_runtime(
-          {params, config.ndarray_use_cached_allocator,
-           executor_->get_runtime_jit_module(), llvm_runtime, result_buffer});
+      llvm_device->allocate_memory_runtime({params,
+                                            executor_->get_runtime_jit_module(),
+                                            llvm_runtime, result_buffer});
 
   return devalloc2devmem(*this, devalloc);
 }
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index 64c04371a44a2..bb6c102041d7d 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -42,7 +42,6 @@ CompileConfig::CompileConfig() {
   make_thread_local = true;
   make_block_local = true;
   detect_read_only = true;
-  ndarray_use_cached_allocator = true;
   real_matrix_scalarize = true;
   half2_vectorization = false;
   make_cpu_multithreading_loop = true;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index df1650ec163a3..619109f4e86d8 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -37,7 +37,6 @@ struct CompileConfig {
   bool make_thread_local;
   bool make_block_local;
   bool detect_read_only;
-  bool ndarray_use_cached_allocator;
   bool real_matrix_scalarize;
   bool half2_vectorization;
   bool make_cpu_multithreading_loop;
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 04301f154e5d6..fe3e26dd213b4 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -193,8 +193,6 @@ void export_lang(py::module &m) {
       .def_readwrite("make_thread_local", &CompileConfig::make_thread_local)
       .def_readwrite("make_block_local", &CompileConfig::make_block_local)
       .def_readwrite("detect_read_only", &CompileConfig::detect_read_only)
-      .def_readwrite("ndarray_use_cached_allocator",
-                     &CompileConfig::ndarray_use_cached_allocator)
       .def_readwrite("real_matrix_scalarize",
                      &CompileConfig::real_matrix_scalarize)
       .def_readwrite("half2_vectorization", &CompileConfig::half2_vectorization)
diff --git a/taichi/rhi/amdgpu/amdgpu_device.cpp b/taichi/rhi/amdgpu/amdgpu_device.cpp
index f2d94a9c06883..6107904c9f644 100644
--- a/taichi/rhi/amdgpu/amdgpu_device.cpp
+++ b/taichi/rhi/amdgpu/amdgpu_device.cpp
@@ -56,17 +56,15 @@ DeviceAllocation AmdgpuDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   if (params.host_read || params.host_write) {
     TI_NOT_IMPLEMENTED
-  } else if (params.use_cached) {
+  } else {
     info.ptr =
         DeviceMemoryPool::get_instance().allocate_with_cache(this, params);
     TI_ASSERT(info.ptr != nullptr);
 
     AMDGPUDriver::get_instance().memset((void *)info.ptr, 0, info.size);
-  } else {
-    info.ptr = allocate_llvm_runtime_memory_jit(params);
   }
   info.is_imported = false;
-  info.use_cached = params.use_cached;
+  info.use_cached = true;
   info.use_preallocated = true;
 
   DeviceAllocation alloc;
diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp
index bb4bded239eb2..f75f657972cbf 100644
--- a/taichi/rhi/cuda/cuda_device.cpp
+++ b/taichi/rhi/cuda/cuda_device.cpp
@@ -51,18 +51,16 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   if (info.size == 0) {
     info.ptr = nullptr;
-  } else if (params.use_cached) {
+  } else {
    info.ptr =
         DeviceMemoryPool::get_instance().allocate_with_cache(this, params);
     TI_ASSERT(info.ptr != nullptr);
 
     CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size);
-  } else {
-    info.ptr = allocate_llvm_runtime_memory_jit(params);
   }
 
   info.is_imported = false;
-  info.use_cached = params.use_cached;
+  info.use_cached = true;
   info.use_preallocated = true;
 
   DeviceAllocation alloc;
diff --git a/taichi/rhi/llvm/llvm_device.h b/taichi/rhi/llvm/llvm_device.h
index e7104f94a20fb..7997a5c3fe964 100644
--- a/taichi/rhi/llvm/llvm_device.h
+++ b/taichi/rhi/llvm/llvm_device.h
@@ -10,7 +10,6 @@ struct LLVMRuntime;
 class LlvmDevice : public Device {
  public:
   struct LlvmRuntimeAllocParams : AllocParams {
-    bool use_cached{true};
     JITModule *runtime_jit{nullptr};
     LLVMRuntime *runtime{nullptr};
     uint64 *result_buffer{nullptr};
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index c7ad98b5201e6..ffb35055618a0 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -475,7 +475,6 @@ DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
   return llvm_device()->allocate_memory_runtime(
       {{alloc_size, /*host_write=*/false, /*host_read=*/false,
         /*export_sharing=*/false, AllocUsage::Storage},
-       config_.ndarray_use_cached_allocator,
       get_runtime_jit_module(),
       get_llvm_runtime(),
       result_buffer});
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index e3c89fa3e0f00..f139d44917c75 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -278,7 +278,7 @@ def test_ndarray_deepcopy():
     assert y[4][1, 0] == 9
 
 
-@test_utils.test(arch=[ti.cuda], ndarray_use_cached_allocator=True)
+@test_utils.test(arch=[ti.cuda])
 def test_ndarray_caching_allocator():
     n = 8
     a = ti.ndarray(ti.i32, shape=(n))
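A closing note on the API shape: `LlvmRuntimeAllocParams` is a plain
aggregate, so dropping its leading `use_cached` field removes one element
from every brace-initializer at the call sites, as seen in
`taichi_llvm_impl.cpp` and `llvm_runtime_executor.cpp` above. The standalone
mock below (hypothetical stand-in types, not the real Taichi headers)
reproduces the post-patch initialization pattern:

```cpp
// Hypothetical stand-ins for the Taichi types touched by this patch,
// reduced to plain aggregates to show the post-patch call-site shape.
#include <cstddef>
#include <cstdint>

struct AllocParams {  // stand-in for Device::AllocParams
  std::size_t size{0};
  bool host_write{false};
  bool host_read{false};
  bool export_sharing{false};
  int usage{0};  // stand-in for AllocUsage::Storage
};

struct JITModule;  // opaque, as in llvm_device.h
struct LLVMRuntime;

// Post-patch LlvmRuntimeAllocParams: the leading `bool use_cached{true};`
// member is gone, so aggregate initializers pass one value fewer.
struct LlvmRuntimeAllocParams : AllocParams {
  JITModule *runtime_jit{nullptr};
  LLVMRuntime *runtime{nullptr};
  uint64_t *result_buffer{nullptr};
};

int main() {
  uint64_t result_buffer[1] = {0};
  // Mirrors the rewritten initializer in allocate_memory_ndarray():
  LlvmRuntimeAllocParams params{
      {/*size=*/4096, /*host_write=*/false, /*host_read=*/false,
       /*export_sharing=*/false, /*usage=*/0},
      /*runtime_jit=*/nullptr, /*runtime=*/nullptr, result_buffer};
  return params.size == 4096 ? 0 : 1;  // compiles with -std=c++17
}
```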