From 284056206dd6a5f8a95fa61930242d9762d8d8f4 Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Thu, 29 Dec 2022 16:44:13 +0800
Subject: [PATCH] [bug] Fix device memory allocation for numpy array on CUDA
 backend

---
 taichi/codegen/cuda/codegen_cuda.cpp          | 11 +++++++++--
 taichi/runtime/llvm/llvm_runtime_executor.cpp |  4 ++++
 taichi/runtime/llvm/llvm_runtime_executor.h   |  2 ++
 3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index 8a1187c00b5294..b02073799988a5 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -627,6 +627,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(
     CUDAContext::get_instance().make_current();
     std::vector<void *> arg_buffers(args.size(), nullptr);
     std::vector<void *> device_buffers(args.size(), nullptr);
+    std::vector<DeviceAllocation> temporary_devallocs(args.size());
 
     bool transferred = false;
     for (int i = 0; i < (int)args.size(); i++) {
@@ -655,7 +656,13 @@ FunctionType CUDAModuleToFunctionConverter::convert(
             //   host.
             // See CUDA driver API `cuPointerGetAttribute` for more details.
             transferred = true;
-            CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz);
+
+            auto result_buffer = context.result_buffer;
+            DeviceAllocation devalloc =
+                executor->allocate_memory_ndarray(arr_sz, result_buffer);
+            device_buffers[i] = executor->get_ndarray_alloc_info_ptr(devalloc);
+            temporary_devallocs[i] = devalloc;
+
             CUDADriver::get_instance().memcpy_host_to_device(
                 (void *)device_buffers[i], arg_buffers[i], arr_sz);
           } else {
@@ -703,7 +710,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(
           CUDADriver::get_instance().memcpy_device_to_host(
               arg_buffers[i], (void *)device_buffers[i],
               context.array_runtime_sizes[i]);
-          CUDADriver::get_instance().mem_free((void *)device_buffers[i]);
+          executor->deallocate_memory_ndarray(temporary_devallocs[i]);
         }
       }
     }
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index b137aba54f6324..f4b6f1b75e515c 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -494,6 +494,10 @@ DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
        result_buffer});
 }
 
+void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) {
+  cuda_device()->dealloc_memory(handle);  // NOTE(review): always goes through cuda_device(); confirm no non-CUDA callers
+}
+
 void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
                                        std::size_t size,
                                        uint32_t data) {
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
index b662cd7e9bf9c7..7bf1178397981b 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.h
+++ b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -49,6 +49,8 @@ class LlvmRuntimeExecutor {
   DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
                                            uint64 *result_buffer);
 
+  void deallocate_memory_ndarray(DeviceAllocation handle);  // Releases `handle` via cuda_device()->dealloc_memory().
+
   void check_runtime_error(uint64 *result_buffer);
 
   uint64_t *get_ndarray_alloc_info_ptr(const DeviceAllocation &alloc);