Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[bug] Fix device memory allocation for numpy array on CUDA backend #7008

Merged
merged 1 commit into from
Jan 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions taichi/codegen/cuda/codegen_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(
CUDAContext::get_instance().make_current();
std::vector<void *> arg_buffers(args.size(), nullptr);
std::vector<void *> device_buffers(args.size(), nullptr);
std::vector<DeviceAllocation> temporary_devallocs(args.size());

bool transferred = false;
for (int i = 0; i < (int)args.size(); i++) {
Expand Down Expand Up @@ -655,7 +656,13 @@ FunctionType CUDAModuleToFunctionConverter::convert(
// host.
// See CUDA driver API `cuPointerGetAttribute` for more details.
transferred = true;
CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz);

auto result_buffer = context.result_buffer;
DeviceAllocation devalloc =
executor->allocate_memory_ndarray(arr_sz, result_buffer);
device_buffers[i] = executor->get_ndarray_alloc_info_ptr(devalloc);
temporary_devallocs[i] = devalloc;

CUDADriver::get_instance().memcpy_host_to_device(
(void *)device_buffers[i], arg_buffers[i], arr_sz);
} else {
Expand Down Expand Up @@ -703,7 +710,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(
CUDADriver::get_instance().memcpy_device_to_host(
arg_buffers[i], (void *)device_buffers[i],
context.array_runtime_sizes[i]);
CUDADriver::get_instance().mem_free((void *)device_buffers[i]);
executor->deallocate_memory_ndarray(temporary_devallocs[i]);
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions taichi/runtime/llvm/llvm_runtime_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,10 @@ DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
result_buffer});
}

// Releases a device memory block previously returned by
// allocate_memory_ndarray(), delegating to the device's dealloc_memory().
// NOTE(review): this forwards unconditionally to cuda_device() even though
// LlvmRuntimeExecutor is backend-generic — presumably only reached on the
// CUDA backend; confirm other LLVM backends never call this.
void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) {
cuda_device()->dealloc_memory(handle);
}

void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
std::size_t size,
uint32_t data) {
Expand Down
2 changes: 2 additions & 0 deletions taichi/runtime/llvm/llvm_runtime_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class LlvmRuntimeExecutor {
DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
uint64 *result_buffer);

// Frees a DeviceAllocation handle previously produced by
// allocate_memory_ndarray() above.
void deallocate_memory_ndarray(DeviceAllocation handle);

void check_runtime_error(uint64 *result_buffer);

uint64_t *get_ndarray_alloc_info_ptr(const DeviceAllocation &alloc);
Expand Down