Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[lang] Refactor allocation logic for SNodeTreeBufferManager #7795

Merged
merged 3 commits on Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions taichi/rhi/cpu/cpu_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,19 @@ CpuDevice::CpuDevice() {
RhiResult CpuDevice::allocate_memory(const AllocParams &params,
DeviceAllocation *out_devalloc) {
AllocInfo info;

info.ptr = HostMemoryPool::get_instance().allocate(
params.size, HostMemoryPool::page_size, true /*exclusive*/);
info.size = params.size;
info.use_cached = false;

if (info.ptr == nullptr) {
return RhiResult::out_of_memory;
}
if (info.size == 0) {
info.ptr = nullptr;
} else {
info.ptr = HostMemoryPool::get_instance().allocate(
params.size, HostMemoryPool::page_size, true /*exclusive*/);

if (info.ptr == nullptr) {
return RhiResult::out_of_memory;
}
}
*out_devalloc = DeviceAllocation{};
out_devalloc->alloc_id = allocations_.size();
out_devalloc->device = this;
Expand All @@ -48,6 +51,9 @@ DeviceAllocation CpuDevice::allocate_memory_runtime(
void CpuDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
if (info.size == 0) {
return;
}
if (info.ptr == nullptr) {
TI_ERROR("the DeviceAllocation is already deallocated");
}
Expand Down
7 changes: 6 additions & 1 deletion taichi/rhi/cuda/cuda_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
const LlvmRuntimeAllocParams &params) {
AllocInfo info;
info.size = taichi::iroundup(params.size, taichi_page_size);
if (params.use_cached) {
if (info.size == 0) {
info.ptr = nullptr;
} else if (params.use_cached) {
info.ptr =
DeviceMemoryPool::get_instance().allocate_with_cache(this, params);

Expand Down Expand Up @@ -76,6 +78,9 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {

validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
if (info.size == 0) {
return;
}
if (info.ptr == nullptr) {
TI_ERROR("the DeviceAllocation is already deallocated");
}
Expand Down
5 changes: 2 additions & 3 deletions taichi/runtime/llvm/llvm_runtime_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,9 +397,8 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
TI_TRACE("Allocating data structure of size {} bytes", root_size);
std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);

Ptr root_buffer = snode_tree_buffer_manager_->allocate(
runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id,
result_buffer);
Ptr root_buffer = snode_tree_buffer_manager_->allocate(rounded_size, tree_id,
result_buffer);
if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
Expand Down
70 changes: 7 additions & 63 deletions taichi/runtime/llvm/snode_tree_buffer_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,74 +9,18 @@ SNodeTreeBufferManager::SNodeTreeBufferManager(
TI_TRACE("SNode tree buffer manager created.");
}

// Returns the block [ptr, ptr + size) to the free list, coalescing it with a
// free block that starts exactly at its end and/or one that ends exactly at
// its start. The resulting block is recorded in both indices: `ptr_map_`
// (block start -> block size) and `size_set_` ((size, start) pairs).
void SNodeTreeBufferManager::merge_and_insert(Ptr ptr, std::size_t size) {
  // merge with right block
  // Look the neighbour up with find() instead of operator[]: operator[]
  // default-inserts a zero-sized entry at `ptr + size` when no block starts
  // there, and such a phantom entry can later be returned as the predecessor
  // in the left-merge step below, hiding the real adjacent block and
  // preventing the merge.
  if (auto right = ptr_map_.find(ptr + size); right != ptr_map_.end()) {
    const std::size_t right_size = right->second;
    size_set_.erase(std::make_pair(right_size, right->first));
    ptr_map_.erase(right);
    size += right_size;
  }
  // merge with left block
  // lower_bound(ptr) is the first block starting at or after `ptr`; the entry
  // just before it (if any) is the closest block starting below `ptr`.
  auto map_it = ptr_map_.lower_bound(ptr);
  if (map_it != ptr_map_.begin()) {
    --map_it;
    const Ptr left_ptr = map_it->first;
    const std::size_t left_size = map_it->second;
    // Only merge when the left block ends exactly where this one begins.
    if (left_ptr + left_size == ptr) {
      size_set_.erase(std::make_pair(left_size, left_ptr));
      ptr_map_.erase(map_it);
      ptr = left_ptr;
      size += left_size;
    }
  }
  size_set_.insert(std::make_pair(size, ptr));
  ptr_map_[ptr] = size;
}

// Allocates the root buffer for SNode tree `snode_tree_id` and returns a
// pointer to it.
//
// NOTE(review): this span contains two signature openers (a six-parameter
// form and a three-parameter form) and two bodies (a free-list body and a
// DeviceAllocation body). It looks like both sides of a diff rendered without
// +/- markers -- confirm which lines are live before changing any code here.
Ptr SNodeTreeBufferManager::allocate(JITModule *runtime_jit,
void *runtime,
std::size_t size,
std::size_t alignment,
Ptr SNodeTreeBufferManager::allocate(std::size_t size,
const int snode_tree_id,
uint64 *result_buffer) {
TI_TRACE("allocating memory for SNode Tree {}", snode_tree_id);
TI_ASSERT_INFO(snode_tree_id < kMaxNumSnodeTreesLlvm,
"LLVM backend supports up to {} snode trees",
kMaxNumSnodeTreesLlvm);
// Free-list body: size_set_ is ordered by (size, start), so this finds the
// smallest recorded free block whose size is at least `size`.
auto set_it = size_set_.lower_bound(std::make_pair(size, nullptr));
if (set_it == size_set_.end()) {
// No free block is large enough: call "runtime_memory_allocate_aligned"
// through the JIT module and read the resulting pointer from slot 0 of
// result_buffer.
runtime_jit->call<void *, std::size_t, std::size_t>(
"runtime_memory_allocate_aligned", runtime, size, alignment,
result_buffer);
auto ptr = runtime_exec_->fetch_result<Ptr>(0, result_buffer);
roots_[snode_tree_id] = ptr;
sizes_[snode_tree_id] = size;
return ptr;
} else {
// Reuse the free block: drop it from both indices, then put back whatever
// is left after carving `size` bytes off its front.
auto x = *set_it;
size_set_.erase(x);
ptr_map_.erase(x.second);
// Both operands are std::size_t and lower_bound guarantees x.first >= size,
// so this test is equivalent to x.first != size.
if (x.first - size > 0) {
size_set_.insert(std::make_pair(x.first - size, x.second + size));
ptr_map_[x.second + size] = x.first - size;
}
TI_ASSERT(x.second);
roots_[snode_tree_id] = x.second;
sizes_[snode_tree_id] = size;
return x.second;
}
// DeviceAllocation body: allocate through the runtime executor, remember the
// handle under the tree id, and return the allocation's pointer.
// NOTE(review): both branches above return, so read as a single function these
// statements are unreachable -- presumably they belong to the other version.
auto devalloc = runtime_exec_->allocate_memory_ndarray(size, result_buffer);
snode_tree_id_to_device_alloc_[snode_tree_id] = devalloc;
return (Ptr)runtime_exec_->get_ndarray_alloc_info_ptr(devalloc);
}

// Releases the root buffer recorded for `snode_tree`.
//
// NOTE(review): this body mixes a free-list path (sizes_/roots_ and
// merge_and_insert) with a DeviceAllocation path
// (snode_tree_id_to_device_alloc_). It looks like both sides of a diff
// rendered without +/- markers -- confirm which statements are live.
void SNodeTreeBufferManager::destroy(SNodeTree *snode_tree) {
int snode_tree_id = snode_tree->id();
TI_TRACE("Destroying SNode tree {}.", snode_tree_id);
std::size_t size = sizes_[snode_tree_id];
// A recorded size of 0 is logged as a failed destroy and nothing is freed.
if (size == 0) {
TI_DEBUG("SNode tree {} destroy failed.", snode_tree_id);
return;
}
// Free-list path: hand the tree's block back for reuse.
Ptr ptr = roots_[snode_tree_id];
merge_and_insert(ptr, size);
TI_DEBUG("SNode tree {} destroyed.", snode_tree_id);
// DeviceAllocation path: free the handle stored for this tree and forget it.
// operator[] default-constructs a DeviceAllocation if the id was never
// recorded; whether deallocate_memory_ndarray accepts that is not visible
// here -- TODO confirm.
auto devalloc = snode_tree_id_to_device_alloc_[snode_tree->id()];
runtime_exec_->deallocate_memory_ndarray(devalloc);
snode_tree_id_to_device_alloc_.erase(snode_tree->id());
}

} // namespace taichi::lang
13 changes: 3 additions & 10 deletions taichi/runtime/llvm/snode_tree_buffer_manager.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once
#include "taichi/inc/constants.h"
#include "taichi/struct/snode_tree.h"
#include "taichi/rhi/public_device.h"
#define TI_RUNTIME_HOST

#include <set>
Expand All @@ -16,23 +17,15 @@ class SNodeTreeBufferManager {
public:
explicit SNodeTreeBufferManager(LlvmRuntimeExecutor *runtime_exec);

void merge_and_insert(Ptr ptr, std::size_t size);

Ptr allocate(JITModule *runtime_jit,
void *runtime,
std::size_t size,
std::size_t alignment,
Ptr allocate(std::size_t size,
const int snode_tree_id,
uint64 *result_buffer);

void destroy(SNodeTree *snode_tree);

private:
std::set<std::pair<std::size_t, Ptr>> size_set_;
std::map<Ptr, std::size_t> ptr_map_;
LlvmRuntimeExecutor *runtime_exec_;
Ptr roots_[kMaxNumSnodeTreesLlvm];
std::size_t sizes_[kMaxNumSnodeTreesLlvm];
std::map<int, DeviceAllocation> snode_tree_id_to_device_alloc_;
};

} // namespace taichi::lang