From b72cc0b51cf3d0faa355b575b38fd65e186962e6 Mon Sep 17 00:00:00 2001
From: jim19930609
Date: Mon, 13 Mar 2023 11:53:25 +0800
Subject: [PATCH 1/4] [lang] Refactor CudaCachingAllocator into a generic caching allocator class

---
 taichi/rhi/cuda/cuda_caching_allocator.cpp | 39 ++++++++++++++++++----
 taichi/rhi/cuda/cuda_caching_allocator.h   |  6 +++-
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/taichi/rhi/cuda/cuda_caching_allocator.cpp b/taichi/rhi/cuda/cuda_caching_allocator.cpp
index 7f1fab4384edb..3e6c9b763d154 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.cpp
+++ b/taichi/rhi/cuda/cuda_caching_allocator.cpp
@@ -1,4 +1,5 @@
 #include "taichi/rhi/cuda/cuda_caching_allocator.h"
+#include "taichi/runtime/llvm/snode_tree_buffer_manager.h"
 
 namespace taichi::lang {
 namespace cuda {
@@ -7,11 +8,34 @@ CudaCachingAllocator::CudaCachingAllocator(LlvmDevice *device)
     : device_(device) {
 }
 
+void CudaCachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
+  // merge with right block
+  if (ptr_map_[ptr + size]) {
+    std::size_t tmp = ptr_map_[ptr + size];
+    mem_blocks_.erase(std::make_pair(tmp, ptr + size));
+    ptr_map_.erase(ptr + size);
+    size += tmp;
+  }
+  // merge with left block
+  auto map_it = ptr_map_.lower_bound(ptr);
+  if (map_it != ptr_map_.begin()) {
+    auto x = *--map_it;
+    if (x.first + x.second == ptr) {
+      mem_blocks_.erase(std::make_pair(x.second, x.first));
+      ptr_map_.erase(x.first);
+      ptr = x.first;
+      size += x.second;
+    }
+  }
+  mem_blocks_.insert(std::make_pair(size, ptr));
+  ptr_map_[ptr] = size;
+}
+
 uint64_t *CudaCachingAllocator::allocate(
     const LlvmDevice::LlvmRuntimeAllocParams &params) {
   uint64_t *ret{nullptr};
   auto size_aligned = taichi::iroundup(params.size, taichi_page_size);
-  auto it_blk = mem_blocks_.lower_bound(size_aligned);
+  auto it_blk = mem_blocks_.lower_bound(std::make_pair(size_aligned, nullptr));
 
   if (it_blk != mem_blocks_.end()) {
     size_t remaining_sz = it_blk->first - size_aligned;
@@ -19,19 +43,22 @@ uint64_t *CudaCachingAllocator::allocate(
       TI_ASSERT(remaining_sz % taichi_page_size == 0);
       auto remaining_head =
           reinterpret_cast<uint8_t *>(it_blk->second) + size_aligned;
-      mem_blocks_.insert(
-          {remaining_sz, reinterpret_cast<uint64_t *>(remaining_head)});
+      mem_blocks_.insert(std::make_pair(remaining_sz, remaining_head));
+      ptr_map_.insert(std::make_pair(remaining_head, remaining_sz));
     }
-    ret = it_blk->second;
+    ret = reinterpret_cast<uint64_t *>(it_blk->second);
     mem_blocks_.erase(it_blk);
+    ptr_map_.erase(it_blk->second);
+
   } else {
-    ret = device_->allocate_llvm_runtime_memory_jit(params);
+    ret = reinterpret_cast<uint64_t *>(
+        device_->allocate_llvm_runtime_memory_jit(params));
   }
   return ret;
 }
 
 void CudaCachingAllocator::release(size_t sz, uint64_t *ptr) {
-  mem_blocks_.insert({sz, ptr});
+  merge_and_insert(reinterpret_cast<uint8_t *>(ptr), sz);
 }
 
 }  // namespace cuda
diff --git a/taichi/rhi/cuda/cuda_caching_allocator.h b/taichi/rhi/cuda/cuda_caching_allocator.h
index e86aac68fd7f4..b1f472e85fc89 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.h
+++ b/taichi/rhi/cuda/cuda_caching_allocator.h
@@ -6,6 +6,7 @@
 #include "taichi/inc/constants.h"
 #include <stdint.h>
 #include <map>
+#include <set>
 
 namespace taichi::lang {
 namespace cuda {
@@ -18,7 +19,10 @@ class CudaCachingAllocator {
   void release(size_t sz, uint64_t *ptr);
 
  private:
-  std::multimap<std::size_t, uint64_t *> mem_blocks_;
+  void merge_and_insert(uint8_t *ptr, std::size_t size);
+
+  std::set<std::pair<std::size_t, uint8_t *>> mem_blocks_;
+  std::map<uint8_t *, std::size_t> ptr_map_;
   LlvmDevice *device_{nullptr};
 };
 
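Note on PATCH 1/4: the patch replaces the old size -> pointer multimap with two structures, a std::set of (size, start) pairs ordered by block size and a start -> size map, so that a released block can be coalesced with the free blocks immediately to its right and left before being reinserted. The standalone sketch below is not Taichi code; the class, method, and member names (FreeListSketch, take, give_back, blocks_, sizes_) are placeholders invented here to illustrate the same approach: take() is the best-fit lookup used by allocate(), and give_back() is the merge-and-insert path used by release(). In the real patch a failed lookup falls back to device_->allocate_llvm_runtime_memory_jit(params) instead of returning nullptr.

#include <cstddef>
#include <cstdint>
#include <map>
#include <set>
#include <utility>

// Standalone illustration of the coalescing free list (placeholder names).
class FreeListSketch {
 public:
  // Best-fit lookup: returns a cached block of at least `size` bytes,
  // or nullptr if nothing in the cache is large enough.
  std::uint8_t *take(std::size_t size) {
    auto it = blocks_.lower_bound(std::make_pair(size, nullptr));
    if (it == blocks_.end()) {
      return nullptr;  // caller would fall back to a real device allocation
    }
    std::uint8_t *ptr = it->second;
    std::size_t remaining = it->first - size;
    blocks_.erase(it);
    sizes_.erase(ptr);
    if (remaining > 0) {
      // Keep the unused tail of the block as a smaller free block.
      blocks_.insert(std::make_pair(remaining, ptr + size));
      sizes_[ptr + size] = remaining;
    }
    return ptr;
  }

  // Return a block to the cache, merging it with adjacent free blocks.
  void give_back(std::uint8_t *ptr, std::size_t size) {
    // Right neighbor: a free block that starts exactly at ptr + size.
    auto right = sizes_.find(ptr + size);
    if (right != sizes_.end()) {
      blocks_.erase(std::make_pair(right->second, right->first));
      size += right->second;
      sizes_.erase(right);
    }
    // Left neighbor: a free block that ends exactly at ptr.
    auto left = sizes_.lower_bound(ptr);
    if (left != sizes_.begin()) {
      --left;
      if (left->first + left->second == ptr) {
        blocks_.erase(std::make_pair(left->second, left->first));
        ptr = left->first;
        size += left->second;
        sizes_.erase(left);
      }
    }
    blocks_.insert(std::make_pair(size, ptr));
    sizes_[ptr] = size;
  }

 private:
  // Free blocks ordered by (size, start address): lower_bound() is best fit.
  std::set<std::pair<std::size_t, std::uint8_t *>> blocks_;
  // Start address -> size of every free block, used for the neighbor checks.
  std::map<std::uint8_t *, std::size_t> sizes_;
};

Keeping the set keyed by (size, address) is what lets lower_bound() find the smallest block that still fits, while the address-keyed map makes the left/right neighbor checks logarithmic instead of a scan over all cached blocks.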
From b2cfe428fa6405cfd455e4a048c1a49615f4c2d1 Mon Sep 17 00:00:00 2001
From: jim19930609
Date: Mon, 13 Mar 2023 12:54:40 +0800
Subject: [PATCH 2/4] update

---
 taichi/rhi/cuda/CMakeLists.txt                    |  1 -
 taichi/rhi/cuda/cuda_device.cpp                   |  4 ++--
 taichi/rhi/cuda/cuda_device.h                     |  4 ++--
 taichi/rhi/llvm/CMakeLists.txt                    |  1 +
 .../allocator.cpp}                                | 13 +++++--------
 .../cuda_caching_allocator.h => llvm/allocator.h} |  6 ++----
 tests/python/test_ndarray.py                      |  2 +-
 7 files changed, 13 insertions(+), 18 deletions(-)
 rename taichi/rhi/{cuda/cuda_caching_allocator.cpp => llvm/allocator.cpp} (82%)
 rename taichi/rhi/{cuda/cuda_caching_allocator.h => llvm/allocator.h} (83%)

diff --git a/taichi/rhi/cuda/CMakeLists.txt b/taichi/rhi/cuda/CMakeLists.txt
index 1287602a2172b..0e1fdca52ff16 100644
--- a/taichi/rhi/cuda/CMakeLists.txt
+++ b/taichi/rhi/cuda/CMakeLists.txt
@@ -5,7 +5,6 @@ add_library(${CUDA_RHI})
 target_sources(${CUDA_RHI}
   PRIVATE
     cuda_device.cpp
-    cuda_caching_allocator.cpp
    cuda_context.cpp
     cuda_driver.cpp
     cuda_profiler.cpp
diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp
index 07281ee442db5..0a59f5bbf84ab 100644
--- a/taichi/rhi/cuda/cuda_device.cpp
+++ b/taichi/rhi/cuda/cuda_device.cpp
@@ -44,7 +44,7 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   if (params.use_cached) {
     if (caching_allocator_ == nullptr) {
-      caching_allocator_ = std::make_unique<CudaCachingAllocator>(this);
+      caching_allocator_ = std::make_unique<CachingAllocator>(this);
     }
     info.ptr = caching_allocator_->allocate(params);
     CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size);
@@ -72,7 +72,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
   TI_ASSERT(!info.is_imported);
   if (info.use_cached) {
     if (caching_allocator_ == nullptr) {
-      TI_ERROR("the CudaCachingAllocator is not initialized");
+      TI_ERROR("the CachingAllocator is not initialized");
     }
     caching_allocator_->release(info.size, (uint64_t *)info.ptr);
   } else if (!info.use_preallocated) {
diff --git a/taichi/rhi/cuda/cuda_device.h b/taichi/rhi/cuda/cuda_device.h
index 59e039ac84a50..c1e0185af879b 100644
--- a/taichi/rhi/cuda/cuda_device.h
+++ b/taichi/rhi/cuda/cuda_device.h
@@ -4,7 +4,7 @@
 
 #include "taichi/common/core.h"
 #include "taichi/rhi/cuda/cuda_driver.h"
-#include "taichi/rhi/cuda/cuda_caching_allocator.h"
+#include "taichi/rhi/llvm/allocator.h"
 #include "taichi/rhi/cuda/cuda_context.h"
 #include "taichi/rhi/llvm/llvm_device.h"
 
@@ -136,7 +136,7 @@ class CudaDevice : public LlvmDevice {
       TI_ERROR("invalid DeviceAllocation");
     }
   }
-  std::unique_ptr<CudaCachingAllocator> caching_allocator_{nullptr};
+  std::unique_ptr<CachingAllocator> caching_allocator_{nullptr};
 };
 
 }  // namespace cuda
diff --git a/taichi/rhi/llvm/CMakeLists.txt b/taichi/rhi/llvm/CMakeLists.txt
index 64bf1be4dbb1f..588e9707a8494 100644
--- a/taichi/rhi/llvm/CMakeLists.txt
+++ b/taichi/rhi/llvm/CMakeLists.txt
@@ -5,6 +5,7 @@ add_library(${LLVM_RHI})
 target_sources(${LLVM_RHI}
   PRIVATE
     llvm_device.cpp
+    allocator.cpp
   )
 
 target_include_directories(${LLVM_RHI}
diff --git a/taichi/rhi/cuda/cuda_caching_allocator.cpp b/taichi/rhi/llvm/allocator.cpp
similarity index 82%
rename from taichi/rhi/cuda/cuda_caching_allocator.cpp
rename to taichi/rhi/llvm/allocator.cpp
index 3e6c9b763d154..89091871afbe8 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.cpp
+++ b/taichi/rhi/llvm/allocator.cpp
@@ -1,14 +1,12 @@
-#include "taichi/rhi/cuda/cuda_caching_allocator.h"
+#include "taichi/rhi/llvm/allocator.h"
 #include "taichi/runtime/llvm/snode_tree_buffer_manager.h"
 
 namespace taichi::lang {
-namespace cuda {
 
-CudaCachingAllocator::CudaCachingAllocator(LlvmDevice *device)
-    : device_(device) {
+CachingAllocator::CachingAllocator(LlvmDevice *device) : device_(device) {
 }
 
-void CudaCachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
+void CachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
   // merge with right block
   if (ptr_map_[ptr + size]) {
     std::size_t tmp = ptr_map_[ptr + size];
@@ -31,7 +29,7 @@ void CudaCachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
   ptr_map_[ptr] = size;
 }
 
-uint64_t *CudaCachingAllocator::allocate(
+uint64_t *CachingAllocator::allocate(
     const LlvmDevice::LlvmRuntimeAllocParams &params) {
   uint64_t *ret{nullptr};
   auto size_aligned = taichi::iroundup(params.size, taichi_page_size);
@@ -57,9 +55,8 @@ uint64_t *CudaCachingAllocator::allocate(
   return ret;
 }
 
-void CudaCachingAllocator::release(size_t sz, uint64_t *ptr) {
+void CachingAllocator::release(size_t sz, uint64_t *ptr) {
   merge_and_insert(reinterpret_cast<uint8_t *>(ptr), sz);
 }
 
-}  // namespace cuda
 }  // namespace taichi::lang
diff --git a/taichi/rhi/cuda/cuda_caching_allocator.h b/taichi/rhi/llvm/allocator.h
similarity index 83%
rename from taichi/rhi/cuda/cuda_caching_allocator.h
rename to taichi/rhi/llvm/allocator.h
index b1f472e85fc89..9dd1263913c91 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.h
+++ b/taichi/rhi/llvm/allocator.h
@@ -9,11 +9,10 @@
 #include <set>
 
 namespace taichi::lang {
-namespace cuda {
 
-class CudaCachingAllocator {
+class CachingAllocator {
  public:
-  explicit CudaCachingAllocator(LlvmDevice *device);
+  explicit CachingAllocator(LlvmDevice *device);
 
   uint64_t *allocate(const LlvmDevice::LlvmRuntimeAllocParams &params);
   void release(size_t sz, uint64_t *ptr);
@@ -26,5 +25,4 @@ class CudaCachingAllocator {
   LlvmDevice *device_{nullptr};
 };
 
-}  // namespace cuda
 }  // namespace taichi::lang
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index e6ba13be0eed1..a0e556f4c52eb 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -273,7 +273,7 @@ def test_ndarray_deepcopy():
 
 
 @test_utils.test(arch=[ti.cuda], ndarray_use_cached_allocator=True)
-def test_ndarray_cuda_caching_allocator():
+def test_ndarray_caching_allocator():
     n = 8
     a = ti.ndarray(ti.i32, shape=(n))
     a.fill(2)

From da173c82403a66254703c679045abd30cf36cccd Mon Sep 17 00:00:00 2001
From: Zhanlue Yang
Date: Tue, 14 Mar 2023 09:50:10 +0800
Subject: [PATCH 3/4] Update test_ndarray.py

---
 tests/python/test_ndarray.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index a0e556f4c52eb..142d0b69dd31f 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -618,6 +618,7 @@ def test_ndarray_grouped():
     _test_ndarray_grouped()
 
 
+
 @test_utils.test(arch=[ti.cpu, ti.cuda], real_matrix_scalarize=False)
 def test_ndarray_grouped_real_matrix():
     _test_ndarray_grouped()

From 00bcb59d315882c2378a0f6c116a232dfa43873f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 14 Mar 2023 01:51:35 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/python/test_ndarray.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index 142d0b69dd31f..a0e556f4c52eb 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -618,7 +618,6 @@ def test_ndarray_grouped():
     _test_ndarray_grouped()
 
 
-
 @test_utils.test(arch=[ti.cpu, ti.cuda], real_matrix_scalarize=False)
 def test_ndarray_grouped_real_matrix():
     _test_ndarray_grouped()
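Closing note: the practical effect of the coalescing introduced in PATCH 1/4 (and exercised through the renamed test_ndarray_caching_allocator test) is that two adjacent freed blocks can later satisfy one larger request straight from the cache. The tiny driver below reuses the placeholder FreeListSketch class from the note after PATCH 1/4 (again a sketch, not Taichi code), with an ordinary host buffer standing in for device memory:

#include <cassert>
#include <cstdint>

// Reuses the FreeListSketch class defined in the note after PATCH 1/4.
int main() {
  static std::uint8_t backing[8192];  // stand-in for two 4 KiB device pages
  FreeListSketch cache;

  // Release two adjacent 4 KiB blocks; the second merges with the first.
  cache.give_back(backing, 4096);
  cache.give_back(backing + 4096, 4096);

  // Because the blocks were coalesced, an 8 KiB request is served from the
  // cache instead of requiring a fresh device allocation.
  assert(cache.take(8192) == backing);
  return 0;
}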