From b72cc0b51cf3d0faa355b575b38fd65e186962e6 Mon Sep 17 00:00:00 2001
From: jim19930609
Date: Mon, 13 Mar 2023 11:53:25 +0800
Subject: [PATCH 1/4] [lang] Refactor CudaCachingAllocator into a generic caching allocator class

---
 taichi/rhi/cuda/cuda_caching_allocator.cpp | 39 ++++++++++++++++++----
 taichi/rhi/cuda/cuda_caching_allocator.h   |  6 +++-
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/taichi/rhi/cuda/cuda_caching_allocator.cpp b/taichi/rhi/cuda/cuda_caching_allocator.cpp
index 7f1fab4384edb..3e6c9b763d154 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.cpp
+++ b/taichi/rhi/cuda/cuda_caching_allocator.cpp
@@ -1,4 +1,5 @@
 #include "taichi/rhi/cuda/cuda_caching_allocator.h"
+#include "taichi/runtime/llvm/snode_tree_buffer_manager.h"
 
 namespace taichi::lang {
 namespace cuda {
@@ -7,11 +8,34 @@ CudaCachingAllocator::CudaCachingAllocator(LlvmDevice *device)
     : device_(device) {
 }
 
+void CudaCachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
+  // merge with right block
+  if (ptr_map_[ptr + size]) {
+    std::size_t tmp = ptr_map_[ptr + size];
+    mem_blocks_.erase(std::make_pair(tmp, ptr + size));
+    ptr_map_.erase(ptr + size);
+    size += tmp;
+  }
+  // merge with left block
+  auto map_it = ptr_map_.lower_bound(ptr);
+  if (map_it != ptr_map_.begin()) {
+    auto x = *--map_it;
+    if (x.first + x.second == ptr) {
+      mem_blocks_.erase(std::make_pair(x.second, x.first));
+      ptr_map_.erase(x.first);
+      ptr = x.first;
+      size += x.second;
+    }
+  }
+  mem_blocks_.insert(std::make_pair(size, ptr));
+  ptr_map_[ptr] = size;
+}
+
 uint64_t *CudaCachingAllocator::allocate(
     const LlvmDevice::LlvmRuntimeAllocParams &params) {
   uint64_t *ret{nullptr};
   auto size_aligned = taichi::iroundup(params.size, taichi_page_size);
-  auto it_blk = mem_blocks_.lower_bound(size_aligned);
+  auto it_blk = mem_blocks_.lower_bound(std::make_pair(size_aligned, nullptr));
 
   if (it_blk != mem_blocks_.end()) {
     size_t remaining_sz = it_blk->first - size_aligned;
@@ -19,19 +43,22 @@ uint64_t *CudaCachingAllocator::allocate(
       TI_ASSERT(remaining_sz % taichi_page_size == 0);
       auto remaining_head =
           reinterpret_cast<uint8_t *>(it_blk->second) + size_aligned;
-      mem_blocks_.insert(
-          {remaining_sz, reinterpret_cast<uint64_t *>(remaining_head)});
+      mem_blocks_.insert(std::make_pair(remaining_sz, remaining_head));
+      ptr_map_.insert(std::make_pair(remaining_head, remaining_sz));
     }
-    ret = it_blk->second;
+    ret = reinterpret_cast<uint64_t *>(it_blk->second);
     mem_blocks_.erase(it_blk);
+    ptr_map_.erase(it_blk->second);
+
   } else {
-    ret = device_->allocate_llvm_runtime_memory_jit(params);
+    ret = reinterpret_cast<uint64_t *>(
+        device_->allocate_llvm_runtime_memory_jit(params));
   }
   return ret;
 }
 
 void CudaCachingAllocator::release(size_t sz, uint64_t *ptr) {
-  mem_blocks_.insert({sz, ptr});
+  merge_and_insert(reinterpret_cast<uint8_t *>(ptr), sz);
 }
 
 }  // namespace cuda
diff --git a/taichi/rhi/cuda/cuda_caching_allocator.h b/taichi/rhi/cuda/cuda_caching_allocator.h
index e86aac68fd7f4..b1f472e85fc89 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.h
+++ b/taichi/rhi/cuda/cuda_caching_allocator.h
@@ -6,6 +6,7 @@
 #include "taichi/inc/constants.h"
 #include <stdint.h>
 #include <map>
+#include <set>
 
 namespace taichi::lang {
 namespace cuda {
@@ -18,7 +19,10 @@ class CudaCachingAllocator {
   void release(size_t sz, uint64_t *ptr);
 
  private:
-  std::multimap<std::size_t, uint64_t *> mem_blocks_;
+  void merge_and_insert(uint8_t *ptr, std::size_t size);
+
+  std::set<std::pair<std::size_t, uint8_t *>> mem_blocks_;
+  std::map<uint8_t *, std::size_t> ptr_map_;
   LlvmDevice *device_{nullptr};
 };
 
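Note on PATCH 1/4: the patch replaces the old size -> pointer multimap with two structures, a std::set of (size, start) pairs ordered by block size and a start -> size map, so that a released block can be coalesced with the free blocks immediately to its right and left before being reinserted. The standalone sketch below is not Taichi code; the class, method, and member names (FreeListSketch, take, give_back, blocks_, sizes_) are placeholders invented here to illustrate the same approach: take() is the best-fit lookup used by allocate(), and give_back() is the merge-and-insert path used by release(). In the real patch a failed lookup falls back to device_->allocate_llvm_runtime_memory_jit(params) instead of returning nullptr.

#include <cstddef>
#include <cstdint>
#include <map>
#include <set>
#include <utility>

// Standalone illustration of the coalescing free list (placeholder names).
class FreeListSketch {
 public:
  // Best-fit lookup: returns a cached block of at least `size` bytes,
  // or nullptr if nothing in the cache is large enough.
  std::uint8_t *take(std::size_t size) {
    auto it = blocks_.lower_bound(std::make_pair(size, nullptr));
    if (it == blocks_.end()) {
      return nullptr;  // caller would fall back to a real device allocation
    }
    std::uint8_t *ptr = it->second;
    std::size_t remaining = it->first - size;
    blocks_.erase(it);
    sizes_.erase(ptr);
    if (remaining > 0) {
      // Keep the unused tail of the block as a smaller free block.
      blocks_.insert(std::make_pair(remaining, ptr + size));
      sizes_[ptr + size] = remaining;
    }
    return ptr;
  }

  // Return a block to the cache, merging it with adjacent free blocks.
  void give_back(std::uint8_t *ptr, std::size_t size) {
    // Right neighbor: a free block that starts exactly at ptr + size.
    auto right = sizes_.find(ptr + size);
    if (right != sizes_.end()) {
      blocks_.erase(std::make_pair(right->second, right->first));
      size += right->second;
      sizes_.erase(right);
    }
    // Left neighbor: a free block that ends exactly at ptr.
    auto left = sizes_.lower_bound(ptr);
    if (left != sizes_.begin()) {
      --left;
      if (left->first + left->second == ptr) {
        blocks_.erase(std::make_pair(left->second, left->first));
        ptr = left->first;
        size += left->second;
        sizes_.erase(left);
      }
    }
    blocks_.insert(std::make_pair(size, ptr));
    sizes_[ptr] = size;
  }

 private:
  // Free blocks ordered by (size, start address): lower_bound() is best fit.
  std::set<std::pair<std::size_t, std::uint8_t *>> blocks_;
  // Start address -> size of every free block, used for the neighbor checks.
  std::map<std::uint8_t *, std::size_t> sizes_;
};

Keeping the set keyed by (size, address) is what lets lower_bound() find the smallest block that still fits, while the address-keyed map makes the left/right neighbor checks logarithmic instead of a scan over all cached blocks.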
From b2cfe428fa6405cfd455e4a048c1a49615f4c2d1 Mon Sep 17 00:00:00 2001
From: jim19930609
Date: Mon, 13 Mar 2023 12:54:40 +0800
Subject: [PATCH 2/4] update

---
 taichi/rhi/cuda/CMakeLists.txt                    |  1 -
 taichi/rhi/cuda/cuda_device.cpp                   |  4 ++--
 taichi/rhi/cuda/cuda_device.h                     |  4 ++--
 taichi/rhi/llvm/CMakeLists.txt                    |  1 +
 .../allocator.cpp}                                | 13 +++++--------
 .../cuda_caching_allocator.h => llvm/allocator.h} |  6 ++----
 tests/python/test_ndarray.py                      |  2 +-
 7 files changed, 13 insertions(+), 18 deletions(-)
 rename taichi/rhi/{cuda/cuda_caching_allocator.cpp => llvm/allocator.cpp} (82%)
 rename taichi/rhi/{cuda/cuda_caching_allocator.h => llvm/allocator.h} (83%)

diff --git a/taichi/rhi/cuda/CMakeLists.txt b/taichi/rhi/cuda/CMakeLists.txt
index 1287602a2172b..0e1fdca52ff16 100644
--- a/taichi/rhi/cuda/CMakeLists.txt
+++ b/taichi/rhi/cuda/CMakeLists.txt
@@ -5,7 +5,6 @@ add_library(${CUDA_RHI})
 target_sources(${CUDA_RHI}
   PRIVATE
     cuda_device.cpp
-    cuda_caching_allocator.cpp
    cuda_context.cpp
     cuda_driver.cpp
     cuda_profiler.cpp
diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp
index 07281ee442db5..0a59f5bbf84ab 100644
--- a/taichi/rhi/cuda/cuda_device.cpp
+++ b/taichi/rhi/cuda/cuda_device.cpp
@@ -44,7 +44,7 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   if (params.use_cached) {
     if (caching_allocator_ == nullptr) {
-      caching_allocator_ = std::make_unique<CudaCachingAllocator>(this);
+      caching_allocator_ = std::make_unique<CachingAllocator>(this);
     }
     info.ptr = caching_allocator_->allocate(params);
     CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size);
@@ -72,7 +72,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
   TI_ASSERT(!info.is_imported);
   if (info.use_cached) {
     if (caching_allocator_ == nullptr) {
-      TI_ERROR("the CudaCachingAllocator is not initialized");
+      TI_ERROR("the CachingAllocator is not initialized");
     }
     caching_allocator_->release(info.size, (uint64_t *)info.ptr);
   } else if (!info.use_preallocated) {
diff --git a/taichi/rhi/cuda/cuda_device.h b/taichi/rhi/cuda/cuda_device.h
index 59e039ac84a50..c1e0185af879b 100644
--- a/taichi/rhi/cuda/cuda_device.h
+++ b/taichi/rhi/cuda/cuda_device.h
@@ -4,7 +4,7 @@
 
 #include "taichi/common/core.h"
 #include "taichi/rhi/cuda/cuda_driver.h"
-#include "taichi/rhi/cuda/cuda_caching_allocator.h"
+#include "taichi/rhi/llvm/allocator.h"
 #include "taichi/rhi/cuda/cuda_context.h"
 #include "taichi/rhi/llvm/llvm_device.h"
 
@@ -136,7 +136,7 @@ class CudaDevice : public LlvmDevice {
       TI_ERROR("invalid DeviceAllocation");
     }
   }
-  std::unique_ptr<CudaCachingAllocator> caching_allocator_{nullptr};
+  std::unique_ptr<CachingAllocator> caching_allocator_{nullptr};
 };
 
 }  // namespace cuda
diff --git a/taichi/rhi/llvm/CMakeLists.txt b/taichi/rhi/llvm/CMakeLists.txt
index 64bf1be4dbb1f..588e9707a8494 100644
--- a/taichi/rhi/llvm/CMakeLists.txt
+++ b/taichi/rhi/llvm/CMakeLists.txt
@@ -5,6 +5,7 @@ add_library(${LLVM_RHI})
 target_sources(${LLVM_RHI}
   PRIVATE
     llvm_device.cpp
+    allocator.cpp
   )
 
 target_include_directories(${LLVM_RHI}
diff --git a/taichi/rhi/cuda/cuda_caching_allocator.cpp b/taichi/rhi/llvm/allocator.cpp
similarity index 82%
rename from taichi/rhi/cuda/cuda_caching_allocator.cpp
rename to taichi/rhi/llvm/allocator.cpp
index 3e6c9b763d154..89091871afbe8 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.cpp
+++ b/taichi/rhi/llvm/allocator.cpp
@@ -1,14 +1,12 @@
-#include "taichi/rhi/cuda/cuda_caching_allocator.h"
+#include "taichi/rhi/llvm/allocator.h"
 #include "taichi/runtime/llvm/snode_tree_buffer_manager.h"
 
 namespace taichi::lang {
-namespace cuda {
 
-CudaCachingAllocator::CudaCachingAllocator(LlvmDevice *device)
-    : device_(device) {
+CachingAllocator::CachingAllocator(LlvmDevice *device) : device_(device) {
 }
 
-void CudaCachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
+void CachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
   // merge with right block
   if (ptr_map_[ptr + size]) {
     std::size_t tmp = ptr_map_[ptr + size];
@@ -31,7 +29,7 @@ void CudaCachingAllocator::merge_and_insert(uint8_t *ptr, std::size_t size) {
   ptr_map_[ptr] = size;
 }
 
-uint64_t *CudaCachingAllocator::allocate(
+uint64_t *CachingAllocator::allocate(
     const LlvmDevice::LlvmRuntimeAllocParams &params) {
   uint64_t *ret{nullptr};
   auto size_aligned = taichi::iroundup(params.size, taichi_page_size);
@@ -57,9 +55,8 @@ uint64_t *CudaCachingAllocator::allocate(
   return ret;
 }
 
-void CudaCachingAllocator::release(size_t sz, uint64_t *ptr) {
+void CachingAllocator::release(size_t sz, uint64_t *ptr) {
   merge_and_insert(reinterpret_cast<uint8_t *>(ptr), sz);
 }
 
-}  // namespace cuda
 }  // namespace taichi::lang
diff --git a/taichi/rhi/cuda/cuda_caching_allocator.h b/taichi/rhi/llvm/allocator.h
similarity index 83%
rename from taichi/rhi/cuda/cuda_caching_allocator.h
rename to taichi/rhi/llvm/allocator.h
index b1f472e85fc89..9dd1263913c91 100644
--- a/taichi/rhi/cuda/cuda_caching_allocator.h
+++ b/taichi/rhi/llvm/allocator.h
@@ -9,11 +9,10 @@
 #include <set>
 
 namespace taichi::lang {
-namespace cuda {
 
-class CudaCachingAllocator {
+class CachingAllocator {
  public:
-  explicit CudaCachingAllocator(LlvmDevice *device);
+  explicit CachingAllocator(LlvmDevice *device);
 
   uint64_t *allocate(const LlvmDevice::LlvmRuntimeAllocParams &params);
   void release(size_t sz, uint64_t *ptr);
@@ -26,5 +25,4 @@ class CudaCachingAllocator {
   LlvmDevice *device_{nullptr};
 };
 
-}  // namespace cuda
 }  // namespace taichi::lang
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index e6ba13be0eed1..a0e556f4c52eb 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -273,7 +273,7 @@ def test_ndarray_deepcopy():
 
 
 @test_utils.test(arch=[ti.cuda], ndarray_use_cached_allocator=True)
-def test_ndarray_cuda_caching_allocator():
+def test_ndarray_caching_allocator():
     n = 8
     a = ti.ndarray(ti.i32, shape=(n))
     a.fill(2)

From da173c82403a66254703c679045abd30cf36cccd Mon Sep 17 00:00:00 2001
From: Zhanlue Yang
Date: Tue, 14 Mar 2023 09:50:10 +0800
Subject: [PATCH 3/4] Update test_ndarray.py

---
 tests/python/test_ndarray.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index a0e556f4c52eb..142d0b69dd31f 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -618,6 +618,7 @@ def test_ndarray_grouped():
     _test_ndarray_grouped()
 
 
+
 @test_utils.test(arch=[ti.cpu, ti.cuda], real_matrix_scalarize=False)
 def test_ndarray_grouped_real_matrix():
     _test_ndarray_grouped()

From 00bcb59d315882c2378a0f6c116a232dfa43873f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 14 Mar 2023 01:51:35 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/python/test_ndarray.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index 142d0b69dd31f..a0e556f4c52eb 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -618,7 +618,6 @@ def test_ndarray_grouped():
     _test_ndarray_grouped()
 
 
-
 @test_utils.test(arch=[ti.cpu, ti.cuda], real_matrix_scalarize=False)
 def test_ndarray_grouped_real_matrix():
     _test_ndarray_grouped()
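Closing note: the practical effect of the coalescing introduced in PATCH 1/4 (and exercised through the renamed test_ndarray_caching_allocator test) is that two adjacent freed blocks can later satisfy one larger request straight from the cache. The tiny driver below reuses the placeholder FreeListSketch class from the note after PATCH 1/4 (again a sketch, not Taichi code), with an ordinary host buffer standing in for device memory:

#include <cassert>
#include <cstdint>

// Reuses the FreeListSketch class defined in the note after PATCH 1/4.
int main() {
  static std::uint8_t backing[8192];  // stand-in for two 4 KiB device pages
  FreeListSketch cache;

  // Release two adjacent 4 KiB blocks; the second merges with the first.
  cache.give_back(backing, 4096);
  cache.give_back(backing + 4096, 4096);

  // Because the blocks were coalesced, an 8 KiB request is served from the
  // cache instead of requiring a fresh device allocation.
  assert(cache.take(8192) == backing);
  return 0;
}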