Skip to content

Commit

Permalink
[REVIEW] Merge free lists in pool_memory_resource to defragment before growing from upstream (#532)
Browse files Browse the repository at this point in the history

* Add list merging before growing pool.

* Fix list merging

* Changelog for #530

* Correct CL#

* Hoist list size() to before it may change.

* Add iterator comment.

* Add POOL_MR_PTDS_TEST

* Update CI build readme

* Remove invalid optimization for case when there is only one free list.

* Fix comment

* Remove extra log_summary_trace()

* Update version to 0.16 in ci local README

Co-authored-by: Keith Kraus <[email protected]>
  • Loading branch information
harrism and Keith Kraus authored Sep 14, 2020
1 parent e3c55f7 commit 6a3a62f
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 56 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

- PR #474 Use CMake find_package(CUDAToolkit)
- PR #477 Just use `None` for `strides` in `DeviceBuffer`
- PR #528 Add `maximum_pool_size` parameter to reinitialize API
- PR #528 Add maximum_pool_size parameter to reinitialize API
- PR #532 Merge free lists in pool_memory_resource to defragment before growing from upstream
- PR #537 Add CMake option to disable deprecation warnings
- PR #541 Refine CMakeLists.txt to make it easy to import by external projects
- PR #538 Upgrade CUB and Thrust to the latest commits
Expand Down
4 changes: 2 additions & 2 deletions ci/local/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ where:
```

Example Usage:
`bash build.sh -r ~/rapids/rmm -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6`
`bash build.sh -r ~/rapids/rmm -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7`

For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page.
For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page.

Style Check:
```bash
Expand Down
4 changes: 1 addition & 3 deletions cmake/Modules/ConfigureGoogleTest.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ set(GTEST_CMAKE_ARGS "")
# " -Dgtest_build_samples=ON"
# " -DCMAKE_VERBOSE_MAKEFILE=ON")

# Poor's man workaround for
# https://github.com/google/googletest/issues/854
# Workaround https://github.com/google/googletest/issues/854
if(CMAKE_CXX_COMPILER MATCHES ".*clang")
list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-fPIC")
list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-fPIC")
Expand Down Expand Up @@ -53,4 +52,3 @@ message(STATUS "GoogleTest installed here: " ${GTEST_ROOT}/install)
set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include")
set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib")
set(GTEST_FOUND TRUE)

116 changes: 78 additions & 38 deletions include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,25 +203,27 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
*/
virtual void* do_allocate(std::size_t bytes, cudaStream_t stream) override
{
RMM_LOG_TRACE("[A][{}B][stream {:p}]", bytes, static_cast<void*>(stream));
RMM_LOG_TRACE("[A][stream {:p}][{}B]", static_cast<void*>(stream), bytes);

if (bytes <= 0) return nullptr;

lock_guard lock(mtx_);

auto stream_event = get_event(stream);
bytes = rmm::detail::align_up(bytes, allocation_alignment);

bytes = rmm::detail::align_up(bytes, allocation_alignment);
RMM_EXPECTS(bytes <= this->underlying().get_maximum_allocation_size(),
rmm::bad_alloc,
"Maximum allocation size exceeded");
auto const b = this->underlying().get_block(bytes, stream_event);
auto split = this->underlying().allocate_from_block(b, bytes);
if (split.remainder.is_valid()) stream_free_blocks_[stream_event].insert(split.remainder);
RMM_LOG_TRACE("[A][{}B][{:p}][stream {:p}]",
RMM_LOG_TRACE("[A][stream {:p}][{}B][{:p}]",
static_cast<void*>(stream_event.stream),
bytes,
static_cast<void*>(split.allocated_pointer),
static_cast<void*>(stream));
static_cast<void*>(split.allocated_pointer));

log_summary_trace(stream_event.stream);
log_summary_trace();

return split.allocated_pointer;
}
Expand All @@ -236,10 +238,11 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
virtual void do_deallocate(void* p, std::size_t bytes, cudaStream_t stream) override
{
lock_guard lock(mtx_);
RMM_LOG_TRACE("[D][{}B][{:p}][stream {:p}]", bytes, p, static_cast<void*>(stream));
auto stream_event = get_event(stream);
bytes = rmm::detail::align_up(bytes, allocation_alignment);
auto const b = this->underlying().free_block(p, bytes);
RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", static_cast<void*>(stream_event.stream), bytes, p);

bytes = rmm::detail::align_up(bytes, allocation_alignment);
auto const b = this->underlying().free_block(p, bytes);

// TODO: cudaEventRecord has significant overhead on deallocations. For the non-PTDS case
// we may be able to delay recording the event in some situations. But using events rather than
Expand All @@ -248,7 +251,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_

stream_free_blocks_[stream_event].insert(b);

log_summary_trace(stream_event.stream);
log_summary_trace();
}

private:
Expand Down Expand Up @@ -324,18 +327,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
if (b.is_valid()) { return b; }
}

// Otherwise try to find a block associated with another stream
block_type b = get_block_from_other_stream(size, stream_event);
if (b.is_valid()) { return b; }
// note this creates a free list if it doesn't already exist, hence we get num lists first above
free_list& blocks =
(iter != stream_free_blocks_.end()) ? iter->second : stream_free_blocks_[stream_event];

// no larger blocks available on other streams, so grow the pool and create a block
// Try to find an existing block in another stream
{
block_type const b = get_block_from_other_stream(size, stream_event, blocks, false);
if (b.is_valid()) return b;
}

log_summary_trace(stream_event.stream);
// no large enough blocks available on other streams, so sync and merge until we find one
{
block_type const b = get_block_from_other_stream(size, stream_event, blocks, true);
if (b.is_valid()) return b;
}

// avoid searching for this stream's list again
free_list& blocks =
(iter != stream_free_blocks_.end()) ? iter->second : stream_free_blocks_[stream_event];
log_summary_trace();

// no large enough blocks available after merging, so grow the pool
return this->underlying().expand_pool(size, blocks, stream_event.stream);
}

Expand All @@ -356,29 +366,46 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
* @return A block with non-null pointer and size >= `size`, or a nullptr block if none is
* available in `blocks`.
*/
block_type get_block_from_other_stream(size_t size, stream_event_pair stream_event)
block_type get_block_from_other_stream(size_t size,
stream_event_pair stream_event,
free_list& blocks,
bool merge_first)
{
// nothing in this stream's free list, look for one on another stream
for (auto s = stream_free_blocks_.begin(); s != stream_free_blocks_.end(); ++s) {
auto blocks_event = s->first;
if (blocks_event.event != stream_event.event) {
auto blocks = s->second;

block_type const b = blocks.get_block(size); // get the best fit block
for (auto it = stream_free_blocks_.begin(), next_it = it; it != stream_free_blocks_.end();
it = next_it) {
++next_it; // Points to element after `it` to allow erasing `it` in the loop body
auto other_event = it->first.event;
if (other_event != stream_event.event) {
auto other_blocks = it->second;

block_type const b = [&]() {
if (merge_first) {
merge_lists(stream_event, blocks, other_event, std::move(other_blocks));

RMM_LOG_INFO("[A][Stream {:p}][{}B][Merged stream {:p}]",
static_cast<void*>(stream_event.stream),
size,
static_cast<void*>(it->first.stream));

stream_free_blocks_.erase(it);

return blocks.get_block(size); // get the best fit block in merged lists
} else {
return other_blocks.get_block(size); // get the best fit block in other list
}
}();

if (b.is_valid()) {
// Since we found a block associated with a different stream, we have to insert a wait
// on the stream's associated event into the allocating stream.
RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, blocks_event.event, 0));

// Move all the blocks to the requesting stream, since it has waited on them
stream_free_blocks_[stream_event].insert(std::move(blocks));
stream_free_blocks_.erase(s);

RMM_LOG_INFO("[A][{}B][Stream {:p} steal from stream {:p}]",
size,
RMM_LOG_INFO((merge_first) ? "[A][Stream {:p}][{}B][Found after merging stream {:p}]"
: "[A][Stream {:p}][{}B][Taken from stream {:p}]",
static_cast<void*>(stream_event.stream),
static_cast<void*>(s->first.stream));
size,
static_cast<void*>(it->first.stream));

if (not merge_first) {
merge_lists(stream_event, blocks, other_event, std::move(other_blocks));
stream_free_blocks_.erase(it);
}

return b;
}
Expand All @@ -387,6 +414,19 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
return block_type{};
}

void merge_lists(stream_event_pair stream_event,
free_list& blocks,
cudaEvent_t other_event,
free_list&& other_blocks)
{
// Since we found a block associated with a different stream, we have to insert a wait
// on the stream's associated event into the allocating stream.
RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, other_event, 0));

// Merge the two free lists
blocks.insert(std::move(other_blocks));
}

/**
* @brief Clear free lists and events
*
Expand All @@ -405,7 +445,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
stream_free_blocks_.clear();
}

void log_summary_trace(cudaStream_t stream)
void log_summary_trace()
{
#if (SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_TRACE)
std::size_t num_blocks{0};
Expand Down
30 changes: 18 additions & 12 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,17 @@ set(DEVICE_MR_TEST_SRC

ConfigureTest(DEVICE_MR_TEST "${DEVICE_MR_TEST_SRC}")

###################################################################################################
###################################################################################################
# - device mr per-thread default stream tests------------------------------------------------------

set(DEVICE_MR_PTDS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp")

ConfigureTest(DEVICE_MR_PTDS_TEST "${DEVICE_MR_PTDS_TEST_SRC}")
target_compile_definitions(DEVICE_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)


###################################################################################################
###################################################################################################
# - pool mr tests --------------------------------------------------------------------------------
Expand All @@ -75,6 +86,13 @@ set(POOL_MR_TEST_SRC

ConfigureTest(POOL_MR_TEST "${POOL_MR_TEST_SRC}")

set(POOL_MR_PTDS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/mr/device/pool_mr_tests.cpp")

ConfigureTest(POOL_MR_PTDS_TEST "${POOL_MR_PTDS_TEST_SRC}")
target_compile_definitions(POOL_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)


###################################################################################################
###################################################################################################
# - thrust allocator tests --------------------------------------------------------------------------------
Expand All @@ -84,18 +102,6 @@ set(THRUST_ALLOCATOR_TEST_SRC

ConfigureTest(THRUST_ALLOCATOR_TEST "${THRUST_ALLOCATOR_TEST_SRC}")

###################################################################################################
###################################################################################################
# - device mr per-thread default stream tests------------------------------------------------------


set(DEVICE_MR_PTDS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp")

ConfigureTest(DEVICE_MR_PTDS_TEST "${DEVICE_MR_PTDS_TEST_SRC}")
target_compile_definitions(DEVICE_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)


###################################################################################################
###################################################################################################
# - host mr tests ----------------------------------------------------------------------------------
Expand Down

0 comments on commit 6a3a62f

Please sign in to comment.