Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add per-thread-default stream support to pool_memory_resource using thread-local CUDA events #425

Merged
merged 33 commits into from
Jul 10, 2020
Merged
Changes from 4 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e2be0ba
Use per-thread events rather than per-block events
harrism Jun 22, 2020
5514703
Disable event timing; cleanup
harrism Jun 23, 2020
93bebd7
Add thread-local unique_event inner class, streamline PTDS and non-PT…
harrism Jun 24, 2020
2a18521
TODO comment
harrism Jun 24, 2020
78467d5
Simplify event wrapper class and remove ids
harrism Jun 24, 2020
4110fa0
Changelog for #425
harrism Jun 24, 2020
29d65f9
get_event looks for 0 or cudaStreamPerThread
harrism Jun 25, 2020
e53a59e
Synchronize on event destruction. Clean up docs.
harrism Jun 25, 2020
3d178ff
remove_event->destroy_event
harrism Jun 25, 2020
889bf55
Be consistent with the default location for blocks
harrism Jun 25, 2020
0d314b9
Refactor mr_tests.cpp to enable creating multithreaded tests that sha…
harrism Jun 26, 2020
b1620d9
Changes missed in previous commit
harrism Jun 26, 2020
07af3c8
Add multithreaded tests (some still failing)
harrism Jun 26, 2020
714e924
Fix failing multithreaded tests by adding specializations for test co…
harrism Jun 29, 2020
c14d1af
Make pool_memory_resource thread safe and fix PTDS destroy_event bug
harrism Jun 30, 2020
2ffbcaa
Add DEVICE_MR_PTDS_TEST
harrism Jun 30, 2020
ccea946
include mutex always
harrism Jun 30, 2020
af3b9c7
Merge branch 'branch-0.15' into fea-ptds-events
harrism Jun 30, 2020
1d039ff
Fix use-after-free race with cuda_events.
harrism Jul 1, 2020
22c77cb
Fix memory leak in test
harrism Jul 1, 2020
a6390a4
Add tests that alloc / free on different threads.
harrism Jul 1, 2020
6fd4544
better documentation of cuda_event
harrism Jul 1, 2020
1d24547
Document that this is now thread-safe and PTDS-compatible
harrism Jul 1, 2020
0d7cb97
Improve changelog
harrism Jul 1, 2020
fc133ae
Merge branch 'branch-0.15' into fea-ptds-events
harrism Jul 1, 2020
4af5652
Fix gcc7 compilation failure
harrism Jul 2, 2020
6e9bac0
Address review suggestions.
harrism Jul 2, 2020
26c2909
Only add event to ptds_events_ once!
harrism Jul 2, 2020
01385bc
Merge branch 'multi-thread-replay' into fea-ptds-events
harrism Jul 2, 2020
3df09f9
Fix streams passed to multithreaded test
harrism Jul 3, 2020
13486e7
Update copyright
harrism Jul 10, 2020
0a57993
Combine streams and events in a struct.
harrism Jul 10, 2020
94cae03
Merge branch 'branch-0.15' into fea-ptds-events
harrism Jul 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 128 additions & 38 deletions include/rmm/mr/device/pool_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@
*/
#pragma once

#include <cstdint>
harrism marked this conversation as resolved.
Show resolved Hide resolved
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/detail/free_list.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cuda_runtime_api.h>

#include <algorithm>
#include <atomic>
#include <cassert>
#include <exception>
#include <iostream>
Expand All @@ -30,6 +32,8 @@
#include <mutex>
harrism marked this conversation as resolved.
Show resolved Hide resolved
#include <numeric>
#include <set>
#include <thread>
harrism marked this conversation as resolved.
Show resolved Hide resolved
#include <unordered_map>
#include <vector>

namespace rmm {
Expand Down Expand Up @@ -76,7 +80,7 @@ class pool_memory_resource final : public device_memory_resource {
if (maximum_pool_size == default_maximum_size) maximum_pool_size_ = props.totalGlobalMem;

// Allocate initial block
stream_free_blocks_[0].insert(block_from_upstream(initial_pool_size, 0));
stream_free_blocks_[get_event(0)].insert(block_from_upstream(initial_pool_size, 0));
}

/**
Expand Down Expand Up @@ -108,6 +112,7 @@ class pool_memory_resource final : public device_memory_resource {
Upstream* get_upstream() const noexcept { return upstream_mr_; }

private:
using id_type = uint32_t;
using block = rmm::mr::detail::block;
using free_list = rmm::mr::detail::free_list<>;

Expand All @@ -124,59 +129,60 @@ class pool_memory_resource final : public device_memory_resource {
* available in `blocks`.
*/
block block_from_stream(free_list& blocks,
cudaStream_t blocks_stream,
cudaEvent_t blocks_event,
size_t size,
cudaStream_t stream)
cudaStream_t stream,
cudaEvent_t stream_event)
{
block const b = blocks.best_fit(size); // get the best fit block

// If we found a block associated with a different stream,
// we have to synchronize the stream in order to use it
if ((blocks_stream != stream) && b.is_valid()) {
cudaError_t result = cudaStreamSynchronize(blocks_stream);

RMM_EXPECTS((result == cudaSuccess || // stream synced
result == cudaErrorInvalidResourceHandle), // stream deleted
rmm::bad_alloc,
"cudaStreamSynchronize failure");

// Now that this stream is synced, insert all other blocks into this stream's list
// Note: This could cause thrashing between two streams. On the other hand, it reduces
// fragmentation by coalescing.
stream_free_blocks_[stream].insert(blocks.begin(), blocks.end());

// remove this stream from the freelist
stream_free_blocks_.erase(blocks_stream);
if (b.is_valid()) {
// If we found a block associated with a different stream, we have to insert a wait on the
// stream's associated event into the allocating stream.
if (stream_event != blocks_event) {
stream_free_blocks_[stream_event].insert(blocks.begin(), blocks.end());
stream_free_blocks_.erase(blocks_event);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels like there is a potential optimization here. We're merging two sorted std::lists and coalescing adjacent blocks. If we ignored the fact that we're coalescing blocks, you could just use std::list::merge .

The current implementation is O(m*n) as it requires doing a linear search of the destination list for every element in the source list. It seems like we should be able to make that be O(m + n).

At the very least, I think we should add a free_list::merge(free_list&& other) function and use that here. Then we can explore doing something more optimal that exploits the fact that both lists are already sorted by pointer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One option would be to just use std::list::merge and ignore coalescing, and then do a second pass through the merged list and coalesce any adjacent blocks. But that operation likely can be fused with a custom merge algorithm.

harrism marked this conversation as resolved.
Show resolved Hide resolved

// TODO: could eliminate this ifdef and have the same behavior for PTDS and non-PTDS
// But the cudaEventRecord() on every free_block reduces performance significantly
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
RMM_CUDA_TRY(cudaStreamWaitEvent(stream, blocks_event, 0));
remove_event(blocks_event); // only removes non-default-stream-events
#else
RMM_CUDA_TRY(cudaStreamSynchronize(event_streams_[blocks_event]));
#endif
}
}

return b;
}

/**
* @brief Find an available block in the pool of at least `size` bytes, for use on `stream`.
*
* Attempts to find a free block that was last used on `stream` to avoid synchronization. If none
* is available, it finds a block last used on another stream. In this case, the stream associated
* with the found block is synchronized to ensure all asynchronous work on the memory is finished
* before it is used on `stream`.
* Attempts to find a free block that was last used on `stream` to avoid synchronization. If
* none is available, it finds a block last used on another stream. In this case, the stream
* associated with the found block is synchronized to ensure all asynchronous work on the memory
* is finished before it is used on `stream`.
*
* @param size The size of the requested allocation, in bytes.
* @param stream The stream on which the allocation will be used.
* @return block A block with non-null pointer and size >= `size`.
*/
block available_larger_block(size_t size, cudaStream_t stream)
block available_larger_block(size_t size, cudaStream_t stream, cudaEvent_t event)
{
// Try to find a larger block in free list for the same stream
auto iter = stream_free_blocks_.find(stream);
auto iter = stream_free_blocks_.find(event);
if (iter != stream_free_blocks_.end()) {
block b = block_from_stream(iter->second, stream, size, stream);
block b = block_from_stream(iter->second, event, size, stream, event);
if (b.is_valid()) return b;
}

// nothing in this stream's free list, look for one on another stream
auto s = stream_free_blocks_.begin();
while (s != stream_free_blocks_.end()) {
if (s->first != stream) {
block b = block_from_stream(s->second, s->first, size, stream);
if (s->first != event) {
block b = block_from_stream(s->second, s->first, size, stream, event);
if (b.is_valid()) return b;
}
++s;
Expand All @@ -198,13 +204,13 @@ class pool_memory_resource final : public device_memory_resource {
* @param stream The stream on which the allocation will be used.
* @return void* The pointer to the allocated memory.
*/
void* allocate_from_block(block const& b, size_t size, cudaStream_t stream)
void* allocate_from_block(block const& b, size_t size, cudaEvent_t event)
{
block const alloc{b.pointer(), size, b.is_head()};

if (b.size() > size) {
block rest{b.pointer() + size, b.size() - size, false};
stream_free_blocks_[stream].insert(rest);
stream_free_blocks_[event].insert(rest);
}

allocated_blocks_.insert(alloc);
Expand All @@ -224,9 +230,19 @@ class pool_memory_resource final : public device_memory_resource {

auto const i = allocated_blocks_.find(static_cast<char*>(p));
assert(i != allocated_blocks_.end());
assert(i->size == rmm::detail::align_up(size, allocation_alignment));
assert(i->size() == rmm::detail::align_up(size, allocation_alignment));

cudaEvent_t event = get_event(stream);

stream_free_blocks_[stream].insert(*i);
// TODO: cudaEventRecord has significant overhead on deallocations, however it could mean less
// synchronization. So we need to test in real non-PTDS applications that have multiple streams
// whether or not the overhead is worth it
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
auto result = cudaEventRecord(event, stream);
assert(cudaSuccess == result);
#endif

stream_free_blocks_[event].insert(*i);
allocated_blocks_.erase(i);
}

Expand Down Expand Up @@ -287,6 +303,7 @@ class pool_memory_resource final : public device_memory_resource {
for (auto b : upstream_blocks_)
upstream_mr_->deallocate(b.pointer(), b.size());
upstream_blocks_.clear();
// TODO empty free lists and allocated blocks
current_pool_size_ = 0;
}

Expand All @@ -306,7 +323,7 @@ class pool_memory_resource final : public device_memory_resource {

for (auto h : upstream_blocks_) {
h.print();
upstream_total += h.size;
upstream_total += h.size();
}
std::cout << "total upstream: " << upstream_total << " B\n";

Expand Down Expand Up @@ -338,9 +355,10 @@ class pool_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cudaStream_t stream) override
{
if (bytes <= 0) return nullptr;
bytes = rmm::detail::align_up(bytes, allocation_alignment);
block const b = available_larger_block(bytes, stream);
return allocate_from_block(b, bytes, stream);
bytes = rmm::detail::align_up(bytes, allocation_alignment);
cudaEvent_t event = get_event(stream);
block const b = available_larger_block(bytes, stream, event);
return allocate_from_block(b, bytes, event);
}

/**
Expand Down Expand Up @@ -378,12 +396,84 @@ class pool_memory_resource final : public device_memory_resource {

// map of [stream_id, free_list] pairs
// stream stream_id must be synced before allocating from this list to a different stream
std::map<cudaStream_t, free_list> stream_free_blocks_;
// std::map<cudaStream_t, free_list> stream_free_blocks_;
std::map<cudaEvent_t, free_list> stream_free_blocks_;

std::set<block, rmm::mr::detail::compare_blocks<block>> allocated_blocks_;

// blocks allocated from upstream: so they can be easily freed
std::vector<block> upstream_blocks_;

#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
/**
 * @brief RAII wrapper owning one thread-local CUDA event plus a process-unique id.
 *
 * On construction the id is registered in the externally-owned `ids` set and a
 * timing-disabled CUDA event is created; on destruction both are undone. The set
 * reference must outlive this object (it is owned by the enclosing resource).
 */
struct unique_event {
  /**
   * @brief Create a timing-disabled event and register a fresh id in `ids`.
   *
   * @param ids Set of live event ids; held by reference, must outlive `*this`.
   */
  explicit unique_event(std::set<id_type>& ids) : id(next_id()), ids_(ids)
  {
    ids_.insert(id);
    auto const result = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    assert(cudaSuccess == result);
    (void)result;  // avoid unused-variable warning when NDEBUG disables assert
  }

  // Copy or move would double-destroy the CUDA event and leave a stale id in
  // ids_, so ownership is strictly non-transferable.
  unique_event(unique_event const&) = delete;
  unique_event& operator=(unique_event const&) = delete;
  unique_event(unique_event&&)                 = delete;
  unique_event& operator=(unique_event&&) = delete;

  /// Deregister the id and destroy the owned CUDA event.
  ~unique_event()
  {
    ids_.erase(id);
    auto const result = cudaEventDestroy(event);
    assert(cudaSuccess == result);
    (void)result;  // avoid unused-variable warning when NDEBUG disables assert
  }

  /// @return The process-unique id assigned to this event.
  id_type get_id() const noexcept { return id; }
  /// @return The owned CUDA event handle.
  cudaEvent_t get_event() const noexcept { return event; }

 private:
  /// Monotonically increasing counter; atomic so concurrent thread-local
  /// construction on multiple threads yields distinct ids.
  static id_type next_id()
  {
    static std::atomic<id_type> s_id{};
    return ++s_id;
  }

  id_type id;               // unique id, assigned first (declaration order)
  cudaEvent_t event;        // owned CUDA event, created in the ctor body
  std::set<id_type>& ids_;  // reference to external set owned by the pool resource
};
#endif

cudaEvent_t get_event(cudaStream_t stream)
harrism marked this conversation as resolved.
Show resolved Hide resolved
{
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
if (stream == 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to do something other than check against 0 for the default stream. Someone could pass cudaStreamPerThread explicitly, which is different from 0.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, thought about that. Need to check for either 0 or cudaStreamPerThread, good point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

thread_local unique_event e{ids_};
return e.get_event();
} else
#endif
{
auto iter = stream_events_.find(stream);
if (iter == stream_events_.end()) {
// create event
cudaEvent_t event{};
auto result = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
assert(cudaSuccess == result);
stream_events_[stream] = event;
event_streams_[event] = stream;
return event;
} else {
return iter->second;
}
}
}

/**
 * @brief Destroy `event` and drop its stream bookkeeping, if it is tracked.
 *
 * Only events created for explicit (non-default) streams appear in
 * `event_streams_`; untracked (default-stream/thread-local) events are left
 * untouched.
 *
 * @param event The CUDA event to destroy and untrack.
 */
void remove_event(cudaEvent_t event)
{
  auto const found = event_streams_.find(event);
  if (found == event_streams_.end()) { return; }  // not a tracked non-default-stream event

  auto const owning_stream = found->second;
  RMM_CUDA_TRY(cudaEventDestroy(event));
  // Remove both directions of the stream <-> event mapping.
  event_streams_.erase(found);
  stream_events_.erase(owning_stream);
}

#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
std::set<id_type> ids_;
#endif
std::unordered_map<cudaStream_t, cudaEvent_t> stream_events_;
std::unordered_map<cudaEvent_t, cudaStream_t> event_streams_;
};

} // namespace mr
Expand Down