diff --git a/CHANGELOG.md b/CHANGELOG.md index 67c4e765c..82fa165c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,8 @@ - PR #375 Support out-of-band buffers in Python pickling - PR #391 Add `get_default_resource_type` -- PR #396 Remove deprecated RMM APIs. +- PR #396 Remove deprecated RMM APIs +- PR #425 Add CUDA per-thread default stream support and thread safety to `pool_memory_resource` ## Improvements @@ -29,7 +30,7 @@ - PR #383 Explicitly require NumPy - PR #398 Fix missing head flag in merge_blocks (pool_memory_resource) and improve block class - PR #403 Mark Cython `memory_resource_wrappers` `extern` as `nogil` -- PR #406 Sets Google Benchmark to a fixed version, v1.5.1. +- PR #406 Sets Google Benchmark to a fixed version, v1.5.1 # RMM 0.14.0 (Date TBD) diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 1e5e8000c..954ec2b76 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -58,7 +58,6 @@ allocation remove_at(allocation_vector& allocs, std::size_t index) // nested MR type names can get long... 
using cuda_mr = rmm::mr::cuda_memory_resource; using pool_mr = rmm::mr::pool_memory_resource; -using safe_pool_mr = rmm::mr::thread_safe_resource_adaptor; using fixed_multisize_mr = rmm::mr::fixed_multisize_memory_resource; using hybrid_mr = rmm::mr::hybrid_memory_resource; using safe_hybrid_mr = rmm::mr::thread_safe_resource_adaptor; @@ -184,9 +183,9 @@ resource_wrapper::resource_wrapper() } template <> -resource_wrapper::resource_wrapper() +resource_wrapper::resource_wrapper() { - mr = new rmm::mr::thread_safe_resource_adaptor(new pool_mr(new cuda_mr())); + mr = new pool_mr(new cuda_mr()); } template <> @@ -228,12 +227,10 @@ resource_wrapper::~resource_wrapper() } template <> -resource_wrapper::~resource_wrapper() +resource_wrapper::~resource_wrapper() { - auto pool = mr->get_upstream(); - auto cuda = pool->get_upstream(); + auto cuda = mr->get_upstream(); delete mr; - delete pool; delete cuda; } @@ -299,7 +296,7 @@ void declare_benchmark(std::string name) if (name == "hybrid") BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_hybrid_mr)->Apply(benchmark_range); else if (name == "pool") - BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_pool_mr)->Apply(benchmark_range); + BENCHMARK_TEMPLATE(BM_RandomAllocations, pool_mr)->Apply(benchmark_range); else if (name == "fixed_multisize") BENCHMARK_TEMPLATE(BM_RandomAllocations, fixed_multisize_mr)->Apply(benchmark_range); else if (name == "cnmem") @@ -318,7 +315,7 @@ int main(int argc, char** argv) if (argc > 3) max_size = atoi(argv[3]); declare_benchmark(mr_name); } else { - BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_pool_mr)->Apply(benchmark_range); + BENCHMARK_TEMPLATE(BM_RandomAllocations, pool_mr)->Apply(benchmark_range); BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_hybrid_mr)->Apply(benchmark_range); BENCHMARK_TEMPLATE(BM_RandomAllocations, cnmem_mr)->Apply(benchmark_range); BENCHMARK_TEMPLATE(BM_RandomAllocations, cuda_mr)->Apply(benchmark_range); diff --git a/include/rmm/mr/device/pool_memory_resource.hpp 
b/include/rmm/mr/device/pool_memory_resource.hpp index 7732ce53a..57e98900e 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -22,14 +22,17 @@ #include #include +#include #include -#include +#include #include #include #include #include #include #include +#include +#include #include namespace rmm { @@ -39,6 +42,9 @@ namespace mr { * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from * an upstream memory_resource. * + * Allocation (do_allocate()) and deallocation (do_deallocate()) are thread-safe. Also, + * this class is compatible with CUDA per-thread default stream. + * * @tparam UpstreamResource memory_resource to use for allocating the pool. Implements * rmm::mr::device_memory_resource interface. */ @@ -75,15 +81,24 @@ class pool_memory_resource final : public device_memory_resource { if (maximum_pool_size == default_maximum_size) maximum_pool_size_ = props.totalGlobalMem; - // Allocate initial block - stream_free_blocks_[0].insert(block_from_upstream(initial_pool_size, 0)); + // Allocate initial block and insert into free list for the legacy default stream + stream_free_blocks_[get_event(cudaStreamLegacy)].insert( + block_from_upstream(initial_pool_size, 0)); } /** * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using * the upstream resource. 
*/ - ~pool_memory_resource() { release(); } + ~pool_memory_resource() + { + // Free all pool memory back upstream before detaching per-thread events + release(); +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + for (auto& event : ptds_events_) + event.get().parent = nullptr; +#endif + } /** * @brief Queries whether the resource supports use of non-null CUDA streams for @@ -108,84 +123,99 @@ class pool_memory_resource final : public device_memory_resource { Upstream* get_upstream() const noexcept { return upstream_mr_; } private: - using block = rmm::mr::detail::block; - using free_list = rmm::mr::detail::free_list<>; + using id_type = uint32_t; + using block = rmm::mr::detail::block; + using free_list = rmm::mr::detail::free_list<>; + using lock_guard = std::lock_guard; /** - * @brief Find a free block of at least `size` bytes in `free_list` `blocks` associated with - * stream `blocks_stream`, for use on `stream`. + * @brief A structure pairing a CUDA stream and an associated event for the stream. * - * @param blocks The `free_list` to look in for a free block of sufficient size. - * @param blocks_stream The stream that all blocks in `blocks` are associated with. - * @param size The requested size of the allocation. - - * @param stream The stream on which the allocation is being requested. - * @return block A block with non-null pointer and size >= `size`, or a nullptr block if none is - * available in `blocks`. 
*/ - block block_from_stream(free_list& blocks, - cudaStream_t blocks_stream, - size_t size, - cudaStream_t stream) - { - block const b = blocks.best_fit(size); // get the best fit block - - // If we found a block associated with a different stream, - // we have to synchronize the stream in order to use it - if ((blocks_stream != stream) && b.is_valid()) { - cudaError_t result = cudaStreamSynchronize(blocks_stream); - - RMM_EXPECTS((result == cudaSuccess || // stream synced - result == cudaErrorInvalidResourceHandle), // stream deleted - rmm::bad_alloc, - "cudaStreamSynchronize failure"); + struct stream_event_pair { + cudaStream_t stream; + cudaEvent_t event; - // Now that this stream is synced, insert all other blocks into this stream's list - // Note: This could cause thrashing between two streams. On the other hand, it reduces - // fragmentation by coalescing. - stream_free_blocks_[stream].insert(blocks.begin(), blocks.end()); + bool operator<(stream_event_pair const& rhs) const { return event < rhs.event; } + }; - // remove this stream from the freelist - stream_free_blocks_.erase(blocks_stream); + /** + * @brief Find a free block of at least `size` bytes in a `free_list` with a different + * stream/event than `stream_event`. + * + * If an appropriate block is found in a free list F associated with event E, if + * `CUDA_API_PER_THREAD_DEFAULT_STREAM` is defined, `stream_event.stream` will be made to wait on + * event E. Otherwise, the stream associated with free list F will be synchronized. In either + * case all other blocks in free list F will be moved to the free list associated with + * `stream_event.stream`. This results in coalescing with other blocks in that free list, + * hopefully reducing fragmentation. + * + * @param size The requested size of the allocation. + * @param stream_event The stream and associated event on which the allocation is being requested. 
+ * @return A block with non-null pointer and size >= `size`, or a nullptr block if none is + * available in `blocks`. + */ + block get_block_from_other_stream(size_t size, stream_event_pair stream_event) + { + // nothing in this stream's free list, look for one on another stream + for (auto s = stream_free_blocks_.begin(); s != stream_free_blocks_.end(); ++s) { + auto blocks_event = s->first; + if (blocks_event.event != stream_event.event) { + auto blocks = s->second; + + block const b = blocks.best_fit(size); // get the best fit block + + if (b.is_valid()) { + // Since we found a block associated with a different stream, we have to insert a wait on + // the stream's associated event into the allocating stream. + // TODO: could eliminate this ifdef and have the same behavior for PTDS and non-PTDS + // But the cudaEventRecord() on every free_block reduces performance significantly +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, blocks_event.event, 0)); +#else + RMM_CUDA_TRY(cudaStreamSynchronize(blocks_event.stream)); +#endif + // Move all the blocks to the requesting stream, since it has waited on them + stream_free_blocks_[stream_event].insert(blocks.begin(), blocks.end()); + stream_free_blocks_.erase(s); + + return b; + } + } } - return b; + return block{}; } /** * @brief Find an available block in the pool of at least `size` bytes, for use on `stream`. * - * Attempts to find a free block that was last used on `stream` to avoid synchronization. If none - * is available, it finds a block last used on another stream. In this case, the stream associated - * with the found block is synchronized to ensure all asynchronous work on the memory is finished - * before it is used on `stream`. + * Attempts to find a free block that was last used on `stream` to avoid synchronization. If + * none is available, it finds a block last used on another stream. 
In this case, the stream + * associated with the found block is synchronized to ensure all asynchronous work on the memory + * is finished before it is used on `stream`. + * + * @throw `std::bad_alloc` if the requested allocation could not be fulfilled. * * @param size The size of the requested allocation, in bytes. - * @param stream The stream on which the allocation will be used. + * @param stream_event The stream and associated event on which the allocation is being requested. * @return block A block with non-null pointer and size >= `size`. */ - block available_larger_block(size_t size, cudaStream_t stream) + block available_larger_block(size_t size, stream_event_pair stream_event) { - // Try to find a larger block in free list for the same stream - auto iter = stream_free_blocks_.find(stream); + // Try to find a larger block in free list for the same stream (no sync required) + auto iter = stream_free_blocks_.find(stream_event); if (iter != stream_free_blocks_.end()) { - block b = block_from_stream(iter->second, stream, size, stream); + block b = iter->second.best_fit(size); if (b.is_valid()) return b; } - // nothing in this stream's free list, look for one on another stream - auto s = stream_free_blocks_.begin(); - while (s != stream_free_blocks_.end()) { - if (s->first != stream) { - block b = block_from_stream(s->second, s->first, size, stream); - if (b.is_valid()) return b; - } - ++s; - } + block b = get_block_from_other_stream(size, stream_event); + if (b.is_valid()) return b; // no larger blocks available on other streams, so grow the pool and create a block size_t grow_size = size_to_grow(size); RMM_EXPECTS(grow_size > 0, rmm::bad_alloc, "Maximum pool size exceeded"); - return block_from_upstream(grow_size, stream); + return block_from_upstream(grow_size, stream_event.stream); } /** @@ -195,16 +225,16 @@ class pool_memory_resource final : public device_memory_resource { * * @param b The block to allocate from. 
* @param size The size in bytes of the requested allocation. - * @param stream The stream on which the allocation will be used. + * @param stream_event The stream and associated event on which the allocation will be used. * @return void* The pointer to the allocated memory. */ - void* allocate_from_block(block const& b, size_t size, cudaStream_t stream) + void* allocate_from_block(block const& b, size_t size, stream_event_pair stream_event) { block const alloc{b.pointer(), size, b.is_head()}; if (b.size() > size) { block rest{b.pointer() + size, b.size() - size, false}; - stream_free_blocks_[stream].insert(rest); + stream_free_blocks_[stream_event].insert(rest); } allocated_blocks_.insert(alloc); @@ -222,11 +252,21 @@ class pool_memory_resource final : public device_memory_resource { { if (p == nullptr) return; + stream_event_pair stream_event = get_event(stream); + auto const i = allocated_blocks_.find(static_cast(p)); assert(i != allocated_blocks_.end()); - assert(i->size == rmm::detail::align_up(size, allocation_alignment)); + assert(i->size() == rmm::detail::align_up(size, allocation_alignment)); - stream_free_blocks_[stream].insert(*i); + // TODO: cudaEventRecord has significant overhead on deallocations, however it could mean less + // synchronization So we need to test in real non-PTDS applications that have multiple streams + // whether or not the overhead is worth it +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + auto result = cudaEventRecord(stream_event.event, stream); + assert(cudaSuccess == result); +#endif + + stream_free_blocks_[stream_event].insert(*i); allocated_blocks_.erase(i); } @@ -284,9 +324,18 @@ class pool_memory_resource final : public device_memory_resource { */ void release() { + lock_guard lock(mtx_); + for (auto b : upstream_blocks_) upstream_mr_->deallocate(b.pointer(), b.size()); upstream_blocks_.clear(); + allocated_blocks_.clear(); + + for (auto s_e : stream_events_) + destroy_event(s_e.second); + stream_events_.clear(); + 
stream_free_blocks_.clear(); + current_pool_size_ = 0; } @@ -297,6 +346,8 @@ class pool_memory_resource final : public device_memory_resource { */ void print() { + lock_guard lock(mtx_); + std::size_t free, total; std::tie(free, total) = upstream_mr_->get_mem_info(0); std::cout << "GPU free memory: " << free << "total: " << total << "\n"; @@ -306,7 +357,7 @@ class pool_memory_resource final : public device_memory_resource { for (auto h : upstream_blocks_) { h.print(); - upstream_total += h.size; + upstream_total += h.size(); } std::cout << "total upstream: " << upstream_total << " B\n"; @@ -317,7 +368,7 @@ class pool_memory_resource final : public device_memory_resource { std::cout << "sync free blocks: "; for (auto s : stream_free_blocks_) { - std::cout << "stream " << s.first << " "; + std::cout << "stream: " << s.first.stream << " event: " << s.first.event << " "; s.second.print(); } std::cout << "\n"; @@ -338,9 +389,14 @@ class pool_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, cudaStream_t stream) override { if (bytes <= 0) return nullptr; - bytes = rmm::detail::align_up(bytes, allocation_alignment); - block const b = available_larger_block(bytes, stream); - return allocate_from_block(b, bytes, stream); + + lock_guard lock(mtx_); + + stream_event_pair stream_event = get_event(stream); + bytes = rmm::detail::align_up(bytes, allocation_alignment); + block const b = available_larger_block(bytes, stream_event); + auto p = allocate_from_block(b, bytes, stream_event); + return p; } /** @@ -352,6 +408,7 @@ class pool_memory_resource final : public device_memory_resource { */ void do_deallocate(void* p, std::size_t bytes, cudaStream_t stream) override { + lock_guard lock(mtx_); free_block(p, bytes, stream); } @@ -371,19 +428,125 @@ class pool_memory_resource final : public device_memory_resource { return std::make_pair(free_size, total_size); } +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + /** + * @brief RAII wrapper for a 
CUDA event for a per-thread default stream + * + * These objects take care of creating and freeing an event associated with a per-thread default + * stream. They are needed because the event needs to exist in thread_local memory, so it must + * be cleaned up when the thread exits. They maintain a pointer to the parent + * (pool_memory_resource) that created them, because when a thread exits, if the parent still + * exists, they must tell the parent to merge the free list associated with the event. Also, the + * parent maintains a list of references to the created cuda_event objects so that if any remain + * when the parent is destroyed, it can set their parent pointers to nullptr so we don't have a + * use-after-free race. Note: all of this is a workaround for the fact that there is no way + * currently to get a unique handle to a CUDA per-thread default stream. :( + */ + struct default_stream_event { + default_stream_event(pool_memory_resource* parent) : parent(parent) + { + auto result = cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + assert(cudaSuccess == result); + if (parent) parent->ptds_events_.push_back(*this); + } + ~default_stream_event() + { + if (parent) { + lock_guard lock(parent->mtx_); + parent->destroy_event(stream_event_pair{cudaStreamDefault, event}); + } + } + + cudaEvent_t event; + pool_memory_resource* parent; + }; +#endif + + /** + * @brief get a unique CUDA event (possibly new) associated with `stream` + * + * The event is created on the first call, and it is not recorded. If compiled for per-thread + * default stream and `stream` is the default stream, the event is created in thread local memory + * and is unique per CPU thread. + * + * @param stream The stream for which to get an event. + * @return The stream_event for `stream`. 
+ */ + stream_event_pair get_event(cudaStream_t stream) + { +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + if (cudaStreamDefault == stream || cudaStreamPerThread == stream) { + static thread_local default_stream_event e{this}; + return stream_event_pair{stream, e.event}; + } +#else + // We use cudaStreamLegacy as the event map key for the default stream for consistency between + // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the + // user explicitly passes it, so it is used as the default location for the free list + // at construction, and for merging free lists when a thread exits (see destroy_event()). + // For consistency, the same key is used for null stream free lists in non-PTDS mode. + if (cudaStreamDefault == stream) { stream = cudaStreamLegacy; } +#endif + + auto iter = stream_events_.find(stream); + if (iter == stream_events_.end()) { + stream_event_pair stream_event{stream}; + auto result = cudaEventCreateWithFlags(&stream_event.event, cudaEventDisableTiming); + assert(cudaSuccess == result); + stream_events_[stream] = stream_event; + return stream_event; + } else { + return iter->second; + } + } + + /** + * @brief Destroy the specified CUDA event and move all free blocks for the associated stream + * to the default stream free list. + * + * @param event The event to destroy. 
+ */ + void destroy_event(stream_event_pair stream_event) + { + // If we are destroying an event with associated free list, we need to synchronize that event + // and then merge its free list into the (legacy) default stream's list + auto free_list_iter = stream_free_blocks_.find(stream_event); + if (free_list_iter != stream_free_blocks_.end()) { + auto blocks = free_list_iter->second; + stream_free_blocks_[get_event(cudaStreamLegacy)].insert(blocks.begin(), blocks.end()); + stream_free_blocks_.erase(free_list_iter); + + auto result = cudaEventSynchronize(stream_event.event); + assert(cudaSuccess == result); + } + auto result = cudaEventDestroy(stream_event.event); + assert(cudaSuccess == result); + } + size_t maximum_pool_size_; size_t current_pool_size_{0}; Upstream* upstream_mr_; // The "heap" to allocate the pool from - // map of [stream_id, free_list] pairs - // stream stream_id must be synced before allocating from this list to a different stream - std::map stream_free_blocks_; + // map of [cudaEvent_t, free_list] pairs + // Event (or associated stream) must be synced before allocating from associated free_list to a + // different stream + std::map stream_free_blocks_; std::set> allocated_blocks_; // blocks allocated from upstream: so they can be easily freed std::vector upstream_blocks_; + + // bidirectional mapping between non-default streams and events + std::unordered_map stream_events_; + +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + // references to per-thread events to avoid use-after-free when threads exit after MR is deleted + std::list> ptds_events_; +#endif + + std::mutex mutable mtx_; // mutex for thread-safe access }; } // namespace mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cad684133..6f20478cf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,7 +53,9 @@ include_directories("${GTEST_INCLUDE_DIR}" ################################################################################################### # - library 
paths --------------------------------------------------------------------------------- -link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc +# CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link +# directories for nvcc +link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" "${CMAKE_BINARY_DIR}/lib" "${GTEST_LIBRARY_DIR}") @@ -66,10 +68,11 @@ link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT ################################################################################################### ################################################################################################### -# - device mr tests ------------------------------------------------------------------------------------- +# - device mr tests -------------------------------------------------------------------------------- set(DEVICE_MR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/thrust_allocator_tests.cu") @@ -77,16 +80,28 @@ ConfigureTest(DEVICE_MR_TEST "${DEVICE_MR_TEST_SRC}") ################################################################################################### ################################################################################################### -# - host mr tests ------------------------------------------------------------------------------------- +# - device mr per-thread default stream tests------------------------------------------------------ + + +set(DEVICE_MR_PTDS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp") + +ConfigureTest(DEVICE_MR_PTDS_TEST "${DEVICE_MR_PTDS_TEST_SRC}") +target_compile_definitions(DEVICE_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + + 
+################################################################################################### +################################################################################################### +# - host mr tests ---------------------------------------------------------------------------------- set(HOST_MR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/host/mr_tests.cpp") - ConfigureTest(HOST_MR_TEST "${HOST_MR_TEST_SRC}") + ################################################################################################### ################################################################################################### -# - device buffer tests ------------------------------------------------------------------------------------- +# - device buffer tests ---------------------------------------------------------------------------- set(BUFFER_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_buffer_tests.cu") @@ -95,14 +110,14 @@ ConfigureTest(DEVICE_BUFFER_TEST "${BUFFER_TEST_SRC}") ################################################################################################### ################################################################################################### -# - device scalar tests ------------------------------------------------------------------------------------- +# - device scalar tests ---------------------------------------------------------------------------- set(SCALAR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_scalar_tests.cpp") ConfigureTest(DEVICE_SCALAR_TEST "${SCALAR_TEST_SRC}") ################################################################################################### -# - logger tests ------------------------------------------------------------------------------------- +# - logger tests ---------------------------------------------------------------------------------- set(LOGGER_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/logger_tests.cpp") @@ -110,7 +125,7 @@ set(LOGGER_TEST_SRC ConfigureTest(LOGGER_TEST "${LOGGER_TEST_SRC}") 
################################################################################################### -# - uvector tests ------------------------------------------------------------------------------------- +# - uvector tests --------------------------------------------------------------------------------- set(DEVICE_UVECTOR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_uvector_tests.cu") diff --git a/tests/mr/device/mr_multithreaded_tests.cpp b/tests/mr/device/mr_multithreaded_tests.cpp new file mode 100644 index 000000000..2cb844b55 --- /dev/null +++ b/tests/mr/device/mr_multithreaded_tests.cpp @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" +#include "mr/device/cuda_memory_resource.hpp" +#include "mr/device/default_memory_resource.hpp" +#include "mr/device/pool_memory_resource.hpp" +#include "mr_test.hpp" + +#include +#include + +namespace { + +using thread_safe_fixed_size_mr = rmm::mr::thread_safe_resource_adaptor; +using thread_safe_fixed_multisize_mr = rmm::mr::thread_safe_resource_adaptor; +using thread_safe_fixed_multisize_pool_mr = + rmm::mr::thread_safe_resource_adaptor; +using thread_safe_hybrid_mr = rmm::mr::thread_safe_resource_adaptor; + +constexpr std::size_t num_threads{4}; + +template +void spawn(Task task, Arguments... 
args) +{ + std::vector threads; + threads.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) + threads.emplace_back(std::thread(task, args...)); + + for (auto& t : threads) + t.join(); +} + +} // namespace + +// specialize test constructor for thread-safe types + +template <> +inline MRTest::MRTest() + : mr{new thread_safe_fixed_size_mr(new fixed_size_mr(rmm::mr::get_default_resource()))} +{ +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = mr->get_upstream(); + this->mr.reset(); + delete fixed; +} + +template <> +inline MRTest::MRTest() + : mr{new thread_safe_fixed_multisize_mr(new fixed_multisize_mr(rmm::mr::get_default_resource()))} +{ +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = mr->get_upstream(); + this->mr.reset(); + delete fixed; +} + +template <> +inline MRTest::MRTest() + : mr{new thread_safe_fixed_multisize_pool_mr( + new fixed_multisize_pool_mr(new pool_mr(new rmm::mr::cuda_memory_resource)))} +{ +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = mr->get_upstream(); + auto pool = fixed->get_upstream(); + auto cuda = pool->get_upstream(); + this->mr.reset(); + delete fixed; + delete pool; + delete cuda; +} + +template <> +inline MRTest::MRTest() +{ + rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; + pool_mr* pool = new pool_mr(cuda); + this->mr.reset(new thread_safe_hybrid_mr(new hybrid_mr(new fixed_multisize_pool_mr(pool), pool))); +} + +template <> +inline MRTest::~MRTest() +{ + auto hybrid = mr->get_upstream(); + auto fixed = hybrid->get_small_mr(); + auto pool = hybrid->get_large_mr(); + auto cuda = pool->get_upstream(); + this->mr.reset(); + delete hybrid; + delete fixed; + delete pool; + delete cuda; +} + +// specialize get_max_size for thread-safe MRs +template <> +std::size_t get_max_size(thread_safe_fixed_size_mr* mr) +{ + return mr->get_upstream()->get_block_size(); +} + +template <> +std::size_t get_max_size(thread_safe_fixed_multisize_mr* mr) +{ + return 
mr->get_upstream()->get_max_size(); +} + +template <> +std::size_t get_max_size(thread_safe_fixed_multisize_pool_mr* mr) +{ + return mr->get_upstream()->get_max_size(); +} + +// specialize random allocations to not allocate too large +template <> +inline void test_random_allocations(thread_safe_fixed_size_mr* mr, + std::size_t num_allocations, + cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_random_allocations( + thread_safe_fixed_multisize_mr* mr, std::size_t num_allocations, cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_random_allocations( + thread_safe_fixed_multisize_pool_mr* mr, std::size_t num_allocations, cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free( + thread_safe_fixed_size_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 1_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free( + thread_safe_fixed_multisize_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 4_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free( + thread_safe_fixed_multisize_pool_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 4_MiB, stream); +} + +// Test on all memory resource classes +using resources = ::testing::Types; + +template +using MRTest_mt = MRTest; + +TYPED_TEST_CASE(MRTest_mt, resources); + +TEST(DefaultTest, UseDefaultResource_mt) { spawn(test_get_default_resource); } + +TYPED_TEST(MRTest_mt, SetDefaultResource_mt) +{ + // single thread changes default resource, then multiple threads use it + + // Not necessarily false, since two cuda_memory_resources are always equal + // EXPECT_FALSE(this->mr->is_equal(*rmm::mr::get_default_resource())); + 
rmm::mr::device_memory_resource* old{nullptr}; + EXPECT_NO_THROW(old = rmm::mr::set_default_resource(this->mr.get())); + EXPECT_NE(nullptr, old); + + spawn([mr = this->mr.get()]() { + EXPECT_EQ(mr, rmm::mr::get_default_resource()); + test_get_default_resource(); // test allocating with the new default resource + }); + + // setting default resource w/ nullptr should reset to initial + EXPECT_NO_THROW(rmm::mr::set_default_resource(nullptr)); + EXPECT_TRUE(old->is_equal(*rmm::mr::get_default_resource())); + // Not necessarily false, since two cuda_memory_resources are always equal + // EXPECT_FALSE(this->mr->is_equal(*rmm::mr::get_default_resource())); +} + +TYPED_TEST(MRTest_mt, Allocate) { spawn(test_various_allocations, this->mr.get()); } + +TYPED_TEST(MRTest_mt, AllocateOnStream) +{ + spawn(test_various_allocations_on_stream, this->mr.get(), this->stream); +} + +TYPED_TEST(MRTest_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->mr.get(), 100, nullptr); +} + +TYPED_TEST(MRTest_mt, RandomAllocationsStream) +{ + spawn(test_random_allocations, this->mr.get(), 100, this->stream); +} + +TYPED_TEST(MRTest_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->mr.get(), nullptr); +} + +TYPED_TEST(MRTest_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_allocation_free, this->mr.get(), this->stream); +} + +template +void allocate_loop(MemoryResourceType* mr, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaStream_t stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + size_t size = size_distribution(generator); + void* ptr{}; + EXPECT_NO_THROW(ptr = mr->allocate(size, stream)); + { + std::lock_guard lock(mtx); + allocations.emplace_back(ptr, size); + } + } +} + +template +void deallocate_loop(MemoryResourceType* mr, + 
std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaStream_t stream) +{ + for (std::size_t i = 0; i < num_allocations;) { + { + std::lock_guard lock(mtx); + if (allocations.empty()) + continue; + else { + i++; + allocation alloc = allocations.front(); + allocations.pop_front(); + EXPECT_NO_THROW(mr->deallocate(alloc.p, alloc.size, stream)); + } + } + } +} + +template +void test_allocate_free_different_threads(MemoryResourceType* mr, + cudaStream_t streamA, + cudaStream_t streamB) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + constexpr std::size_t max_size{1_MiB}; + + std::uniform_int_distribution size_distribution(1, max_size); + + std::mutex mtx; + std::list allocations; + + std::thread producer(allocate_loop, + mr, + num_allocations, + std::ref(allocations), + std::ref(mtx), + streamA); + std::thread consumer(deallocate_loop, + mr, + num_allocations, + std::ref(allocations), + std::ref(mtx), + streamB); + + producer.join(); + consumer.join(); +} + +TYPED_TEST(MRTest_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_free_different_threads(this->mr.get(), nullptr, nullptr); +} + +TYPED_TEST(MRTest_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_free_different_threads(this->mr.get(), this->stream, this->stream); +} + +// cnmem does not allow freeing on a different stream than allocating +using resources_different_stream = ::testing::Types; + +template +using MRTestDifferentStream_mt = MRTest; + +TYPED_TEST_CASE(MRTestDifferentStream_mt, resources_different_stream); + +TYPED_TEST(MRTestDifferentStream_mt, AllocFreeDifferentThreadsDifferentStream) +{ + cudaStream_t streamB{}; + EXPECT_EQ(cudaSuccess, cudaStreamCreate(&streamB)); + test_allocate_free_different_threads(this->mr.get(), this->stream, streamB); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(streamB)); + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(streamB)); +} diff --git a/tests/mr/device/mr_test.hpp 
b/tests/mr/device/mr_test.hpp new file mode 100644 index 000000000..f9e9d8663 --- /dev/null +++ b/tests/mr/device/mr_test.hpp @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace { + +inline bool is_aligned(void* p, std::size_t alignment = 256) +{ + return (0 == reinterpret_cast(p) % alignment); +} + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. 
+ */ +inline bool is_device_memory(void* p) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, p)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +// some useful allocation sizes +constexpr long operator""_B(unsigned long long const x) { return x; } +constexpr long operator""_KiB(unsigned long long const x) { return x * (long{1} << 10); } +constexpr long operator""_MiB(unsigned long long const x) { return x * (long{1} << 20); } +constexpr long operator""_GiB(unsigned long long const x) { return x * (long{1} << 30); } +constexpr long operator""_TiB(unsigned long long const x) { return x * (long{1} << 40); } +constexpr long operator""_PiB(unsigned long long const x) { return x * (long{1} << 50); } + +struct allocation { + void* p{nullptr}; + std::size_t size{0}; + allocation(void* _p, std::size_t _size) : p{_p}, size{_size} {} + allocation() = default; +}; + +// nested MR type names can get long... +using pool_mr = rmm::mr::pool_memory_resource; +using fixed_size_mr = rmm::mr::fixed_size_memory_resource; +using fixed_multisize_mr = + rmm::mr::fixed_multisize_memory_resource; +using fixed_multisize_pool_mr = rmm::mr::fixed_multisize_memory_resource; +using hybrid_mr = rmm::mr::hybrid_memory_resource; + +} // namespace + +template +std::size_t get_max_size(MemoryResourceType* mr) +{ + return std::numeric_limits::max(); +} + +template <> +inline std::size_t get_max_size(fixed_size_mr* mr) +{ + return mr->get_block_size(); +} + +template <> +inline std::size_t get_max_size(fixed_multisize_mr* mr) +{ + return mr->get_max_size(); +} + +// Various test functions, shared between single-threaded and multithreaded tests. 
+ +inline void test_get_default_resource() +{ + EXPECT_NE(nullptr, rmm::mr::get_default_resource()); + void* p{nullptr}; + EXPECT_NO_THROW(p = rmm::mr::get_default_resource()->allocate(1_MiB)); + EXPECT_NE(nullptr, p); + EXPECT_TRUE(is_aligned(p)); + EXPECT_TRUE(is_device_memory(p)); + EXPECT_NO_THROW(rmm::mr::get_default_resource()->deallocate(p, 1_MiB)); +} + +template +void test_allocate(MemoryResourceType* mr, std::size_t bytes, cudaStream_t stream = 0) +{ + void* p{nullptr}; + if (bytes > get_max_size(mr)) { + EXPECT_THROW(p = mr->allocate(bytes), std::bad_alloc); + } else { + EXPECT_NO_THROW(p = mr->allocate(bytes)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_NE(nullptr, p); + EXPECT_TRUE(is_aligned(p)); + EXPECT_TRUE(is_device_memory(p)); + EXPECT_NO_THROW(mr->deallocate(p, bytes)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + } +} + +template +void test_various_allocations(MemoryResourceType* mr) +{ + // test allocating zero bytes + { + void* p{nullptr}; + EXPECT_NO_THROW(p = mr->allocate(0)); + EXPECT_EQ(nullptr, p); + EXPECT_NO_THROW(mr->deallocate(p, 0)); + } + + test_allocate(mr, 4_B); + test_allocate(mr, 1_KiB); + test_allocate(mr, 1_MiB); + test_allocate(mr, 1_GiB); + + // should fail to allocate too much + { + void* p{nullptr}; + EXPECT_THROW(p = mr->allocate(1_PiB), rmm::bad_alloc); + EXPECT_EQ(nullptr, p); + } +} + +template +void test_various_allocations_on_stream(MemoryResourceType* mr, cudaStream_t stream = 0) +{ + // test allocating zero bytes on non-default stream + { + void* p{nullptr}; + EXPECT_NO_THROW(p = mr->allocate(0, stream)); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_NO_THROW(mr->deallocate(p, 0, stream)); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + } + + test_allocate(mr, 4_B, stream); + test_allocate(mr, 1_KiB, stream); + test_allocate(mr, 1_MiB, stream); + test_allocate(mr, 1_GiB, stream); + + // should fail to allocate too 
much + { + void* p{nullptr}; + EXPECT_THROW(p = mr->allocate(1_PiB, stream), rmm::bad_alloc); + EXPECT_EQ(nullptr, p); + } +} + +template +void test_random_allocations_base(MemoryResourceType* mr, + std::size_t num_allocations = 100, + std::size_t max_size = 5_MiB, + cudaStream_t stream = 0) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // 100 allocations from [0,5MB) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, stream, mr](allocation& a) { + a.size = distribution(generator); + EXPECT_NO_THROW(a.p = mr->allocate(a.size, stream)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_NE(nullptr, a.p); + EXPECT_TRUE(is_aligned(a.p)); + }); + + std::for_each( + allocations.begin(), allocations.end(), [generator, distribution, stream, mr](allocation& a) { + EXPECT_NO_THROW(mr->deallocate(a.p, a.size, stream)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + }); +} + +template +void test_random_allocations(MemoryResourceType* mr, + std::size_t num_allocations = 100, + cudaStream_t stream = 0) +{ + return test_random_allocations_base(mr, num_allocations, 5_MiB, stream); +} + +template <> +inline void test_random_allocations(fixed_size_mr* mr, + std::size_t num_allocations, + cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_random_allocations(fixed_multisize_mr* mr, + std::size_t num_allocations, + cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template +void test_mixed_random_allocation_free_base(MemoryResourceType* mr, + std::size_t max_size = 5_MiB, + cudaStream_t stream = 0) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, 
max_size); + + constexpr int allocation_probability = 53; // percent + std::uniform_int_distribution op_distribution(0, 99); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + int active_allocations{0}; + int allocation_count{0}; + + std::vector allocations; + + for (int i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(mr->allocate(size, stream), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.p); + EXPECT_TRUE(is_aligned(new_allocation.p)); + } else { + size_t index = index_distribution(generator) % active_allocations; + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(mr->deallocate(to_free.p, to_free.size, stream)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +template +void test_mixed_random_allocation_free(MemoryResourceType* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 5_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free(fixed_size_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 1_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free(fixed_multisize_mr* mr, + cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 4_MiB, stream); +} + +// The test fixture +template +struct MRTest : public ::testing::Test { + std::unique_ptr mr; + cudaStream_t stream; + + MRTest() : mr{new MemoryResourceType} {} + + void SetUp() override { EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); } + + void 
TearDown() override { EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); }; + + ~MRTest() {} +}; + +// Specialize constructor to pass arguments +template <> +inline MRTest::MRTest() : mr{new fixed_size_mr{rmm::mr::get_default_resource()}} +{ +} + +template <> +inline MRTest::MRTest() + : mr{new fixed_multisize_mr(rmm::mr::get_default_resource())} +{ +} + +template <> +inline MRTest::MRTest() : mr{} +{ + rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; + this->mr.reset(new pool_mr(cuda)); +} + +template <> +inline MRTest::MRTest() +{ + rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; + pool_mr* pool = new pool_mr(cuda); + this->mr.reset(new hybrid_mr(new fixed_multisize_pool_mr(pool), pool)); +} + +template <> +inline MRTest::~MRTest() +{ + auto upstream = this->mr->get_upstream(); + this->mr.reset(); + delete upstream; +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = this->mr->get_small_mr(); + auto pool = this->mr->get_large_mr(); + auto cuda = pool->get_upstream(); + this->mr.reset(); + delete fixed; + delete pool; + delete cuda; +} diff --git a/tests/mr/device/mr_tests.cpp b/tests/mr/device/mr_tests.cpp index 6131938ad..5f62a8e3a 100644 --- a/tests/mr/device/mr_tests.cpp +++ b/tests/mr/device/mr_tests.cpp @@ -14,307 +14,13 @@ * limitations under the License. */ -#include "gtest/gtest.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace { -static constexpr std::size_t ALIGNMENT{256}; -inline bool is_aligned(void* p, std::size_t alignment = ALIGNMENT) -{ - return (0 == reinterpret_cast(p) % alignment); -} - -/** - * @brief Returns if a pointer points to a device memory or managed memory - * allocation. 
- */ -inline bool is_device_memory(void* p) -{ - cudaPointerAttributes attributes{}; - if (cudaSuccess != cudaPointerGetAttributes(&attributes, p)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else - return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif -} - -// some useful allocation sizes -static constexpr std::size_t size_word{4}; -static constexpr std::size_t size_kb{std::size_t{1} << 10}; -static constexpr std::size_t size_mb{std::size_t{1} << 20}; -static constexpr std::size_t size_gb{std::size_t{1} << 30}; -static constexpr std::size_t size_tb{std::size_t{1} << 40}; -static constexpr std::size_t size_pb{std::size_t{1} << 50}; - -struct allocation { - void* p{nullptr}; - std::size_t size{0}; - allocation(void* _p, std::size_t _size) : p{_p}, size{_size} {} - allocation() = default; -}; -} // namespace - -// nested MR type names can get long... 
-using pool_mr = rmm::mr::pool_memory_resource; -using fixed_size_mr = rmm::mr::fixed_size_memory_resource; -using fixed_multisize_mr = - rmm::mr::fixed_multisize_memory_resource; -using fixed_multisize_pool_mr = rmm::mr::fixed_multisize_memory_resource; -using hybrid_mr = rmm::mr::hybrid_memory_resource; - -using thread_safe_cuda_mr = rmm::mr::thread_safe_resource_adaptor; - -template -struct MRTest : public ::testing::Test { - std::unique_ptr mr; - cudaStream_t stream; - - MRTest() : mr{new MemoryResourceType} {} - - void SetUp() override { EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); } - - void TearDown() override { EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); }; - - ~MRTest() {} - - void test_allocate(std::size_t bytes, cudaStream_t stream = 0); - void test_random_allocations_base(std::size_t num_allocations = 100, - std::size_t max_size = 5 * size_mb, - cudaStream_t stream = 0); - void test_random_allocations(std::size_t num_allocations = 100, cudaStream_t stream = 0); - void test_mixed_random_allocation_free_base(std::size_t max_size = 5 * size_mb, - cudaStream_t stream = 0); - void test_mixed_random_allocation_free(cudaStream_t stream = 0); -}; - -// Specialize constructor to pass arguments -template <> -MRTest::MRTest() : mr{new fixed_size_mr{rmm::mr::get_default_resource()}} -{ -} - -template <> -MRTest::MRTest() : mr{new fixed_multisize_mr(rmm::mr::get_default_resource())} -{ -} - -template <> -MRTest::MRTest() -{ - rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; - this->mr.reset(new pool_mr(cuda)); -} - -template <> -MRTest::MRTest() -{ - rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; - pool_mr* pool = new pool_mr(cuda); - this->mr.reset(new hybrid_mr(new fixed_multisize_pool_mr(pool), pool)); -} - -template <> -MRTest::~MRTest() -{ - auto upstream = this->mr->get_upstream(); - this->mr.reset(); - delete upstream; -} - -template <> -MRTest::~MRTest() -{ - auto small = 
this->mr->get_small_mr(); - auto large = this->mr->get_large_mr(); - this->mr.reset(); - delete small; - delete large; -} - -template <> -MRTest::MRTest() - : mr{new thread_safe_cuda_mr(new rmm::mr::cuda_memory_resource)} -{ -} - -template <> -MRTest::~MRTest() -{ - auto upstream = mr->get_upstream(); - delete upstream; -} - -template -std::size_t get_max_size(MemoryResourceType* mr) -{ - return std::numeric_limits::max(); -} - -template <> -std::size_t get_max_size(fixed_size_mr* mr) -{ - return mr->get_block_size(); -} - -template <> -std::size_t get_max_size(fixed_multisize_mr* mr) -{ - return mr->get_max_size(); -} - -template -void MRTest::test_allocate(std::size_t bytes, cudaStream_t stream) -{ - void* p{nullptr}; - if (bytes > get_max_size(this->mr.get())) { - EXPECT_THROW(p = this->mr->allocate(bytes), std::bad_alloc); - } else { - EXPECT_NO_THROW(p = this->mr->allocate(bytes)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); - EXPECT_NE(nullptr, p); - EXPECT_TRUE(is_aligned(p)); - EXPECT_TRUE(is_device_memory(p)); - EXPECT_NO_THROW(this->mr->deallocate(p, bytes)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); - } -} - -template -void MRTest::test_random_allocations_base(std::size_t num_allocations, - std::size_t max_size, - cudaStream_t stream) -{ - std::vector allocations(num_allocations); - - std::default_random_engine generator; - std::uniform_int_distribution distribution(1, max_size); - - // 100 allocations from [0,5MB) - std::for_each(allocations.begin(), - allocations.end(), - [&generator, &distribution, stream, this](allocation& a) { - a.size = distribution(generator); - EXPECT_NO_THROW(a.p = this->mr->allocate(a.size, stream)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - EXPECT_NE(nullptr, a.p); - EXPECT_TRUE(is_aligned(a.p)); - }); - - std::for_each( - allocations.begin(), allocations.end(), [generator, distribution, stream, this](allocation& a) { - 
EXPECT_NO_THROW(this->mr->deallocate(a.p, a.size, stream)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - }); -} - -template -void MRTest::test_random_allocations(std::size_t num_allocations, - cudaStream_t stream) -{ - return test_random_allocations_base(num_allocations, 5 * size_mb, stream); -} - -template <> -void MRTest::test_random_allocations(std::size_t num_allocations, - cudaStream_t stream) -{ - return test_random_allocations_base(num_allocations, 1 * size_mb, stream); -} - -template <> -void MRTest::test_random_allocations(std::size_t num_allocations, - cudaStream_t stream) -{ - return test_random_allocations_base(num_allocations, 4 * size_mb, stream); -} - -template -void MRTest::test_mixed_random_allocation_free_base(std::size_t max_size, - cudaStream_t stream) -{ - std::default_random_engine generator; - constexpr std::size_t num_allocations{100}; - - std::uniform_int_distribution size_distribution(1, max_size); - - constexpr int allocation_probability = 53; // percent - std::uniform_int_distribution op_distribution(0, 99); - std::uniform_int_distribution index_distribution(0, num_allocations - 1); - - int active_allocations{0}; - int allocation_count{0}; - - std::vector allocations; - - for (int i = 0; i < num_allocations * 2; ++i) { - bool do_alloc = true; - if (active_allocations > 0) { - int chance = op_distribution(generator); - do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); - } - - if (do_alloc) { - size_t size = size_distribution(generator); - active_allocations++; - allocation_count++; - EXPECT_NO_THROW(allocations.emplace_back(this->mr->allocate(size, stream), size)); - auto new_allocation = allocations.back(); - EXPECT_NE(nullptr, new_allocation.p); - EXPECT_TRUE(is_aligned(new_allocation.p)); - } else { - size_t index = index_distribution(generator) % active_allocations; - active_allocations--; - allocation to_free = allocations[index]; - 
allocations.erase(std::next(allocations.begin(), index)); - EXPECT_NO_THROW(this->mr->deallocate(to_free.p, to_free.size, stream)); - } - } - - EXPECT_EQ(active_allocations, 0); - EXPECT_EQ(allocations.size(), active_allocations); -} - -template -void MRTest::test_mixed_random_allocation_free(cudaStream_t stream) -{ - test_mixed_random_allocation_free_base(5 * size_mb, stream); -} - -template <> -void MRTest::test_mixed_random_allocation_free(cudaStream_t stream) -{ - test_mixed_random_allocation_free_base(size_mb, stream); -} - -template <> -void MRTest::test_mixed_random_allocation_free(cudaStream_t stream) -{ - test_mixed_random_allocation_free_base(4 * size_mb, stream); -} +#include "mr_test.hpp" // Test on all memory resource classes using resources = ::testing::Typesallocate(size_mb)); - EXPECT_NE(nullptr, p); - EXPECT_TRUE(is_aligned(p)); - EXPECT_TRUE(is_device_memory(p)); - EXPECT_NO_THROW(rmm::mr::get_default_resource()->deallocate(p, size_mb)); -} +TEST(DefaultTest, UseDefaultResource) { test_get_default_resource(); } TYPED_TEST(MRTest, SetDefaultResource) { @@ -340,13 +37,9 @@ TYPED_TEST(MRTest, SetDefaultResource) rmm::mr::device_memory_resource* old{nullptr}; EXPECT_NO_THROW(old = rmm::mr::set_default_resource(this->mr.get())); EXPECT_NE(nullptr, old); - EXPECT_TRUE(this->mr->is_equal(*rmm::mr::get_default_resource())); - void* p{nullptr}; - EXPECT_NO_THROW(p = rmm::mr::get_default_resource()->allocate(size_mb)); - EXPECT_NE(nullptr, p); - EXPECT_TRUE(is_aligned(p)); - EXPECT_TRUE(is_device_memory(p)); - EXPECT_NO_THROW(rmm::mr::get_default_resource()->deallocate(p, size_mb)); + + test_get_default_resource(); // test allocating with the new default resource + // setting default resource w/ nullptr should reset to initial EXPECT_NO_THROW(rmm::mr::set_default_resource(nullptr)); EXPECT_TRUE(old->is_equal(*rmm::mr::get_default_resource())); @@ -356,58 +49,28 @@ TYPED_TEST(MRTest, SetDefaultResource) TYPED_TEST(MRTest, SelfEquality) { 
EXPECT_TRUE(this->mr->is_equal(*this->mr)); } -TYPED_TEST(MRTest, AllocateZeroBytes) -{ - void* p{nullptr}; - EXPECT_NO_THROW(p = this->mr->allocate(0)); - EXPECT_EQ(nullptr, p); - EXPECT_NO_THROW(this->mr->deallocate(p, 0)); -} +TYPED_TEST(MRTest, Allocate) { test_various_allocations(this->mr.get()); } -TYPED_TEST(MRTest, AllocateZeroBytesStream) +TYPED_TEST(MRTest, AllocateOnStream) { - void* p{nullptr}; - EXPECT_NO_THROW(p = this->mr->allocate(0, this->stream)); - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); - EXPECT_NO_THROW(this->mr->deallocate(p, 0, this->stream)); - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); + test_various_allocations_on_stream(this->mr.get(), this->stream); } -TYPED_TEST(MRTest, Allocate) -{ - this->test_allocate(size_word); - this->test_allocate(size_kb); - this->test_allocate(size_mb); - this->test_allocate(size_gb); +TYPED_TEST(MRTest, RandomAllocations) { test_random_allocations(this->mr.get()); } - // should fail to allocate too much - void* p{nullptr}; - EXPECT_THROW(p = this->mr->allocate(size_pb), rmm::bad_alloc); - EXPECT_EQ(nullptr, p); +TYPED_TEST(MRTest, RandomAllocationsStream) +{ + test_random_allocations(this->mr.get(), 100, this->stream); } -TYPED_TEST(MRTest, AllocateOnStream) +TYPED_TEST(MRTest, MixedRandomAllocationFree) { - this->test_allocate(size_word, this->stream); - this->test_allocate(size_kb, this->stream); - this->test_allocate(size_mb, this->stream); - this->test_allocate(size_gb, this->stream); - - // should fail to allocate too much - void* p{nullptr}; - EXPECT_THROW(p = this->mr->allocate(size_pb, this->stream), rmm::bad_alloc); - EXPECT_EQ(nullptr, p); + test_mixed_random_allocation_free(this->mr.get(), nullptr); } -TYPED_TEST(MRTest, RandomAllocations) { this->test_random_allocations(); } - -TYPED_TEST(MRTest, RandomAllocationsStream) { this->test_random_allocations(100, this->stream); } - -TYPED_TEST(MRTest, MixedRandomAllocationFree) { 
this->test_mixed_random_allocation_free(); } - TYPED_TEST(MRTest, MixedRandomAllocationFreeStream) { - this->test_mixed_random_allocation_free(this->stream); + test_mixed_random_allocation_free(this->mr.get(), this->stream); } TYPED_TEST(MRTest, GetMemInfo)