diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
index b6123e772..958ea523b 100644
--- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
+++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,14 @@
 #pragma once
 
 #include <limits>
+#include <rmm/detail/aligned.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/logger.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <cuda_runtime_api.h>
 
+#include <cstddef>
 #include <functional>
 #include <map>
 #include <mutex>
@@ -66,10 +68,10 @@ struct crtp {
  * Classes derived from stream_ordered_memory_resource must implement the following four methods,
  * documented separately:
  *
- * 1. `size_t get_maximum_allocation_size() const`
- * 2. `block_type expand_pool(size_t size, free_list& blocks, cuda_stream_view stream)`
- * 3. `split_block allocate_from_block(block_type const& b, size_t size)`
- * 4. `block_type free_block(void* p, size_t size) noexcept`
+ * 1. `std::size_t get_maximum_allocation_size() const`
+ * 2. `block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)`
+ * 3. `split_block allocate_from_block(block_type const& b, std::size_t size)`
+ * 4. `block_type free_block(void* p, std::size_t size) noexcept`
  */
 template <typename PoolResource, typename FreeListType>
 class stream_ordered_memory_resource : public crtp<PoolResource>, public device_memory_resource {
@@ -89,18 +91,18 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
 
   // Derived classes must implement these four methods
 
-  /**
+  /*
    * @brief Get the maximum size of a single allocation supported by this suballocator memory
    * resource
    *
-   * Default implementation is the maximum `size_t` value, but fixed-size allocators will have a
-   * lower limit. Override this function in derived classes as necessary.
+   * Default implementation is the maximum `std::size_t` value, but fixed-size allocators will have
+   * a lower limit. Override this function in derived classes as necessary.
    *
-   * @return size_t The maximum size of a single allocation supported by this memory resource
+   * @return std::size_t The maximum size of a single allocation supported by this memory resource
    */
-  // size_t get_maximum_allocation_size() const { return std::numeric_limits<size_t>::max(); }
+  // std::size_t get_maximum_allocation_size() const
 
-  /**
+  /*
    * @brief Allocate space (typically from upstream) to supply the suballocation pool and return
    * a sufficiently sized block.
    *
@@ -114,15 +116,12 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
    * @param stream The stream on which the memory is to be used.
    * @return block_type a block of at least `size` bytes
    */
-  // block_type expand_pool(size_t size, free_list& blocks, cuda_stream_view stream)
+  // block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)
 
-  /// Struct representing a block that has been split for allocation
-  struct split_block {
-    void* allocated_pointer;  ///< The pointer allocated from a block
-    block_type remainder;     ///< The remainder of the block from which the pointer was allocated
-  };
+  /// Pair representing a block that has been split for allocation
+  using split_block = std::pair<block_type, block_type>;
 
-  /**
+  /*
    * @brief Split block `b` if necessary to return a pointer to memory of `size` bytes.
    *
    * If the block is split, the remainder is returned as the remainder element in the output
@@ -134,9 +133,9 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
    * @return A `split_block` comprising the allocated pointer and any unallocated remainder of the
    * input block.
    */
-  // split_block allocate_from_block(block_type const& b, size_t size)
+  // split_block allocate_from_block(block_type const& b, std::size_t size)
 
-  /**
+  /*
    * @brief Finds, frees and returns the block associated with pointer `p`.
    *
    * @param p The pointer to the memory to free.
@@ -144,7 +143,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
    * @return The (now freed) block associated with `p`. The caller is expected to return the block
    * to the pool.
    */
-  // block_type free_block(void* p, size_t size) noexcept
+  // block_type free_block(void* p, std::size_t size) noexcept
 
   /**
    * @brief Returns the block `b` (last used on stream `stream_event`) to the pool.
@@ -213,16 +212,13 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
                 rmm::bad_alloc,
                 "Maximum allocation size exceeded");
     auto const b = this->underlying().get_block(bytes, stream_event);
-    auto split   = this->underlying().allocate_from_block(b, bytes);
-    if (split.remainder.is_valid()) stream_free_blocks_[stream_event].insert(split.remainder);
-    RMM_LOG_TRACE("[A][stream {:p}][{}B][{:p}]",
-                  fmt::ptr(stream_event.stream),
-                  bytes,
-                  fmt::ptr(split.allocated_pointer));
+
+    RMM_LOG_TRACE(
+      "[A][stream {:p}][{}B][{:p}]", fmt::ptr(stream_event.stream), bytes, fmt::ptr(b.pointer()));
 
     log_summary_trace();
 
-    return split.allocated_pointer;
+    return b.pointer();
   }
 
   /**
@@ -234,9 +230,12 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
    */
   virtual void do_deallocate(void* p, std::size_t bytes, cuda_stream_view stream) override
   {
+    RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", fmt::ptr(stream.value()), bytes, p);
+
+    if (bytes <= 0 || p == nullptr) return;
+
     lock_guard lock(mtx_);
     auto stream_event = get_event(stream);
-    RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", fmt::ptr(stream_event.stream), bytes, p);
 
     bytes        = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
     auto const b = this->underlying().free_block(p, bytes);
@@ -301,19 +300,35 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
   }
 
   /**
-   * @brief Get an avaible memory block of at least `size` bytes
+   * @brief Splits a block into an allocated block of `size` bytes and a remainder block, and
+   * inserts the remainder into a free list.
+   *
+   * @param b The block to split into allocated and remainder portions.
+   * @param size The size of the block to allocate from `b`.
+   * @param blocks The `free_list` in which to insert the remainder block.
+   * @return The allocated block.
+   */
+  block_type allocate_and_insert_remainder(block_type b, std::size_t size, free_list& blocks)
+  {
+    auto const [allocated, remainder] = this->underlying().allocate_from_block(b, size);
+    if (remainder.is_valid()) blocks.insert(remainder);
+    return allocated;
+  }
+
+  /**
+   * @brief Get an available memory block of at least `size` bytes
    *
    * @param size The number of bytes to allocate
    * @param stream_event The stream and associated event on which the allocation will be used.
    * @return block_type A block of memory of at least `size` bytes
    */
-  block_type get_block(size_t size, stream_event_pair stream_event)
+  block_type get_block(std::size_t size, stream_event_pair stream_event)
   {
     // Try to find a satisfactory block in free list for the same stream (no sync required)
     auto iter = stream_free_blocks_.find(stream_event);
     if (iter != stream_free_blocks_.end()) {
-      block_type b = iter->second.get_block(size);
-      if (b.is_valid()) { return b; }
+      block_type const b = iter->second.get_block(size);
+      if (b.is_valid()) { return allocate_and_insert_remainder(b, size, iter->second); }
     }
 
     free_list& blocks =
@@ -334,7 +349,10 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
     log_summary_trace();
 
     // no large enough blocks available after merging, so grow the pool
-    return this->underlying().expand_pool(size, blocks, cuda_stream_view{stream_event.stream});
+    block_type const b =
+      this->underlying().expand_pool(size, blocks, cuda_stream_view{stream_event.stream});
+
+    return allocate_and_insert_remainder(b, size, blocks);
   }
 
   /**
@@ -352,7 +370,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
    * @return A block with non-null pointer and size >= `size`, or a nullptr block if none is
    *         available in `blocks`.
    */
-  block_type get_block_from_other_stream(size_t size,
+  block_type get_block_from_other_stream(std::size_t size,
                                          stream_event_pair stream_event,
                                          free_list& blocks,
                                          bool merge_first)
@@ -375,10 +393,18 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
 
             stream_free_blocks_.erase(it);
 
-            return blocks.get_block(size);  // get the best fit block in merged lists
+            block_type const b = blocks.get_block(size);  // get the best fit block in merged lists
+            if (b.is_valid()) { return allocate_and_insert_remainder(b, size, blocks); }
           } else {
-            return other_blocks.get_block(size);  // get the best fit block in other list
+            block_type const b = other_blocks.get_block(size);
+            if (b.is_valid()) {
+              // Since we found a block associated with a different stream, we have to insert a wait
+              // on the stream's associated event into the allocating stream.
+              RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, other_event, 0));
+              return allocate_and_insert_remainder(b, size, other_blocks);
+            }
           }
+          return block_type{};
         }();
 
         if (b.is_valid()) {
@@ -387,12 +413,6 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
                         fmt::ptr(stream_event.stream),
                         size,
                         fmt::ptr(it->first.stream));
-
-          if (not merge_first) {
-            merge_lists(stream_event, blocks, other_event, std::move(other_blocks));
-            stream_free_blocks_.erase(it);
-          }
-
           return b;
         }
       }
diff --git a/include/rmm/mr/device/fixed_size_memory_resource.hpp b/include/rmm/mr/device/fixed_size_memory_resource.hpp
index fe5f9707b..ac7dbc131 100644
--- a/include/rmm/mr/device/fixed_size_memory_resource.hpp
+++ b/include/rmm/mr/device/fixed_size_memory_resource.hpp
@@ -129,10 +129,10 @@ class fixed_size_memory_resource
   /**
    * @brief Get the (fixed) size of allocations supported by this memory resource
    *
-   * @return size_t The (fixed) maximum size of a single allocation supported by this memory
+   * @return std::size_t The (fixed) maximum size of a single allocation supported by this memory
    * resource
    */
-  size_t get_maximum_allocation_size() const { return get_block_size(); }
+  std::size_t get_maximum_allocation_size() const { return get_block_size(); }
 
   /**
    * @brief Allocate a block from upstream to supply the suballocation pool.
@@ -144,7 +144,7 @@ class fixed_size_memory_resource
    * @param stream The stream on which the memory is to be used.
    * @return block_type The allocated block
    */
-  block_type expand_pool(size_t size, free_list& blocks, cuda_stream_view stream)
+  block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)
   {
     blocks.insert(std::move(blocks_from_upstream(stream)));
     return blocks.get_block(size);
@@ -181,9 +181,9 @@ class fixed_size_memory_resource
    * @return A pair comprising the allocated pointer and any unallocated remainder of the input
    * block.
    */
-  split_block allocate_from_block(block_type const& b, size_t size)
+  split_block allocate_from_block(block_type const& b, std::size_t size)
   {
-    return split_block{b.pointer(), block_type{nullptr}};
+    return {b, block_type{nullptr}};
   }
 
   /**
@@ -195,7 +195,7 @@ class fixed_size_memory_resource
    * @return The (now freed) block associated with `p`. The caller is expected to return the block
    * to the pool.
    */
-  block_type free_block(void* p, size_t size) noexcept
+  block_type free_block(void* p, std::size_t size) noexcept
   {
     // Deallocating a fixed-size block just inserts it in the free list, which is
     // handled by the parent class
diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index 8b09e5208..7a2a5b9c7 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -31,7 +31,7 @@
 #include <cuda_runtime_api.h>
 
 #include <algorithm>
-#include <cstdint>
+#include <cstddef>
 #include <iostream>
 #include <map>
 #include <mutex>
@@ -141,11 +141,14 @@ class pool_memory_resource final
    * @brief Get the maximum size of allocations supported by this memory resource
    *
    * Note this does not depend on the memory size of the device. It simply returns the maximum
-   * value of `size_t`
+   * value of `std::size_t`
    *
-   * @return size_t The maximum size of a single allocation supported by this memory resource
+   * @return std::size_t The maximum size of a single allocation supported by this memory resource
    */
-  size_t get_maximum_allocation_size() const { return std::numeric_limits<size_t>::max(); }
+  std::size_t get_maximum_allocation_size() const
+  {
+    return std::numeric_limits<std::size_t>::max();
+  }
 
   /**
    * @brief Try to expand the pool by allocating a block of at least `min_size` bytes from
@@ -247,7 +250,7 @@ class pool_memory_resource final
    * Returns 0 if the requested size cannot be satisfied.
    *
    * @param size The size of the minimum allocation immediately needed
-   * @return size_t The computed size to grow the pool.
+   * @return std::size_t The computed size to grow the pool.
    */
   std::size_t size_to_grow(std::size_t size) const
   {
@@ -268,7 +271,7 @@ class pool_memory_resource final
    * @param stream The stream on which the memory is to be used.
    * @return block_type The allocated block
    */
-  thrust::optional<block_type> block_from_upstream(size_t size, cuda_stream_view stream)
+  thrust::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
   {
     RMM_LOG_DEBUG("[A][Stream {}][Upstream {}B]", fmt::ptr(stream.value()), size);
 
@@ -294,7 +297,7 @@ class pool_memory_resource final
    * @return A pair comprising the allocated pointer and any unallocated remainder of the input
    * block.
    */
-  split_block allocate_from_block(block_type const& b, size_t size)
+  split_block allocate_from_block(block_type const& b, std::size_t size)
   {
     block_type const alloc{b.pointer(), size, b.is_head()};
 #ifdef RMM_POOL_TRACK_ALLOCATIONS
@@ -303,7 +306,7 @@ class pool_memory_resource final
 
     auto rest =
       (b.size() > size) ? block_type{b.pointer() + size, b.size() - size, false} : block_type{};
-    return {reinterpret_cast<void*>(alloc.pointer()), rest};
+    return {alloc, rest};
   }
 
   /**
@@ -315,7 +318,7 @@ class pool_memory_resource final
    * @return The (now freed) block associated with `p`. The caller is expected to return the block
    * to the pool.
    */
-  block_type free_block(void* p, size_t size) noexcept
+  block_type free_block(void* p, std::size_t size) noexcept
   {
 #ifdef RMM_POOL_TRACK_ALLOCATIONS
     if (p == nullptr) return block_type{};
@@ -338,9 +341,9 @@ class pool_memory_resource final
    *
    * Includes allocated as well as free memory.
    *
-   * @return size_t The total size of the currently allocated pool.
+   * @return std::size_t The total size of the currently allocated pool.
    */
-  size_t pool_size() const noexcept { return current_pool_size_; }
+  std::size_t pool_size() const noexcept { return current_pool_size_; }
 
   /**
    * @brief Free all memory allocated from the upstream memory_resource.
@@ -419,7 +422,7 @@ class pool_memory_resource final
    * @param stream to execute on
    * @return std::pair contaiing free_size and total_size of memory
    */
-  std::pair<size_t, size_t> do_get_mem_info(cuda_stream_view stream) const override
+  std::pair<std::size_t, std::size_t> do_get_mem_info(cuda_stream_view stream) const override
   {
     std::size_t free_size{};
     std::size_t total_size{};