diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index b6123e772..958ea523b 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,14 @@ #pragma once #include +#include #include #include #include #include +#include #include #include #include @@ -66,10 +68,10 @@ struct crtp { * Classes derived from stream_ordered_memory_resource must implement the following four methods, * documented separately: * - * 1. `size_t get_maximum_allocation_size() const` - * 2. `block_type expand_pool(size_t size, free_list& blocks, cuda_stream_view stream)` - * 3. `split_block allocate_from_block(block_type const& b, size_t size)` - * 4. `block_type free_block(void* p, size_t size) noexcept` + * 1. `std::size_t get_maximum_allocation_size() const` + * 2. `block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)` + * 3. `split_block allocate_from_block(block_type const& b, std::size_t size)` + * 4. `block_type free_block(void* p, std::size_t size) noexcept` */ template class stream_ordered_memory_resource : public crtp, public device_memory_resource { @@ -89,18 +91,18 @@ class stream_ordered_memory_resource : public crtp, public device_ // Derived classes must implement these four methods - /** + /* * @brief Get the maximum size of a single allocation supported by this suballocator memory * resource * - * Default implementation is the maximum `size_t` value, but fixed-size allocators will have a - * lower limit. Override this function in derived classes as necessary. + * Default implementation is the maximum `std::size_t` value, but fixed-size allocators will have + * a lower limit. Override this function in derived classes as necessary. * - * @return size_t The maximum size of a single allocation supported by this memory resource + * @return std::size_t The maximum size of a single allocation supported by this memory resource */ - // size_t get_maximum_allocation_size() const { return std::numeric_limits::max(); } + // std::size_t get_maximum_allocation_size() const - /** + /* * @brief Allocate space (typically from upstream) to supply the suballocation pool and return * a sufficiently sized block. * @@ -114,15 +116,12 @@ class stream_ordered_memory_resource : public crtp, public device_ * @param stream The stream on which the memory is to be used. * @return block_type a block of at least `size` bytes */ - // block_type expand_pool(size_t size, free_list& blocks, cuda_stream_view stream) + // block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream) - /// Struct representing a block that has been split for allocation - struct split_block { - void* allocated_pointer; ///< The pointer allocated from a block - block_type remainder; ///< The remainder of the block from which the pointer was allocated - }; + /// Pair representing a block that has been split for allocation + using split_block = std::pair; - /** + /* * @brief Split block `b` if necessary to return a pointer to memory of `size` bytes. * * If the block is split, the remainder is returned as the remainder element in the output @@ -134,9 +133,9 @@ class stream_ordered_memory_resource : public crtp, public device_ * @return A `split_block` comprising the allocated pointer and any unallocated remainder of the * input block. */ - // split_block allocate_from_block(block_type const& b, size_t size) + // split_block allocate_from_block(block_type const& b, std::size_t size) - /** + /* * @brief Finds, frees and returns the block associated with pointer `p`. * * @param p The pointer to the memory to free. @@ -144,7 +143,7 @@ class stream_ordered_memory_resource : public crtp, public device_ * @return The (now freed) block associated with `p`. The caller is expected to return the block * to the pool. */ - // block_type free_block(void* p, size_t size) noexcept + // block_type free_block(void* p, std::size_t size) noexcept /** * @brief Returns the block `b` (last used on stream `stream_event`) to the pool. @@ -213,16 +212,13 @@ class stream_ordered_memory_resource : public crtp, public device_ rmm::bad_alloc, "Maximum allocation size exceeded"); auto const b = this->underlying().get_block(bytes, stream_event); - auto split = this->underlying().allocate_from_block(b, bytes); - if (split.remainder.is_valid()) stream_free_blocks_[stream_event].insert(split.remainder); - RMM_LOG_TRACE("[A][stream {:p}][{}B][{:p}]", - fmt::ptr(stream_event.stream), - bytes, - fmt::ptr(split.allocated_pointer)); + + RMM_LOG_TRACE( + "[A][stream {:p}][{}B][{:p}]", fmt::ptr(stream_event.stream), bytes, fmt::ptr(b.pointer())); log_summary_trace(); - return split.allocated_pointer; + return b.pointer(); } /** @@ -234,9 +230,12 @@ class stream_ordered_memory_resource : public crtp, public device_ */ virtual void do_deallocate(void* p, std::size_t bytes, cuda_stream_view stream) override { + RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", fmt::ptr(stream.value()), bytes, p); + + if (bytes <= 0 || p == nullptr) return; + lock_guard lock(mtx_); auto stream_event = get_event(stream); - RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", fmt::ptr(stream_event.stream), bytes, p); bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); auto const b = this->underlying().free_block(p, bytes); @@ -301,19 +300,35 @@ class stream_ordered_memory_resource : public crtp, public device_ } /** - * @brief Get an avaible memory block of at least `size` bytes + * @brief Splits a block into an allocated block of `size` bytes and a remainder block, and + * inserts the remainder into a free list. + * + * @param b The block to split into allocated and remainder portions. + * @param size The size of the block to allocate from `b`. + * @param blocks The `free_list` in which to insert the remainder block. + * @return The allocated block. + */ + block_type allocate_and_insert_remainder(block_type b, std::size_t size, free_list& blocks) + { + auto const [allocated, remainder] = this->underlying().allocate_from_block(b, size); + if (remainder.is_valid()) blocks.insert(remainder); + return allocated; + } + + /** + * @brief Get an available memory block of at least `size` bytes * * @param size The number of bytes to allocate * @param stream_event The stream and associated event on which the allocation will be used. * @return block_type A block of memory of at least `size` bytes */ - block_type get_block(size_t size, stream_event_pair stream_event) + block_type get_block(std::size_t size, stream_event_pair stream_event) { // Try to find a satisfactory block in free list for the same stream (no sync required) auto iter = stream_free_blocks_.find(stream_event); if (iter != stream_free_blocks_.end()) { - block_type b = iter->second.get_block(size); - if (b.is_valid()) { return b; } + block_type const b = iter->second.get_block(size); + if (b.is_valid()) { return allocate_and_insert_remainder(b, size, iter->second); } } free_list& blocks = @@ -334,7 +349,10 @@ class stream_ordered_memory_resource : public crtp, public device_ log_summary_trace(); // no large enough blocks available after merging, so grow the pool - return this->underlying().expand_pool(size, blocks, cuda_stream_view{stream_event.stream}); + block_type const b = + this->underlying().expand_pool(size, blocks, cuda_stream_view{stream_event.stream}); + + return allocate_and_insert_remainder(b, size, blocks); } /** @@ -352,7 +370,7 @@ class stream_ordered_memory_resource : public crtp, public device_ * @return A block with non-null pointer and size >= `size`, or a nullptr block if none is * available in `blocks`. */ - block_type get_block_from_other_stream(size_t size, + block_type get_block_from_other_stream(std::size_t size, stream_event_pair stream_event, free_list& blocks, bool merge_first) @@ -375,10 +393,18 @@ class stream_ordered_memory_resource : public crtp, public device_ stream_free_blocks_.erase(it); - return blocks.get_block(size); // get the best fit block in merged lists + block_type const b = blocks.get_block(size); // get the best fit block in merged lists + if (b.is_valid()) { return allocate_and_insert_remainder(b, size, blocks); } } else { - return other_blocks.get_block(size); // get the best fit block in other list + block_type const b = other_blocks.get_block(size); + if (b.is_valid()) { + // Since we found a block associated with a different stream, we have to insert a wait + // on the stream's associated event into the allocating stream. + RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, other_event, 0)); + return allocate_and_insert_remainder(b, size, other_blocks); + } } + return block_type{}; }(); if (b.is_valid()) { @@ -387,12 +413,6 @@ class stream_ordered_memory_resource : public crtp, public device_ fmt::ptr(stream_event.stream), size, fmt::ptr(it->first.stream)); - - if (not merge_first) { - merge_lists(stream_event, blocks, other_event, std::move(other_blocks)); - stream_free_blocks_.erase(it); - } - return b; } } diff --git a/include/rmm/mr/device/fixed_size_memory_resource.hpp b/include/rmm/mr/device/fixed_size_memory_resource.hpp index fe5f9707b..ac7dbc131 100644 --- a/include/rmm/mr/device/fixed_size_memory_resource.hpp +++ b/include/rmm/mr/device/fixed_size_memory_resource.hpp @@ -129,10 +129,10 @@ class fixed_size_memory_resource /** * @brief Get the (fixed) size of allocations supported by this memory resource * - * @return size_t The (fixed) maximum size of a single allocation supported by this memory + * @return std::size_t The (fixed) maximum size of a single allocation supported by this memory * resource */ - size_t get_maximum_allocation_size() const { return get_block_size(); } + std::size_t get_maximum_allocation_size() const { return get_block_size(); } /** * @brief Allocate a block from upstream to supply the suballocation pool. @@ -144,7 +144,7 @@ class fixed_size_memory_resource * @param stream The stream on which the memory is to be used. * @return block_type The allocated block */ - block_type expand_pool(size_t size, free_list& blocks, cuda_stream_view stream) + block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream) { blocks.insert(std::move(blocks_from_upstream(stream))); return blocks.get_block(size); @@ -181,9 +181,9 @@ class fixed_size_memory_resource * @return A pair comprising the allocated pointer and any unallocated remainder of the input * block. */ - split_block allocate_from_block(block_type const& b, size_t size) + split_block allocate_from_block(block_type const& b, std::size_t size) { - return split_block{b.pointer(), block_type{nullptr}}; + return {b, block_type{nullptr}}; } /** @@ -195,7 +195,7 @@ class fixed_size_memory_resource * @return The (now freed) block associated with `p`. The caller is expected to return the block * to the pool. */ - block_type free_block(void* p, size_t size) noexcept + block_type free_block(void* p, std::size_t size) noexcept { // Deallocating a fixed-size block just inserts it in the free list, which is // handled by the parent class diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index 8b09e5208..7a2a5b9c7 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include @@ -141,11 +141,14 @@ class pool_memory_resource final * @brief Get the maximum size of allocations supported by this memory resource * * Note this does not depend on the memory size of the device. It simply returns the maximum - * value of `size_t` + * value of `std::size_t` * - * @return size_t The maximum size of a single allocation supported by this memory resource + * @return std::size_t The maximum size of a single allocation supported by this memory resource */ - size_t get_maximum_allocation_size() const { return std::numeric_limits::max(); } + std::size_t get_maximum_allocation_size() const + { + return std::numeric_limits::max(); + } /** * @brief Try to expand the pool by allocating a block of at least `min_size` bytes from @@ -247,7 +250,7 @@ class pool_memory_resource final * Returns 0 if the requested size cannot be satisfied. * * @param size The size of the minimum allocation immediately needed - * @return size_t The computed size to grow the pool. + * @return std::size_t The computed size to grow the pool. */ std::size_t size_to_grow(std::size_t size) const { @@ -268,7 +271,7 @@ class pool_memory_resource final * @param stream The stream on which the memory is to be used. * @return block_type The allocated block */ - thrust::optional block_from_upstream(size_t size, cuda_stream_view stream) + thrust::optional block_from_upstream(std::size_t size, cuda_stream_view stream) { RMM_LOG_DEBUG("[A][Stream {}][Upstream {}B]", fmt::ptr(stream.value()), size); @@ -294,7 +297,7 @@ class pool_memory_resource final * @return A pair comprising the allocated pointer and any unallocated remainder of the input * block. */ - split_block allocate_from_block(block_type const& b, size_t size) + split_block allocate_from_block(block_type const& b, std::size_t size) { block_type const alloc{b.pointer(), size, b.is_head()}; #ifdef RMM_POOL_TRACK_ALLOCATIONS @@ -303,7 +306,7 @@ class pool_memory_resource final auto rest = (b.size() > size) ? block_type{b.pointer() + size, b.size() - size, false} : block_type{}; - return {reinterpret_cast(alloc.pointer()), rest}; + return {alloc, rest}; } /** @@ -315,7 +318,7 @@ class pool_memory_resource final * @return The (now freed) block associated with `p`. The caller is expected to return the block * to the pool. */ - block_type free_block(void* p, size_t size) noexcept + block_type free_block(void* p, std::size_t size) noexcept { #ifdef RMM_POOL_TRACK_ALLOCATIONS if (p == nullptr) return block_type{}; @@ -338,9 +341,9 @@ class pool_memory_resource final * * Includes allocated as well as free memory. * - * @return size_t The total size of the currently allocated pool. + * @return std::size_t The total size of the currently allocated pool. */ - size_t pool_size() const noexcept { return current_pool_size_; } + std::size_t pool_size() const noexcept { return current_pool_size_; } /** * @brief Free all memory allocated from the upstream memory_resource. @@ -419,7 +422,7 @@ class pool_memory_resource final * @param stream to execute on * @return std::pair contaiing free_size and total_size of memory */ - std::pair do_get_mem_info(cuda_stream_view stream) const override + std::pair do_get_mem_info(cuda_stream_view stream) const override { std::size_t free_size{}; std::size_t total_size{};