diff --git a/CHANGELOG.md b/CHANGELOG.md index 67c4e765c..82fa165c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,8 @@ - PR #375 Support out-of-band buffers in Python pickling - PR #391 Add `get_default_resource_type` -- PR #396 Remove deprecated RMM APIs. +- PR #396 Remove deprecated RMM APIs +- PR #425 Add CUDA per-thread default stream support and thread safety to `pool_memory_resource` ## Improvements @@ -29,7 +30,7 @@ - PR #383 Explicitly require NumPy - PR #398 Fix missing head flag in merge_blocks (pool_memory_resource) and improve block class - PR #403 Mark Cython `memory_resource_wrappers` `extern` as `nogil` -- PR #406 Sets Google Benchmark to a fixed version, v1.5.1. +- PR #406 Sets Google Benchmark to a fixed version, v1.5.1 # RMM 0.14.0 (Date TBD) diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 1e5e8000c..954ec2b76 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -58,7 +58,6 @@ allocation remove_at(allocation_vector& allocs, std::size_t index) // nested MR type names can get long... 
using cuda_mr = rmm::mr::cuda_memory_resource; using pool_mr = rmm::mr::pool_memory_resource; -using safe_pool_mr = rmm::mr::thread_safe_resource_adaptor; using fixed_multisize_mr = rmm::mr::fixed_multisize_memory_resource; using hybrid_mr = rmm::mr::hybrid_memory_resource; using safe_hybrid_mr = rmm::mr::thread_safe_resource_adaptor; @@ -184,9 +183,9 @@ resource_wrapper::resource_wrapper() } template <> -resource_wrapper::resource_wrapper() +resource_wrapper::resource_wrapper() { - mr = new rmm::mr::thread_safe_resource_adaptor(new pool_mr(new cuda_mr())); + mr = new pool_mr(new cuda_mr()); } template <> @@ -228,12 +227,10 @@ resource_wrapper::~resource_wrapper() } template <> -resource_wrapper::~resource_wrapper() +resource_wrapper::~resource_wrapper() { - auto pool = mr->get_upstream(); - auto cuda = pool->get_upstream(); + auto cuda = mr->get_upstream(); delete mr; - delete pool; delete cuda; } @@ -299,7 +296,7 @@ void declare_benchmark(std::string name) if (name == "hybrid") BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_hybrid_mr)->Apply(benchmark_range); else if (name == "pool") - BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_pool_mr)->Apply(benchmark_range); + BENCHMARK_TEMPLATE(BM_RandomAllocations, pool_mr)->Apply(benchmark_range); else if (name == "fixed_multisize") BENCHMARK_TEMPLATE(BM_RandomAllocations, fixed_multisize_mr)->Apply(benchmark_range); else if (name == "cnmem") @@ -318,7 +315,7 @@ int main(int argc, char** argv) if (argc > 3) max_size = atoi(argv[3]); declare_benchmark(mr_name); } else { - BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_pool_mr)->Apply(benchmark_range); + BENCHMARK_TEMPLATE(BM_RandomAllocations, pool_mr)->Apply(benchmark_range); BENCHMARK_TEMPLATE(BM_RandomAllocations, safe_hybrid_mr)->Apply(benchmark_range); BENCHMARK_TEMPLATE(BM_RandomAllocations, cnmem_mr)->Apply(benchmark_range); BENCHMARK_TEMPLATE(BM_RandomAllocations, cuda_mr)->Apply(benchmark_range); diff --git a/include/rmm/mr/device/pool_memory_resource.hpp 
b/include/rmm/mr/device/pool_memory_resource.hpp index 7732ce53a..57e98900e 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -22,14 +22,17 @@ #include #include +#include #include -#include +#include #include #include #include #include #include #include +#include +#include #include namespace rmm { @@ -39,6 +42,9 @@ namespace mr { * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from * an upstream memory_resource. * + * Allocation (do_allocate()) and deallocation (do_deallocate()) are thread-safe. Also, + * this class is compatible with CUDA per-thread default stream. + * * @tparam UpstreamResource memory_resource to use for allocating the pool. Implements * rmm::mr::device_memory_resource interface. */ @@ -75,15 +81,24 @@ class pool_memory_resource final : public device_memory_resource { if (maximum_pool_size == default_maximum_size) maximum_pool_size_ = props.totalGlobalMem; - // Allocate initial block - stream_free_blocks_[0].insert(block_from_upstream(initial_pool_size, 0)); + // Allocate initial block and insert into free list for the legacy default stream + stream_free_blocks_[get_event(cudaStreamLegacy)].insert( + block_from_upstream(initial_pool_size, 0)); } /** * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using * the upstream resource. 
*/ - ~pool_memory_resource() { release(); } + ~pool_memory_resource() + { + // Free all pool memory back upstream before detaching per-thread events + release(); +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + for (auto& event : ptds_events_) + event.get().parent = nullptr; +#endif + } /** * @brief Queries whether the resource supports use of non-null CUDA streams for @@ -108,84 +123,99 @@ class pool_memory_resource final : public device_memory_resource { Upstream* get_upstream() const noexcept { return upstream_mr_; } private: - using block = rmm::mr::detail::block; - using free_list = rmm::mr::detail::free_list<>; + using id_type = uint32_t; + using block = rmm::mr::detail::block; + using free_list = rmm::mr::detail::free_list<>; + using lock_guard = std::lock_guard; /** - * @brief Find a free block of at least `size` bytes in `free_list` `blocks` associated with - * stream `blocks_stream`, for use on `stream`. + * @brief A structure pairing a CUDA stream and an associated event for the stream. * - * @param blocks The `free_list` to look in for a free block of sufficient size. - * @param blocks_stream The stream that all blocks in `blocks` are associated with. - * @param size The requested size of the allocation. - - * @param stream The stream on which the allocation is being requested. - * @return block A block with non-null pointer and size >= `size`, or a nullptr block if none is - * available in `blocks`. 
*/ - block block_from_stream(free_list& blocks, - cudaStream_t blocks_stream, - size_t size, - cudaStream_t stream) - { - block const b = blocks.best_fit(size); // get the best fit block - - // If we found a block associated with a different stream, - // we have to synchronize the stream in order to use it - if ((blocks_stream != stream) && b.is_valid()) { - cudaError_t result = cudaStreamSynchronize(blocks_stream); - - RMM_EXPECTS((result == cudaSuccess || // stream synced - result == cudaErrorInvalidResourceHandle), // stream deleted - rmm::bad_alloc, - "cudaStreamSynchronize failure"); + struct stream_event_pair { + cudaStream_t stream; + cudaEvent_t event; - // Now that this stream is synced, insert all other blocks into this stream's list - // Note: This could cause thrashing between two streams. On the other hand, it reduces - // fragmentation by coalescing. - stream_free_blocks_[stream].insert(blocks.begin(), blocks.end()); + bool operator<(stream_event_pair const& rhs) const { return event < rhs.event; } + }; - // remove this stream from the freelist - stream_free_blocks_.erase(blocks_stream); + /** + * @brief Find a free block of at least `size` bytes in a `free_list` with a different + * stream/event than `stream_event`. + * + * If an appropriate block is found in a free list F associated with event E, if + * `CUDA_API_PER_THREAD_DEFAULT_STREAM` is defined, `stream_event.stream` will be made to wait on + * event E. Otherwise, the stream associated with free list F will be synchronized. In either + * case all other blocks in free list F will be moved to the free list associated with + * `stream_event.stream`. This results in coalescing with other blocks in that free list, + * hopefully reducing fragmentation. + * + * @param size The requested size of the allocation. + * @param stream_event The stream and associated event on which the allocation is being requested. 
+ * @return A block with non-null pointer and size >= `size`, or a nullptr block if none is + * available in `blocks`. + */ + block get_block_from_other_stream(size_t size, stream_event_pair stream_event) + { + // nothing in this stream's free list, look for one on another stream + for (auto s = stream_free_blocks_.begin(); s != stream_free_blocks_.end(); ++s) { + auto blocks_event = s->first; + if (blocks_event.event != stream_event.event) { + auto blocks = s->second; + + block const b = blocks.best_fit(size); // get the best fit block + + if (b.is_valid()) { + // Since we found a block associated with a different stream, we have to insert a wait on + // the stream's associated event into the allocating stream. + // TODO: could eliminate this ifdef and have the same behavior for PTDS and non-PTDS + // But the cudaEventRecord() on every free_block reduces performance significantly +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, blocks_event.event, 0)); +#else + RMM_CUDA_TRY(cudaStreamSynchronize(blocks_event.stream)); +#endif + // Move all the blocks to the requesting stream, since it has waited on them + stream_free_blocks_[stream_event].insert(blocks.begin(), blocks.end()); + stream_free_blocks_.erase(s); + + return b; + } + } } - return b; + return block{}; } /** * @brief Find an available block in the pool of at least `size` bytes, for use on `stream`. * - * Attempts to find a free block that was last used on `stream` to avoid synchronization. If none - * is available, it finds a block last used on another stream. In this case, the stream associated - * with the found block is synchronized to ensure all asynchronous work on the memory is finished - * before it is used on `stream`. + * Attempts to find a free block that was last used on `stream` to avoid synchronization. If + * none is available, it finds a block last used on another stream. 
In this case, the stream + * associated with the found block is synchronized to ensure all asynchronous work on the memory + * is finished before it is used on `stream`. + * + * @throw `std::bad_alloc` if the requested allocation could not be fulfilled. * * @param size The size of the requested allocation, in bytes. - * @param stream The stream on which the allocation will be used. + * @param stream_event The stream and associated event on which the allocation is being requested. * @return block A block with non-null pointer and size >= `size`. */ - block available_larger_block(size_t size, cudaStream_t stream) + block available_larger_block(size_t size, stream_event_pair stream_event) { - // Try to find a larger block in free list for the same stream - auto iter = stream_free_blocks_.find(stream); + // Try to find a larger block in free list for the same stream (no sync required) + auto iter = stream_free_blocks_.find(stream_event); if (iter != stream_free_blocks_.end()) { - block b = block_from_stream(iter->second, stream, size, stream); + block b = iter->second.best_fit(size); if (b.is_valid()) return b; } - // nothing in this stream's free list, look for one on another stream - auto s = stream_free_blocks_.begin(); - while (s != stream_free_blocks_.end()) { - if (s->first != stream) { - block b = block_from_stream(s->second, s->first, size, stream); - if (b.is_valid()) return b; - } - ++s; - } + block b = get_block_from_other_stream(size, stream_event); + if (b.is_valid()) return b; // no larger blocks available on other streams, so grow the pool and create a block size_t grow_size = size_to_grow(size); RMM_EXPECTS(grow_size > 0, rmm::bad_alloc, "Maximum pool size exceeded"); - return block_from_upstream(grow_size, stream); + return block_from_upstream(grow_size, stream_event.stream); } /** @@ -195,16 +225,16 @@ class pool_memory_resource final : public device_memory_resource { * * @param b The block to allocate from. 
* @param size The size in bytes of the requested allocation. - * @param stream The stream on which the allocation will be used. + * @param stream_event The stream and associated event on which the allocation will be used. * @return void* The pointer to the allocated memory. */ - void* allocate_from_block(block const& b, size_t size, cudaStream_t stream) + void* allocate_from_block(block const& b, size_t size, stream_event_pair stream_event) { block const alloc{b.pointer(), size, b.is_head()}; if (b.size() > size) { block rest{b.pointer() + size, b.size() - size, false}; - stream_free_blocks_[stream].insert(rest); + stream_free_blocks_[stream_event].insert(rest); } allocated_blocks_.insert(alloc); @@ -222,11 +252,21 @@ class pool_memory_resource final : public device_memory_resource { { if (p == nullptr) return; + stream_event_pair stream_event = get_event(stream); + auto const i = allocated_blocks_.find(static_cast(p)); assert(i != allocated_blocks_.end()); - assert(i->size == rmm::detail::align_up(size, allocation_alignment)); + assert(i->size() == rmm::detail::align_up(size, allocation_alignment)); - stream_free_blocks_[stream].insert(*i); + // TODO: cudaEventRecord has significant overhead on deallocations, however it could mean less + // synchronization So we need to test in real non-PTDS applications that have multiple streams + // whether or not the overhead is worth it +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + auto result = cudaEventRecord(stream_event.event, stream); + assert(cudaSuccess == result); +#endif + + stream_free_blocks_[stream_event].insert(*i); allocated_blocks_.erase(i); } @@ -284,9 +324,18 @@ class pool_memory_resource final : public device_memory_resource { */ void release() { + lock_guard lock(mtx_); + for (auto b : upstream_blocks_) upstream_mr_->deallocate(b.pointer(), b.size()); upstream_blocks_.clear(); + allocated_blocks_.clear(); + + for (auto s_e : stream_events_) + destroy_event(s_e.second); + stream_events_.clear(); + 
stream_free_blocks_.clear(); + current_pool_size_ = 0; } @@ -297,6 +346,8 @@ class pool_memory_resource final : public device_memory_resource { */ void print() { + lock_guard lock(mtx_); + std::size_t free, total; std::tie(free, total) = upstream_mr_->get_mem_info(0); std::cout << "GPU free memory: " << free << "total: " << total << "\n"; @@ -306,7 +357,7 @@ class pool_memory_resource final : public device_memory_resource { for (auto h : upstream_blocks_) { h.print(); - upstream_total += h.size; + upstream_total += h.size(); } std::cout << "total upstream: " << upstream_total << " B\n"; @@ -317,7 +368,7 @@ class pool_memory_resource final : public device_memory_resource { std::cout << "sync free blocks: "; for (auto s : stream_free_blocks_) { - std::cout << "stream " << s.first << " "; + std::cout << "stream: " << s.first.stream << " event: " << s.first.event << " "; s.second.print(); } std::cout << "\n"; @@ -338,9 +389,14 @@ class pool_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, cudaStream_t stream) override { if (bytes <= 0) return nullptr; - bytes = rmm::detail::align_up(bytes, allocation_alignment); - block const b = available_larger_block(bytes, stream); - return allocate_from_block(b, bytes, stream); + + lock_guard lock(mtx_); + + stream_event_pair stream_event = get_event(stream); + bytes = rmm::detail::align_up(bytes, allocation_alignment); + block const b = available_larger_block(bytes, stream_event); + auto p = allocate_from_block(b, bytes, stream_event); + return p; } /** @@ -352,6 +408,7 @@ class pool_memory_resource final : public device_memory_resource { */ void do_deallocate(void* p, std::size_t bytes, cudaStream_t stream) override { + lock_guard lock(mtx_); free_block(p, bytes, stream); } @@ -371,19 +428,125 @@ class pool_memory_resource final : public device_memory_resource { return std::make_pair(free_size, total_size); } +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + /** + * @brief RAII wrapper for a 
CUDA event for a per-thread default stream + * + * These objects take care of creating and freeing an event associated with a per-thread default + * stream. They are needed because the event needs to exist in thread_local memory, so it must + * be cleaned up when the thread exits. They maintain a pointer to the parent + * (pool_memory_resource) that created them, because when a thread exits, if the parent still + * exists, they must tell the parent to merge the free list associated with the event. Also, the + * parent maintains a list of references to the created cuda_event objects so that if any remain + * when the parent is destroyed, it can set their parent pointers to nullptr so we don't have a + * use-after-free race. Note: all of this is a workaround for the fact that there is no way + * currently to get a unique handle to a CUDA per-thread default stream. :( + */ + struct default_stream_event { + default_stream_event(pool_memory_resource* parent) : parent(parent) + { + auto result = cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + assert(cudaSuccess == result); + if (parent) parent->ptds_events_.push_back(*this); + } + ~default_stream_event() + { + if (parent) { + lock_guard lock(parent->mtx_); + parent->destroy_event(stream_event_pair{cudaStreamDefault, event}); + } + } + + cudaEvent_t event; + pool_memory_resource* parent; + }; +#endif + + /** + * @brief get a unique CUDA event (possibly new) associated with `stream` + * + * The event is created on the first call, and it is not recorded. If compiled for per-thread + * default stream and `stream` is the default stream, the event is created in thread local memory + * and is unique per CPU thread. + * + * @param stream The stream for which to get an event. + * @return The stream_event for `stream`. 
+ */ + stream_event_pair get_event(cudaStream_t stream) + { +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + if (cudaStreamDefault == stream || cudaStreamPerThread == stream) { + static thread_local default_stream_event e{this}; + return stream_event_pair{stream, e.event}; + } +#else + // We use cudaStreamLegacy as the event map key for the default stream for consistency between + // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the + // user explicitly passes it, so it is used as the default location for the free list + // at construction, and for merging free lists when a thread exits (see destroy_event()). + // For consistency, the same key is used for null stream free lists in non-PTDS mode. + if (cudaStreamDefault == stream) { stream = cudaStreamLegacy; } +#endif + + auto iter = stream_events_.find(stream); + if (iter == stream_events_.end()) { + stream_event_pair stream_event{stream}; + auto result = cudaEventCreateWithFlags(&stream_event.event, cudaEventDisableTiming); + assert(cudaSuccess == result); + stream_events_[stream] = stream_event; + return stream_event; + } else { + return iter->second; + } + } + + /** + * @brief Destroy the specified CUDA event and move all free blocks for the associated stream + * to the default stream free list. + * + * @param event The event to destroy. 
+ */ + void destroy_event(stream_event_pair stream_event) + { + // If we are destroying an event with associated free list, we need to synchronize that event + // and then merge its free list into the (legacy) default stream's list + auto free_list_iter = stream_free_blocks_.find(stream_event); + if (free_list_iter != stream_free_blocks_.end()) { + auto blocks = free_list_iter->second; + stream_free_blocks_[get_event(cudaStreamLegacy)].insert(blocks.begin(), blocks.end()); + stream_free_blocks_.erase(free_list_iter); + + auto result = cudaEventSynchronize(stream_event.event); + assert(cudaSuccess == result); + } + auto result = cudaEventDestroy(stream_event.event); + assert(cudaSuccess == result); + } + size_t maximum_pool_size_; size_t current_pool_size_{0}; Upstream* upstream_mr_; // The "heap" to allocate the pool from - // map of [stream_id, free_list] pairs - // stream stream_id must be synced before allocating from this list to a different stream - std::map stream_free_blocks_; + // map of [cudaEvent_t, free_list] pairs + // Event (or associated stream) must be synced before allocating from associated free_list to a + // different stream + std::map stream_free_blocks_; std::set> allocated_blocks_; // blocks allocated from upstream: so they can be easily freed std::vector upstream_blocks_; + + // bidirectional mapping between non-default streams and events + std::unordered_map stream_events_; + +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + // references to per-thread events to avoid use-after-free when threads exit after MR is deleted + std::list> ptds_events_; +#endif + + std::mutex mutable mtx_; // mutex for thread-safe access }; } // namespace mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cad684133..6f20478cf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,7 +53,9 @@ include_directories("${GTEST_INCLUDE_DIR}" ################################################################################################### # - library 
paths --------------------------------------------------------------------------------- -link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc +# CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link +# directories for nvcc +link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" "${CMAKE_BINARY_DIR}/lib" "${GTEST_LIBRARY_DIR}") @@ -66,10 +68,11 @@ link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT ################################################################################################### ################################################################################################### -# - device mr tests ------------------------------------------------------------------------------------- +# - device mr tests -------------------------------------------------------------------------------- set(DEVICE_MR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/thrust_allocator_tests.cu") @@ -77,16 +80,28 @@ ConfigureTest(DEVICE_MR_TEST "${DEVICE_MR_TEST_SRC}") ################################################################################################### ################################################################################################### -# - host mr tests ------------------------------------------------------------------------------------- +# - device mr per-thread default stream tests------------------------------------------------------ + + +set(DEVICE_MR_PTDS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp") + +ConfigureTest(DEVICE_MR_PTDS_TEST "${DEVICE_MR_PTDS_TEST_SRC}") +target_compile_definitions(DEVICE_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + + 
+################################################################################################### +################################################################################################### +# - host mr tests ---------------------------------------------------------------------------------- set(HOST_MR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/mr/host/mr_tests.cpp") - ConfigureTest(HOST_MR_TEST "${HOST_MR_TEST_SRC}") + ################################################################################################### ################################################################################################### -# - device buffer tests ------------------------------------------------------------------------------------- +# - device buffer tests ---------------------------------------------------------------------------- set(BUFFER_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_buffer_tests.cu") @@ -95,14 +110,14 @@ ConfigureTest(DEVICE_BUFFER_TEST "${BUFFER_TEST_SRC}") ################################################################################################### ################################################################################################### -# - device scalar tests ------------------------------------------------------------------------------------- +# - device scalar tests ---------------------------------------------------------------------------- set(SCALAR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_scalar_tests.cpp") ConfigureTest(DEVICE_SCALAR_TEST "${SCALAR_TEST_SRC}") ################################################################################################### -# - logger tests ------------------------------------------------------------------------------------- +# - logger tests ---------------------------------------------------------------------------------- set(LOGGER_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/logger_tests.cpp") @@ -110,7 +125,7 @@ set(LOGGER_TEST_SRC ConfigureTest(LOGGER_TEST "${LOGGER_TEST_SRC}") 
################################################################################################### -# - uvector tests ------------------------------------------------------------------------------------- +# - uvector tests --------------------------------------------------------------------------------- set(DEVICE_UVECTOR_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/device_uvector_tests.cu") diff --git a/tests/mr/device/mr_multithreaded_tests.cpp b/tests/mr/device/mr_multithreaded_tests.cpp new file mode 100644 index 000000000..2cb844b55 --- /dev/null +++ b/tests/mr/device/mr_multithreaded_tests.cpp @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" +#include "mr/device/cuda_memory_resource.hpp" +#include "mr/device/default_memory_resource.hpp" +#include "mr/device/pool_memory_resource.hpp" +#include "mr_test.hpp" + +#include +#include + +namespace { + +using thread_safe_fixed_size_mr = rmm::mr::thread_safe_resource_adaptor; +using thread_safe_fixed_multisize_mr = rmm::mr::thread_safe_resource_adaptor; +using thread_safe_fixed_multisize_pool_mr = + rmm::mr::thread_safe_resource_adaptor; +using thread_safe_hybrid_mr = rmm::mr::thread_safe_resource_adaptor; + +constexpr std::size_t num_threads{4}; + +template +void spawn(Task task, Arguments... 
args) +{ + std::vector threads; + threads.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) + threads.emplace_back(std::thread(task, args...)); + + for (auto& t : threads) + t.join(); +} + +} // namespace + +// specialize test constructor for thread-safe types + +template <> +inline MRTest::MRTest() + : mr{new thread_safe_fixed_size_mr(new fixed_size_mr(rmm::mr::get_default_resource()))} +{ +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = mr->get_upstream(); + this->mr.reset(); + delete fixed; +} + +template <> +inline MRTest::MRTest() + : mr{new thread_safe_fixed_multisize_mr(new fixed_multisize_mr(rmm::mr::get_default_resource()))} +{ +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = mr->get_upstream(); + this->mr.reset(); + delete fixed; +} + +template <> +inline MRTest::MRTest() + : mr{new thread_safe_fixed_multisize_pool_mr( + new fixed_multisize_pool_mr(new pool_mr(new rmm::mr::cuda_memory_resource)))} +{ +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = mr->get_upstream(); + auto pool = fixed->get_upstream(); + auto cuda = pool->get_upstream(); + this->mr.reset(); + delete fixed; + delete pool; + delete cuda; +} + +template <> +inline MRTest::MRTest() +{ + rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; + pool_mr* pool = new pool_mr(cuda); + this->mr.reset(new thread_safe_hybrid_mr(new hybrid_mr(new fixed_multisize_pool_mr(pool), pool))); +} + +template <> +inline MRTest::~MRTest() +{ + auto hybrid = mr->get_upstream(); + auto fixed = hybrid->get_small_mr(); + auto pool = hybrid->get_large_mr(); + auto cuda = pool->get_upstream(); + this->mr.reset(); + delete hybrid; + delete fixed; + delete pool; + delete cuda; +} + +// specialize get_max_size for thread-safe MRs +template <> +std::size_t get_max_size(thread_safe_fixed_size_mr* mr) +{ + return mr->get_upstream()->get_block_size(); +} + +template <> +std::size_t get_max_size(thread_safe_fixed_multisize_mr* mr) +{ + return 
mr->get_upstream()->get_max_size(); +} + +template <> +std::size_t get_max_size(thread_safe_fixed_multisize_pool_mr* mr) +{ + return mr->get_upstream()->get_max_size(); +} + +// specialize random allocations to not allocate too large +template <> +inline void test_random_allocations(thread_safe_fixed_size_mr* mr, + std::size_t num_allocations, + cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_random_allocations( + thread_safe_fixed_multisize_mr* mr, std::size_t num_allocations, cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_random_allocations( + thread_safe_fixed_multisize_pool_mr* mr, std::size_t num_allocations, cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free( + thread_safe_fixed_size_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 1_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free( + thread_safe_fixed_multisize_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 4_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free( + thread_safe_fixed_multisize_pool_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 4_MiB, stream); +} + +// Test on all memory resource classes +using resources = ::testing::Types; + +template +using MRTest_mt = MRTest; + +TYPED_TEST_CASE(MRTest_mt, resources); + +TEST(DefaultTest, UseDefaultResource_mt) { spawn(test_get_default_resource); } + +TYPED_TEST(MRTest_mt, SetDefaultResource_mt) +{ + // single thread changes default resource, then multiple threads use it + + // Not necessarily false, since two cuda_memory_resources are always equal + // EXPECT_FALSE(this->mr->is_equal(*rmm::mr::get_default_resource())); + 
rmm::mr::device_memory_resource* old{nullptr}; + EXPECT_NO_THROW(old = rmm::mr::set_default_resource(this->mr.get())); + EXPECT_NE(nullptr, old); + + spawn([mr = this->mr.get()]() { + EXPECT_EQ(mr, rmm::mr::get_default_resource()); + test_get_default_resource(); // test allocating with the new default resource + }); + + // setting default resource w/ nullptr should reset to initial + EXPECT_NO_THROW(rmm::mr::set_default_resource(nullptr)); + EXPECT_TRUE(old->is_equal(*rmm::mr::get_default_resource())); + // Not necessarily false, since two cuda_memory_resources are always equal + // EXPECT_FALSE(this->mr->is_equal(*rmm::mr::get_default_resource())); +} + +TYPED_TEST(MRTest_mt, Allocate) { spawn(test_various_allocations, this->mr.get()); } + +TYPED_TEST(MRTest_mt, AllocateOnStream) +{ + spawn(test_various_allocations_on_stream, this->mr.get(), this->stream); +} + +TYPED_TEST(MRTest_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->mr.get(), 100, nullptr); +} + +TYPED_TEST(MRTest_mt, RandomAllocationsStream) +{ + spawn(test_random_allocations, this->mr.get(), 100, this->stream); +} + +TYPED_TEST(MRTest_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->mr.get(), nullptr); +} + +TYPED_TEST(MRTest_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_allocation_free, this->mr.get(), this->stream); +} + +template +void allocate_loop(MemoryResourceType* mr, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaStream_t stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + size_t size = size_distribution(generator); + void* ptr{}; + EXPECT_NO_THROW(ptr = mr->allocate(size, stream)); + { + std::lock_guard lock(mtx); + allocations.emplace_back(ptr, size); + } + } +} + +template +void deallocate_loop(MemoryResourceType* mr, + 
std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaStream_t stream) +{ + for (std::size_t i = 0; i < num_allocations;) { + { + std::lock_guard lock(mtx); + if (allocations.empty()) + continue; + else { + i++; + allocation alloc = allocations.front(); + allocations.pop_front(); + EXPECT_NO_THROW(mr->deallocate(alloc.p, alloc.size, stream)); + } + } + } +} + +template +void test_allocate_free_different_threads(MemoryResourceType* mr, + cudaStream_t streamA, + cudaStream_t streamB) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + constexpr std::size_t max_size{1_MiB}; + + std::uniform_int_distribution size_distribution(1, max_size); + + std::mutex mtx; + std::list allocations; + + std::thread producer(allocate_loop, + mr, + num_allocations, + std::ref(allocations), + std::ref(mtx), + streamA); + std::thread consumer(deallocate_loop, + mr, + num_allocations, + std::ref(allocations), + std::ref(mtx), + streamB); + + producer.join(); + consumer.join(); +} + +TYPED_TEST(MRTest_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_free_different_threads(this->mr.get(), nullptr, nullptr); +} + +TYPED_TEST(MRTest_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_free_different_threads(this->mr.get(), this->stream, this->stream); +} + +// cnmem does not allow freeing on a different stream than allocating +using resources_different_stream = ::testing::Types; + +template +using MRTestDifferentStream_mt = MRTest; + +TYPED_TEST_CASE(MRTestDifferentStream_mt, resources_different_stream); + +TYPED_TEST(MRTestDifferentStream_mt, AllocFreeDifferentThreadsDifferentStream) +{ + cudaStream_t streamB{}; + EXPECT_EQ(cudaSuccess, cudaStreamCreate(&streamB)); + test_allocate_free_different_threads(this->mr.get(), this->stream, streamB); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(streamB)); + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(streamB)); +} diff --git a/tests/mr/device/mr_test.hpp 
b/tests/mr/device/mr_test.hpp new file mode 100644 index 000000000..f9e9d8663 --- /dev/null +++ b/tests/mr/device/mr_test.hpp @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace { + +inline bool is_aligned(void* p, std::size_t alignment = 256) +{ + return (0 == reinterpret_cast(p) % alignment); +} + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. 
+ */ +inline bool is_device_memory(void* p) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, p)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +// some useful allocation sizes +constexpr long operator""_B(unsigned long long const x) { return x; } +constexpr long operator""_KiB(unsigned long long const x) { return x * (long{1} << 10); } +constexpr long operator""_MiB(unsigned long long const x) { return x * (long{1} << 20); } +constexpr long operator""_GiB(unsigned long long const x) { return x * (long{1} << 30); } +constexpr long operator""_TiB(unsigned long long const x) { return x * (long{1} << 40); } +constexpr long operator""_PiB(unsigned long long const x) { return x * (long{1} << 50); } + +struct allocation { + void* p{nullptr}; + std::size_t size{0}; + allocation(void* _p, std::size_t _size) : p{_p}, size{_size} {} + allocation() = default; +}; + +// nested MR type names can get long... +using pool_mr = rmm::mr::pool_memory_resource; +using fixed_size_mr = rmm::mr::fixed_size_memory_resource; +using fixed_multisize_mr = + rmm::mr::fixed_multisize_memory_resource; +using fixed_multisize_pool_mr = rmm::mr::fixed_multisize_memory_resource; +using hybrid_mr = rmm::mr::hybrid_memory_resource; + +} // namespace + +template +std::size_t get_max_size(MemoryResourceType* mr) +{ + return std::numeric_limits::max(); +} + +template <> +inline std::size_t get_max_size(fixed_size_mr* mr) +{ + return mr->get_block_size(); +} + +template <> +inline std::size_t get_max_size(fixed_multisize_mr* mr) +{ + return mr->get_max_size(); +} + +// Various test functions, shared between single-threaded and multithreaded tests. 
+ +inline void test_get_default_resource() +{ + EXPECT_NE(nullptr, rmm::mr::get_default_resource()); + void* p{nullptr}; + EXPECT_NO_THROW(p = rmm::mr::get_default_resource()->allocate(1_MiB)); + EXPECT_NE(nullptr, p); + EXPECT_TRUE(is_aligned(p)); + EXPECT_TRUE(is_device_memory(p)); + EXPECT_NO_THROW(rmm::mr::get_default_resource()->deallocate(p, 1_MiB)); +} + +template +void test_allocate(MemoryResourceType* mr, std::size_t bytes, cudaStream_t stream = 0) +{ + void* p{nullptr}; + if (bytes > get_max_size(mr)) { + EXPECT_THROW(p = mr->allocate(bytes), std::bad_alloc); + } else { + EXPECT_NO_THROW(p = mr->allocate(bytes)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_NE(nullptr, p); + EXPECT_TRUE(is_aligned(p)); + EXPECT_TRUE(is_device_memory(p)); + EXPECT_NO_THROW(mr->deallocate(p, bytes)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + } +} + +template +void test_various_allocations(MemoryResourceType* mr) +{ + // test allocating zero bytes + { + void* p{nullptr}; + EXPECT_NO_THROW(p = mr->allocate(0)); + EXPECT_EQ(nullptr, p); + EXPECT_NO_THROW(mr->deallocate(p, 0)); + } + + test_allocate(mr, 4_B); + test_allocate(mr, 1_KiB); + test_allocate(mr, 1_MiB); + test_allocate(mr, 1_GiB); + + // should fail to allocate too much + { + void* p{nullptr}; + EXPECT_THROW(p = mr->allocate(1_PiB), rmm::bad_alloc); + EXPECT_EQ(nullptr, p); + } +} + +template +void test_various_allocations_on_stream(MemoryResourceType* mr, cudaStream_t stream = 0) +{ + // test allocating zero bytes on non-default stream + { + void* p{nullptr}; + EXPECT_NO_THROW(p = mr->allocate(0, stream)); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_NO_THROW(mr->deallocate(p, 0, stream)); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + } + + test_allocate(mr, 4_B, stream); + test_allocate(mr, 1_KiB, stream); + test_allocate(mr, 1_MiB, stream); + test_allocate(mr, 1_GiB, stream); + + // should fail to allocate too 
much + { + void* p{nullptr}; + EXPECT_THROW(p = mr->allocate(1_PiB, stream), rmm::bad_alloc); + EXPECT_EQ(nullptr, p); + } +} + +template +void test_random_allocations_base(MemoryResourceType* mr, + std::size_t num_allocations = 100, + std::size_t max_size = 5_MiB, + cudaStream_t stream = 0) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // 100 allocations from [0,5MB) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, stream, mr](allocation& a) { + a.size = distribution(generator); + EXPECT_NO_THROW(a.p = mr->allocate(a.size, stream)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_NE(nullptr, a.p); + EXPECT_TRUE(is_aligned(a.p)); + }); + + std::for_each( + allocations.begin(), allocations.end(), [generator, distribution, stream, mr](allocation& a) { + EXPECT_NO_THROW(mr->deallocate(a.p, a.size, stream)); + if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + }); +} + +template +void test_random_allocations(MemoryResourceType* mr, + std::size_t num_allocations = 100, + cudaStream_t stream = 0) +{ + return test_random_allocations_base(mr, num_allocations, 5_MiB, stream); +} + +template <> +inline void test_random_allocations(fixed_size_mr* mr, + std::size_t num_allocations, + cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template <> +inline void test_random_allocations(fixed_multisize_mr* mr, + std::size_t num_allocations, + cudaStream_t stream) +{ + return test_random_allocations_base(mr, num_allocations, 1_MiB, stream); +} + +template +void test_mixed_random_allocation_free_base(MemoryResourceType* mr, + std::size_t max_size = 5_MiB, + cudaStream_t stream = 0) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, 
max_size); + + constexpr int allocation_probability = 53; // percent + std::uniform_int_distribution op_distribution(0, 99); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + int active_allocations{0}; + int allocation_count{0}; + + std::vector allocations; + + for (int i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(mr->allocate(size, stream), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.p); + EXPECT_TRUE(is_aligned(new_allocation.p)); + } else { + size_t index = index_distribution(generator) % active_allocations; + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(mr->deallocate(to_free.p, to_free.size, stream)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +template +void test_mixed_random_allocation_free(MemoryResourceType* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 5_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free(fixed_size_mr* mr, cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 1_MiB, stream); +} + +template <> +inline void test_mixed_random_allocation_free(fixed_multisize_mr* mr, + cudaStream_t stream) +{ + test_mixed_random_allocation_free_base(mr, 4_MiB, stream); +} + +// The test fixture +template +struct MRTest : public ::testing::Test { + std::unique_ptr mr; + cudaStream_t stream; + + MRTest() : mr{new MemoryResourceType} {} + + void SetUp() override { EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); } + + void 
TearDown() override { EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); }; + + ~MRTest() {} +}; + +// Specialize constructor to pass arguments +template <> +inline MRTest::MRTest() : mr{new fixed_size_mr{rmm::mr::get_default_resource()}} +{ +} + +template <> +inline MRTest::MRTest() + : mr{new fixed_multisize_mr(rmm::mr::get_default_resource())} +{ +} + +template <> +inline MRTest::MRTest() : mr{} +{ + rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; + this->mr.reset(new pool_mr(cuda)); +} + +template <> +inline MRTest::MRTest() +{ + rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; + pool_mr* pool = new pool_mr(cuda); + this->mr.reset(new hybrid_mr(new fixed_multisize_pool_mr(pool), pool)); +} + +template <> +inline MRTest::~MRTest() +{ + auto upstream = this->mr->get_upstream(); + this->mr.reset(); + delete upstream; +} + +template <> +inline MRTest::~MRTest() +{ + auto fixed = this->mr->get_small_mr(); + auto pool = this->mr->get_large_mr(); + auto cuda = pool->get_upstream(); + this->mr.reset(); + delete fixed; + delete pool; + delete cuda; +} diff --git a/tests/mr/device/mr_tests.cpp b/tests/mr/device/mr_tests.cpp index 6131938ad..5f62a8e3a 100644 --- a/tests/mr/device/mr_tests.cpp +++ b/tests/mr/device/mr_tests.cpp @@ -14,307 +14,13 @@ * limitations under the License. */ -#include "gtest/gtest.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace { -static constexpr std::size_t ALIGNMENT{256}; -inline bool is_aligned(void* p, std::size_t alignment = ALIGNMENT) -{ - return (0 == reinterpret_cast(p) % alignment); -} - -/** - * @brief Returns if a pointer points to a device memory or managed memory - * allocation. 
- */ -inline bool is_device_memory(void* p) -{ - cudaPointerAttributes attributes{}; - if (cudaSuccess != cudaPointerGetAttributes(&attributes, p)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else - return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif -} - -// some useful allocation sizes -static constexpr std::size_t size_word{4}; -static constexpr std::size_t size_kb{std::size_t{1} << 10}; -static constexpr std::size_t size_mb{std::size_t{1} << 20}; -static constexpr std::size_t size_gb{std::size_t{1} << 30}; -static constexpr std::size_t size_tb{std::size_t{1} << 40}; -static constexpr std::size_t size_pb{std::size_t{1} << 50}; - -struct allocation { - void* p{nullptr}; - std::size_t size{0}; - allocation(void* _p, std::size_t _size) : p{_p}, size{_size} {} - allocation() = default; -}; -} // namespace - -// nested MR type names can get long... 
-using pool_mr = rmm::mr::pool_memory_resource; -using fixed_size_mr = rmm::mr::fixed_size_memory_resource; -using fixed_multisize_mr = - rmm::mr::fixed_multisize_memory_resource; -using fixed_multisize_pool_mr = rmm::mr::fixed_multisize_memory_resource; -using hybrid_mr = rmm::mr::hybrid_memory_resource; - -using thread_safe_cuda_mr = rmm::mr::thread_safe_resource_adaptor; - -template -struct MRTest : public ::testing::Test { - std::unique_ptr mr; - cudaStream_t stream; - - MRTest() : mr{new MemoryResourceType} {} - - void SetUp() override { EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); } - - void TearDown() override { EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); }; - - ~MRTest() {} - - void test_allocate(std::size_t bytes, cudaStream_t stream = 0); - void test_random_allocations_base(std::size_t num_allocations = 100, - std::size_t max_size = 5 * size_mb, - cudaStream_t stream = 0); - void test_random_allocations(std::size_t num_allocations = 100, cudaStream_t stream = 0); - void test_mixed_random_allocation_free_base(std::size_t max_size = 5 * size_mb, - cudaStream_t stream = 0); - void test_mixed_random_allocation_free(cudaStream_t stream = 0); -}; - -// Specialize constructor to pass arguments -template <> -MRTest::MRTest() : mr{new fixed_size_mr{rmm::mr::get_default_resource()}} -{ -} - -template <> -MRTest::MRTest() : mr{new fixed_multisize_mr(rmm::mr::get_default_resource())} -{ -} - -template <> -MRTest::MRTest() -{ - rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; - this->mr.reset(new pool_mr(cuda)); -} - -template <> -MRTest::MRTest() -{ - rmm::mr::cuda_memory_resource* cuda = new rmm::mr::cuda_memory_resource{}; - pool_mr* pool = new pool_mr(cuda); - this->mr.reset(new hybrid_mr(new fixed_multisize_pool_mr(pool), pool)); -} - -template <> -MRTest::~MRTest() -{ - auto upstream = this->mr->get_upstream(); - this->mr.reset(); - delete upstream; -} - -template <> -MRTest::~MRTest() -{ - auto small = 
this->mr->get_small_mr(); - auto large = this->mr->get_large_mr(); - this->mr.reset(); - delete small; - delete large; -} - -template <> -MRTest::MRTest() - : mr{new thread_safe_cuda_mr(new rmm::mr::cuda_memory_resource)} -{ -} - -template <> -MRTest::~MRTest() -{ - auto upstream = mr->get_upstream(); - delete upstream; -} - -template -std::size_t get_max_size(MemoryResourceType* mr) -{ - return std::numeric_limits::max(); -} - -template <> -std::size_t get_max_size(fixed_size_mr* mr) -{ - return mr->get_block_size(); -} - -template <> -std::size_t get_max_size(fixed_multisize_mr* mr) -{ - return mr->get_max_size(); -} - -template -void MRTest::test_allocate(std::size_t bytes, cudaStream_t stream) -{ - void* p{nullptr}; - if (bytes > get_max_size(this->mr.get())) { - EXPECT_THROW(p = this->mr->allocate(bytes), std::bad_alloc); - } else { - EXPECT_NO_THROW(p = this->mr->allocate(bytes)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); - EXPECT_NE(nullptr, p); - EXPECT_TRUE(is_aligned(p)); - EXPECT_TRUE(is_device_memory(p)); - EXPECT_NO_THROW(this->mr->deallocate(p, bytes)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); - } -} - -template -void MRTest::test_random_allocations_base(std::size_t num_allocations, - std::size_t max_size, - cudaStream_t stream) -{ - std::vector allocations(num_allocations); - - std::default_random_engine generator; - std::uniform_int_distribution distribution(1, max_size); - - // 100 allocations from [0,5MB) - std::for_each(allocations.begin(), - allocations.end(), - [&generator, &distribution, stream, this](allocation& a) { - a.size = distribution(generator); - EXPECT_NO_THROW(a.p = this->mr->allocate(a.size, stream)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - EXPECT_NE(nullptr, a.p); - EXPECT_TRUE(is_aligned(a.p)); - }); - - std::for_each( - allocations.begin(), allocations.end(), [generator, distribution, stream, this](allocation& a) { - 
EXPECT_NO_THROW(this->mr->deallocate(a.p, a.size, stream)); - if (stream != 0) EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - }); -} - -template -void MRTest::test_random_allocations(std::size_t num_allocations, - cudaStream_t stream) -{ - return test_random_allocations_base(num_allocations, 5 * size_mb, stream); -} - -template <> -void MRTest::test_random_allocations(std::size_t num_allocations, - cudaStream_t stream) -{ - return test_random_allocations_base(num_allocations, 1 * size_mb, stream); -} - -template <> -void MRTest::test_random_allocations(std::size_t num_allocations, - cudaStream_t stream) -{ - return test_random_allocations_base(num_allocations, 4 * size_mb, stream); -} - -template -void MRTest::test_mixed_random_allocation_free_base(std::size_t max_size, - cudaStream_t stream) -{ - std::default_random_engine generator; - constexpr std::size_t num_allocations{100}; - - std::uniform_int_distribution size_distribution(1, max_size); - - constexpr int allocation_probability = 53; // percent - std::uniform_int_distribution op_distribution(0, 99); - std::uniform_int_distribution index_distribution(0, num_allocations - 1); - - int active_allocations{0}; - int allocation_count{0}; - - std::vector allocations; - - for (int i = 0; i < num_allocations * 2; ++i) { - bool do_alloc = true; - if (active_allocations > 0) { - int chance = op_distribution(generator); - do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); - } - - if (do_alloc) { - size_t size = size_distribution(generator); - active_allocations++; - allocation_count++; - EXPECT_NO_THROW(allocations.emplace_back(this->mr->allocate(size, stream), size)); - auto new_allocation = allocations.back(); - EXPECT_NE(nullptr, new_allocation.p); - EXPECT_TRUE(is_aligned(new_allocation.p)); - } else { - size_t index = index_distribution(generator) % active_allocations; - active_allocations--; - allocation to_free = allocations[index]; - 
allocations.erase(std::next(allocations.begin(), index)); - EXPECT_NO_THROW(this->mr->deallocate(to_free.p, to_free.size, stream)); - } - } - - EXPECT_EQ(active_allocations, 0); - EXPECT_EQ(allocations.size(), active_allocations); -} - -template -void MRTest::test_mixed_random_allocation_free(cudaStream_t stream) -{ - test_mixed_random_allocation_free_base(5 * size_mb, stream); -} - -template <> -void MRTest::test_mixed_random_allocation_free(cudaStream_t stream) -{ - test_mixed_random_allocation_free_base(size_mb, stream); -} - -template <> -void MRTest::test_mixed_random_allocation_free(cudaStream_t stream) -{ - test_mixed_random_allocation_free_base(4 * size_mb, stream); -} +#include "mr_test.hpp" // Test on all memory resource classes using resources = ::testing::Typesallocate(size_mb)); - EXPECT_NE(nullptr, p); - EXPECT_TRUE(is_aligned(p)); - EXPECT_TRUE(is_device_memory(p)); - EXPECT_NO_THROW(rmm::mr::get_default_resource()->deallocate(p, size_mb)); -} +TEST(DefaultTest, UseDefaultResource) { test_get_default_resource(); } TYPED_TEST(MRTest, SetDefaultResource) { @@ -340,13 +37,9 @@ TYPED_TEST(MRTest, SetDefaultResource) rmm::mr::device_memory_resource* old{nullptr}; EXPECT_NO_THROW(old = rmm::mr::set_default_resource(this->mr.get())); EXPECT_NE(nullptr, old); - EXPECT_TRUE(this->mr->is_equal(*rmm::mr::get_default_resource())); - void* p{nullptr}; - EXPECT_NO_THROW(p = rmm::mr::get_default_resource()->allocate(size_mb)); - EXPECT_NE(nullptr, p); - EXPECT_TRUE(is_aligned(p)); - EXPECT_TRUE(is_device_memory(p)); - EXPECT_NO_THROW(rmm::mr::get_default_resource()->deallocate(p, size_mb)); + + test_get_default_resource(); // test allocating with the new default resource + // setting default resource w/ nullptr should reset to initial EXPECT_NO_THROW(rmm::mr::set_default_resource(nullptr)); EXPECT_TRUE(old->is_equal(*rmm::mr::get_default_resource())); @@ -356,58 +49,28 @@ TYPED_TEST(MRTest, SetDefaultResource) TYPED_TEST(MRTest, SelfEquality) { 
EXPECT_TRUE(this->mr->is_equal(*this->mr)); } -TYPED_TEST(MRTest, AllocateZeroBytes) -{ - void* p{nullptr}; - EXPECT_NO_THROW(p = this->mr->allocate(0)); - EXPECT_EQ(nullptr, p); - EXPECT_NO_THROW(this->mr->deallocate(p, 0)); -} +TYPED_TEST(MRTest, Allocate) { test_various_allocations(this->mr.get()); } -TYPED_TEST(MRTest, AllocateZeroBytesStream) +TYPED_TEST(MRTest, AllocateOnStream) { - void* p{nullptr}; - EXPECT_NO_THROW(p = this->mr->allocate(0, this->stream)); - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); - EXPECT_NO_THROW(this->mr->deallocate(p, 0, this->stream)); - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(this->stream)); + test_various_allocations_on_stream(this->mr.get(), this->stream); } -TYPED_TEST(MRTest, Allocate) -{ - this->test_allocate(size_word); - this->test_allocate(size_kb); - this->test_allocate(size_mb); - this->test_allocate(size_gb); +TYPED_TEST(MRTest, RandomAllocations) { test_random_allocations(this->mr.get()); } - // should fail to allocate too much - void* p{nullptr}; - EXPECT_THROW(p = this->mr->allocate(size_pb), rmm::bad_alloc); - EXPECT_EQ(nullptr, p); +TYPED_TEST(MRTest, RandomAllocationsStream) +{ + test_random_allocations(this->mr.get(), 100, this->stream); } -TYPED_TEST(MRTest, AllocateOnStream) +TYPED_TEST(MRTest, MixedRandomAllocationFree) { - this->test_allocate(size_word, this->stream); - this->test_allocate(size_kb, this->stream); - this->test_allocate(size_mb, this->stream); - this->test_allocate(size_gb, this->stream); - - // should fail to allocate too much - void* p{nullptr}; - EXPECT_THROW(p = this->mr->allocate(size_pb, this->stream), rmm::bad_alloc); - EXPECT_EQ(nullptr, p); + test_mixed_random_allocation_free(this->mr.get(), nullptr); } -TYPED_TEST(MRTest, RandomAllocations) { this->test_random_allocations(); } - -TYPED_TEST(MRTest, RandomAllocationsStream) { this->test_random_allocations(100, this->stream); } - -TYPED_TEST(MRTest, MixedRandomAllocationFree) { 
this->test_mixed_random_allocation_free(); } - TYPED_TEST(MRTest, MixedRandomAllocationFreeStream) { - this->test_mixed_random_allocation_free(this->stream); + test_mixed_random_allocation_free(this->mr.get(), this->stream); } TYPED_TEST(MRTest, GetMemInfo)