Skip to content

Commit

Permalink
[REVIEW] Merge free lists in pool_memory_resource to defragment before growing from upstream (#532)
Browse files Browse the repository at this point in the history

* Add list merging before growing pool.

* Fix list merging

* Changelog for #530

* Correct CL#

* Hoist list size() to before it may change.

* Add iterator comment.

* Add POOL_MR_PTDS_TEST

* Update CI build readme

* Remove invalid optimization for case when there is only one free list.

* Fix comment

* Remove extra log_summary_trace()

* Update version to 0.16 in ci local README

Co-authored-by: Keith Kraus <[email protected]>
  • Loading branch information
harrism and Keith Kraus authored Sep 14, 2020
1 parent e3c55f7 commit 6a3a62f
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 56 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

- PR #474 Use CMake find_package(CUDAToolkit)
- PR #477 Just use `None` for `strides` in `DeviceBuffer`
- PR #528 Add `maximum_pool_size` parameter to reinitialize API
- PR #528 Add maximum_pool_size parameter to reinitialize API
- PR #532 Merge free lists in pool_memory_resource to defragment before growing from upstream
- PR #537 Add CMake option to disable deprecation warnings
- PR #541 Refine CMakeLists.txt to make it easy to import by external projects
- PR #538 Upgrade CUB and Thrust to the latest commits
Expand Down
4 changes: 2 additions & 2 deletions ci/local/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ where:
```

Example Usage:
`bash build.sh -r ~/rapids/rmm -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6`
`bash build.sh -r ~/rapids/rmm -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7`

For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page.
For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page.

Style Check:
```bash
Expand Down
4 changes: 1 addition & 3 deletions cmake/Modules/ConfigureGoogleTest.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ set(GTEST_CMAKE_ARGS "")
# " -Dgtest_build_samples=ON"
# " -DCMAKE_VERBOSE_MAKEFILE=ON")

# Poor's man workaround for
# https://github.com/google/googletest/issues/854
# Workaround https://github.com/google/googletest/issues/854
if(CMAKE_CXX_COMPILER MATCHES ".*clang")
list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-fPIC")
list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-fPIC")
Expand Down Expand Up @@ -53,4 +52,3 @@ message(STATUS "GoogleTest installed here: " ${GTEST_ROOT}/install)
set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include")
set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib")
set(GTEST_FOUND TRUE)

116 changes: 78 additions & 38 deletions include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,25 +203,27 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
*/
virtual void* do_allocate(std::size_t bytes, cudaStream_t stream) override
{
RMM_LOG_TRACE("[A][{}B][stream {:p}]", bytes, static_cast<void*>(stream));
RMM_LOG_TRACE("[A][stream {:p}][{}B]", static_cast<void*>(stream), bytes);

if (bytes <= 0) return nullptr;

lock_guard lock(mtx_);

auto stream_event = get_event(stream);
bytes = rmm::detail::align_up(bytes, allocation_alignment);

bytes = rmm::detail::align_up(bytes, allocation_alignment);
RMM_EXPECTS(bytes <= this->underlying().get_maximum_allocation_size(),
rmm::bad_alloc,
"Maximum allocation size exceeded");
auto const b = this->underlying().get_block(bytes, stream_event);
auto split = this->underlying().allocate_from_block(b, bytes);
if (split.remainder.is_valid()) stream_free_blocks_[stream_event].insert(split.remainder);
RMM_LOG_TRACE("[A][{}B][{:p}][stream {:p}]",
RMM_LOG_TRACE("[A][stream {:p}][{}B][{:p}]",
static_cast<void*>(stream_event.stream),
bytes,
static_cast<void*>(split.allocated_pointer),
static_cast<void*>(stream));
static_cast<void*>(split.allocated_pointer));

log_summary_trace(stream_event.stream);
log_summary_trace();

return split.allocated_pointer;
}
Expand All @@ -236,10 +238,11 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
virtual void do_deallocate(void* p, std::size_t bytes, cudaStream_t stream) override
{
lock_guard lock(mtx_);
RMM_LOG_TRACE("[D][{}B][{:p}][stream {:p}]", bytes, p, static_cast<void*>(stream));
auto stream_event = get_event(stream);
bytes = rmm::detail::align_up(bytes, allocation_alignment);
auto const b = this->underlying().free_block(p, bytes);
RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", static_cast<void*>(stream_event.stream), bytes, p);

bytes = rmm::detail::align_up(bytes, allocation_alignment);
auto const b = this->underlying().free_block(p, bytes);

// TODO: cudaEventRecord has significant overhead on deallocations. For the non-PTDS case
// we may be able to delay recording the event in some situations. But using events rather than
Expand All @@ -248,7 +251,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_

stream_free_blocks_[stream_event].insert(b);

log_summary_trace(stream_event.stream);
log_summary_trace();
}

private:
Expand Down Expand Up @@ -324,18 +327,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
if (b.is_valid()) { return b; }
}

// Otherwise try to find a block associated with another stream
block_type b = get_block_from_other_stream(size, stream_event);
if (b.is_valid()) { return b; }
// note this creates a free list if it doesn't already exist, hence we get num lists first above
free_list& blocks =
(iter != stream_free_blocks_.end()) ? iter->second : stream_free_blocks_[stream_event];

// no larger blocks available on other streams, so grow the pool and create a block
// Try to find an existing block in another stream
{
block_type const b = get_block_from_other_stream(size, stream_event, blocks, false);
if (b.is_valid()) return b;
}

log_summary_trace(stream_event.stream);
// no large enough blocks available on other streams, so sync and merge until we find one
{
block_type const b = get_block_from_other_stream(size, stream_event, blocks, true);
if (b.is_valid()) return b;
}

// avoid searching for this stream's list again
free_list& blocks =
(iter != stream_free_blocks_.end()) ? iter->second : stream_free_blocks_[stream_event];
log_summary_trace();

// no large enough blocks available after merging, so grow the pool
return this->underlying().expand_pool(size, blocks, stream_event.stream);
}

Expand All @@ -356,29 +366,46 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
* @return A block with non-null pointer and size >= `size`, or a nullptr block if none is
* available in `blocks`.
*/
block_type get_block_from_other_stream(size_t size, stream_event_pair stream_event)
block_type get_block_from_other_stream(size_t size,
stream_event_pair stream_event,
free_list& blocks,
bool merge_first)
{
// nothing in this stream's free list, look for one on another stream
for (auto s = stream_free_blocks_.begin(); s != stream_free_blocks_.end(); ++s) {
auto blocks_event = s->first;
if (blocks_event.event != stream_event.event) {
auto blocks = s->second;

block_type const b = blocks.get_block(size); // get the best fit block
for (auto it = stream_free_blocks_.begin(), next_it = it; it != stream_free_blocks_.end();
it = next_it) {
++next_it; // Points to element after `it` to allow erasing `it` in the loop body
auto other_event = it->first.event;
if (other_event != stream_event.event) {
auto other_blocks = it->second;

block_type const b = [&]() {
if (merge_first) {
merge_lists(stream_event, blocks, other_event, std::move(other_blocks));

RMM_LOG_INFO("[A][Stream {:p}][{}B][Merged stream {:p}]",
static_cast<void*>(stream_event.stream),
size,
static_cast<void*>(it->first.stream));

stream_free_blocks_.erase(it);

return blocks.get_block(size); // get the best fit block in merged lists
} else {
return other_blocks.get_block(size); // get the best fit block in other list
}
}();

if (b.is_valid()) {
// Since we found a block associated with a different stream, we have to insert a wait
// on the stream's associated event into the allocating stream.
RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, blocks_event.event, 0));

// Move all the blocks to the requesting stream, since it has waited on them
stream_free_blocks_[stream_event].insert(std::move(blocks));
stream_free_blocks_.erase(s);

RMM_LOG_INFO("[A][{}B][Stream {:p} steal from stream {:p}]",
size,
RMM_LOG_INFO((merge_first) ? "[A][Stream {:p}][{}B][Found after merging stream {:p}]"
: "[A][Stream {:p}][{}B][Taken from stream {:p}]",
static_cast<void*>(stream_event.stream),
static_cast<void*>(s->first.stream));
size,
static_cast<void*>(it->first.stream));

if (not merge_first) {
merge_lists(stream_event, blocks, other_event, std::move(other_blocks));
stream_free_blocks_.erase(it);
}

return b;
}
Expand All @@ -387,6 +414,19 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
return block_type{};
}

void merge_lists(stream_event_pair stream_event,
free_list& blocks,
cudaEvent_t other_event,
free_list&& other_blocks)
{
// Since we found a block associated with a different stream, we have to insert a wait
// on the stream's associated event into the allocating stream.
RMM_CUDA_TRY(cudaStreamWaitEvent(stream_event.stream, other_event, 0));

// Merge the two free lists
blocks.insert(std::move(other_blocks));
}

/**
* @brief Clear free lists and events
*
Expand All @@ -405,7 +445,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
stream_free_blocks_.clear();
}

void log_summary_trace(cudaStream_t stream)
void log_summary_trace()
{
#if (SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_TRACE)
std::size_t num_blocks{0};
Expand Down
30 changes: 18 additions & 12 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,17 @@ set(DEVICE_MR_TEST_SRC

ConfigureTest(DEVICE_MR_TEST "${DEVICE_MR_TEST_SRC}")

###################################################################################################
###################################################################################################
# - device mr per-thread default stream tests------------------------------------------------------

set(DEVICE_MR_PTDS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp")

ConfigureTest(DEVICE_MR_PTDS_TEST "${DEVICE_MR_PTDS_TEST_SRC}")
target_compile_definitions(DEVICE_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)


###################################################################################################
###################################################################################################
# - pool mr tests --------------------------------------------------------------------------------
Expand All @@ -75,6 +86,13 @@ set(POOL_MR_TEST_SRC

ConfigureTest(POOL_MR_TEST "${POOL_MR_TEST_SRC}")

set(POOL_MR_PTDS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/mr/device/pool_mr_tests.cpp")

ConfigureTest(POOL_MR_PTDS_TEST "${POOL_MR_PTDS_TEST_SRC}")
target_compile_definitions(POOL_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)


###################################################################################################
###################################################################################################
# - thrust allocator tests --------------------------------------------------------------------------------
Expand All @@ -84,18 +102,6 @@ set(THRUST_ALLOCATOR_TEST_SRC

ConfigureTest(THRUST_ALLOCATOR_TEST "${THRUST_ALLOCATOR_TEST_SRC}")

###################################################################################################
###################################################################################################
# - device mr per-thread default stream tests------------------------------------------------------


set(DEVICE_MR_PTDS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/mr/device/mr_multithreaded_tests.cpp")

ConfigureTest(DEVICE_MR_PTDS_TEST "${DEVICE_MR_PTDS_TEST_SRC}")
target_compile_definitions(DEVICE_MR_PTDS_TEST PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)


###################################################################################################
###################################################################################################
# - host mr tests ----------------------------------------------------------------------------------
Expand Down

0 comments on commit 6a3a62f

Please sign in to comment.