Skip to content

Commit

Permalink
arena_memory_resource optimization: disable tracking allocated blocks…
Browse files Browse the repository at this point in the history
… by default (#732)

This is done similarly to #702.

Previously `arena_memory_resource` maintained a map of allocated blocks (`allocated_blocks_`), but this was only used for reporting/debugging purposes. Maintaining this map requires a lookup (`find`) at every deallocation, which can get expensive when there are many allocated blocks. This PR moves the tracking behind a preprocessor flag that is undefined by default. This results in some speedup in the random allocations benchmark for `arena_memory_resource`. Tracking can be enabled by defining `RMM_POOL_TRACK_ALLOCATIONS`.

This should also fix the Spark small shuffle buffer issue: NVIDIA/spark-rapids#1711

Before:
```console
------------------------------------------------------------------------------------
Benchmark                                          Time             CPU   Iterations
------------------------------------------------------------------------------------
BM_RandomAllocations/arena_mr/1000/1            1.36 ms         1.36 ms          457
BM_RandomAllocations/arena_mr/1000/4            1.21 ms         1.21 ms          517
BM_RandomAllocations/arena_mr/1000/64           1.22 ms         1.22 ms          496
BM_RandomAllocations/arena_mr/1000/256          1.08 ms         1.07 ms          535
BM_RandomAllocations/arena_mr/1000/1024        0.949 ms        0.948 ms          583
BM_RandomAllocations/arena_mr/1000/4096        0.853 ms        0.848 ms          680
BM_RandomAllocations/arena_mr/10000/1           98.7 ms         98.3 ms            8
BM_RandomAllocations/arena_mr/10000/4           65.4 ms         65.4 ms            9
BM_RandomAllocations/arena_mr/10000/64          16.6 ms         16.5 ms           38
BM_RandomAllocations/arena_mr/10000/256         11.2 ms         11.2 ms           48
BM_RandomAllocations/arena_mr/10000/1024        9.45 ms         9.44 ms           62
BM_RandomAllocations/arena_mr/10000/4096        9.24 ms         9.20 ms           59
BM_RandomAllocations/arena_mr/100000/1          7536 ms         7536 ms            1
BM_RandomAllocations/arena_mr/100000/4          3002 ms         3002 ms            1
BM_RandomAllocations/arena_mr/100000/64          170 ms          170 ms            3
BM_RandomAllocations/arena_mr/100000/256         107 ms          107 ms            7
BM_RandomAllocations/arena_mr/100000/1024       96.0 ms         95.7 ms            6
BM_RandomAllocations/arena_mr/100000/4096       86.7 ms         86.7 ms            6
```

After:
```console
------------------------------------------------------------------------------------
Benchmark                                          Time             CPU   Iterations
------------------------------------------------------------------------------------
BM_RandomAllocations/arena_mr/1000/1            1.20 ms         1.20 ms          519
BM_RandomAllocations/arena_mr/1000/4            1.08 ms         1.08 ms          588
BM_RandomAllocations/arena_mr/1000/64           1.11 ms         1.11 ms          552
BM_RandomAllocations/arena_mr/1000/256         0.957 ms        0.957 ms          611
BM_RandomAllocations/arena_mr/1000/1024        0.857 ms        0.857 ms          687
BM_RandomAllocations/arena_mr/1000/4096        0.795 ms        0.793 ms          724
BM_RandomAllocations/arena_mr/10000/1           73.0 ms         73.0 ms           10
BM_RandomAllocations/arena_mr/10000/4           45.7 ms         45.7 ms           14
BM_RandomAllocations/arena_mr/10000/64          14.4 ms         14.4 ms           40
BM_RandomAllocations/arena_mr/10000/256         9.87 ms         9.82 ms           60
BM_RandomAllocations/arena_mr/10000/1024        8.72 ms         8.72 ms           69
BM_RandomAllocations/arena_mr/10000/4096        7.32 ms         7.30 ms           85
BM_RandomAllocations/arena_mr/100000/1          6384 ms         6384 ms            1
BM_RandomAllocations/arena_mr/100000/4          2480 ms         2480 ms            1
BM_RandomAllocations/arena_mr/100000/64          147 ms          147 ms            5
BM_RandomAllocations/arena_mr/100000/256         103 ms          103 ms            7
BM_RandomAllocations/arena_mr/100000/1024       78.1 ms         78.1 ms            9
BM_RandomAllocations/arena_mr/100000/4096       72.3 ms         72.3 ms            9
```

@abellina

Authors:
  - Rong Ou (@rongou)

Approvers:
  - Mark Harris (@harrism)
  - Conor Hoekstra (@codereport)

URL: #732
  • Loading branch information
rongou authored Mar 17, 2021
1 parent 0f3ba03 commit 3826a89
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 0 deletions.
6 changes: 6 additions & 0 deletions include/rmm/mr/device/arena_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,16 @@ class arena_memory_resource final : public device_memory_resource {
if (p == nullptr || bytes <= 0) return;

bytes = detail::arena::align_up(bytes);
#ifdef RMM_POOL_TRACK_ALLOCATIONS
if (!get_arena(stream).deallocate(p, bytes, stream)) {
deallocate_from_other_arena(p, bytes, stream);
}
#else
get_arena(stream).deallocate(p, bytes, stream);
#endif
}

#ifdef RMM_POOL_TRACK_ALLOCATIONS
/**
* @brief Deallocate memory pointed to by `p` that was allocated in a different arena.
*
Expand Down Expand Up @@ -186,6 +191,7 @@ class arena_memory_resource final : public device_memory_resource {
// global arena.
global_arena_.deallocate({p, bytes});
}
#endif

/**
* @brief Get the arena associated with the current thread or the given stream.
Expand Down
14 changes: 14 additions & 0 deletions include/rmm/mr/device/detail/arena.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,9 @@ class arena {
{
lock_guard lock(mtx_);
auto const b = get_block(bytes);
#ifdef RMM_POOL_TRACK_ALLOCATIONS
allocated_blocks_.emplace(b.pointer(), b);
#endif
return b.pointer();
}

Expand All @@ -462,14 +464,19 @@ class arena {
bool deallocate(void* p, std::size_t bytes, cuda_stream_view stream)
{
  // Serialize access to this arena's bookkeeping for the whole call.
  lock_guard lock(mtx_);

#ifdef RMM_POOL_TRACK_ALLOCATIONS
  // Tracking build: ask free_block() for the block recorded for `p`; the
  // result may be invalid, in which case the caller is told `false`.
  auto const b = free_block(p, bytes);
#else
  // Default build: no per-allocation bookkeeping, so the block is rebuilt
  // directly from the pointer and size supplied by the caller.
  block const b{p, bytes};
#endif

  // Nothing to return to the free list; report failure to the caller.
  if (!b.is_valid()) { return false; }

  // Merge the block into the free list, then give the arena a chance to
  // shrink based on the coalesced block.
  auto const coalesced = coalesce_block(free_blocks_, b);
  shrink_arena(coalesced, stream);
  return true;
}

#ifdef RMM_POOL_TRACK_ALLOCATIONS
/**
* @brief Deallocate memory pointed to by `p`, keeping all free superblocks.
*
Expand All @@ -489,6 +496,7 @@ class arena {
if (b.is_valid()) { global_arena_.deallocate(b); }
return b.is_valid();
}
#endif

/**
* @brief Clean the arena and deallocate free blocks from the global arena.
Expand All @@ -500,7 +508,9 @@ class arena {
lock_guard lock(mtx_);
global_arena_.deallocate(free_blocks_);
free_blocks_.clear();
#ifdef RMM_POOL_TRACK_ALLOCATIONS
allocated_blocks_.clear();
#endif
}

private:
Expand Down Expand Up @@ -537,6 +547,7 @@ class arena {
return global_arena_.allocate(superblock_size);
}

#ifdef RMM_POOL_TRACK_ALLOCATIONS
/**
* @brief Finds, frees and returns the block associated with pointer `p`.
*
Expand All @@ -558,6 +569,7 @@ class arena {

return found;
}
#endif

/**
* @brief Shrink this arena by returning free superblocks to upstream.
Expand All @@ -580,8 +592,10 @@ class arena {
global_arena<Upstream>& global_arena_;
/// Free blocks.
std::set<block> free_blocks_;
#ifdef RMM_POOL_TRACK_ALLOCATIONS
/// Map of pointer address to allocated blocks.
std::unordered_map<void*, block> allocated_blocks_;
#endif
/// Mutex for exclusive lock.
mutable std::mutex mtx_;
};
Expand Down

0 comments on commit 3826a89

Please sign in to comment.