Skip to content

Commit

Permalink
Add python wrapper for system memory resource (#1605)
Browse files Browse the repository at this point in the history
Follow up on #1581 to add access to the system memory resource in python.

Fixes #1622

Authors:
  - Rong Ou (https://github.com/rongou)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #1605
  • Loading branch information
rongou authored Jul 25, 2024
1 parent 8c20e14 commit 67a78d6
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 71 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

namespace rmm::mr {
/**
* @addtogroup device_resource_adaptors
* @addtogroup device_memory_resources
* @{
* @file
*/
/**
* @brief Resource that adapts system memory resource to allocate memory with a headroom.
* @brief Resource that uses system memory resource to allocate memory with a headroom.
*
 * System allocated memory (SAM) can be migrated to the GPU, but is never migrated back to the host. If
* GPU memory is over-subscribed, this can cause other CUDA calls to fail with out-of-memory errors.
Expand All @@ -39,46 +39,22 @@ namespace rmm::mr {
* Since doing this check on every allocation can be expensive, the caller may choose to use other
* allocators (e.g. `binning_memory_resource`) for small allocations, and use this allocator for
* large allocations only.
*
* @tparam Upstream Type of the upstream resource used for allocation/deallocation. Must be
* `system_memory_resource`.
*/
template <typename Upstream>
class sam_headroom_resource_adaptor final : public device_memory_resource {
class sam_headroom_memory_resource final : public device_memory_resource {
public:
/**
* @brief Construct a headroom adaptor using `upstream` to satisfy allocation requests.
* @brief Construct a headroom memory resource.
*
* @param upstream The resource used for allocating/deallocating device memory. Must be
* `system_memory_resource`.
* @param headroom Size of the reserved GPU memory as headroom
*/
explicit sam_headroom_resource_adaptor(Upstream* upstream, std::size_t headroom)
: upstream_{upstream}, headroom_{headroom}
{
static_assert(std::is_same_v<system_memory_resource, Upstream>,
"Upstream must be rmm::mr::system_memory_resource");
}
explicit sam_headroom_memory_resource(std::size_t headroom) : system_mr_{}, headroom_{headroom} {}

sam_headroom_resource_adaptor() = delete;
~sam_headroom_resource_adaptor() override = default;
sam_headroom_resource_adaptor(sam_headroom_resource_adaptor const&) = delete;
sam_headroom_resource_adaptor(sam_headroom_resource_adaptor&&) = delete;
sam_headroom_resource_adaptor& operator=(sam_headroom_resource_adaptor const&) = delete;
sam_headroom_resource_adaptor& operator=(sam_headroom_resource_adaptor&&) = delete;

/**
* @briefreturn{rmm::device_async_resource_ref to the upstream resource}
*/
[[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept
{
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }
sam_headroom_memory_resource() = delete;
~sam_headroom_memory_resource() override = default;
sam_headroom_memory_resource(sam_headroom_memory_resource const&) = delete;
sam_headroom_memory_resource(sam_headroom_memory_resource&&) = delete;
sam_headroom_memory_resource& operator=(sam_headroom_memory_resource const&) = delete;
sam_headroom_memory_resource& operator=(sam_headroom_memory_resource&&) = delete;

private:
/**
Expand All @@ -94,8 +70,7 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
*/
void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
{
void* pointer =
get_upstream_resource().allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
void* pointer = system_mr_.allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);

auto const free = rmm::available_device_memory().first;
auto const allocatable = free > headroom_ ? free - headroom_ : 0UL;
Expand Down Expand Up @@ -131,7 +106,7 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
[[maybe_unused]] std::size_t bytes,
[[maybe_unused]] cuda_stream_view stream) override
{
get_upstream_resource().deallocate_async(ptr, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
system_mr_.deallocate_async(ptr, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
}

/**
Expand All @@ -144,13 +119,15 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
[[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
{
if (this == &other) { return true; }
auto cast = dynamic_cast<sam_headroom_resource_adaptor const*>(&other);
auto cast = dynamic_cast<sam_headroom_memory_resource const*>(&other);
if (cast == nullptr) { return false; }
return get_upstream_resource() == cast->get_upstream_resource() && headroom_ == cast->headroom_;
return headroom_ == cast->headroom_;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
std::size_t headroom_; ///< Size of GPU memory reserved as headroom
///< The system memory resource used for satisfying allocation requests
system_memory_resource system_mr_;
///< Size of GPU memory reserved as headroom
std::size_t headroom_;
};
/** @} */ // end of group
} // namespace rmm::mr
6 changes: 6 additions & 0 deletions python/rmm/rmm/_lib/memory_resource.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ cdef class CudaMemoryResource(DeviceMemoryResource):
cdef class ManagedMemoryResource(DeviceMemoryResource):
pass

cdef class SystemMemoryResource(DeviceMemoryResource):
pass

cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
pass

cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
pass

Expand Down
47 changes: 47 additions & 0 deletions python/rmm/rmm/_lib/memory_resource.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
cdef cppclass managed_memory_resource(device_memory_resource):
managed_memory_resource() except +

cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass system_memory_resource(device_memory_resource):
system_memory_resource() except +

cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass sam_headroom_memory_resource(device_memory_resource):
sam_headroom_memory_resource(size_t headroom) except +

cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
namespace "rmm::mr" nogil:

Expand Down Expand Up @@ -366,6 +376,43 @@ cdef class ManagedMemoryResource(DeviceMemoryResource):
pass


cdef class SystemMemoryResource(DeviceMemoryResource):
def __cinit__(self):
self.c_obj.reset(
new system_memory_resource()
)

def __init__(self):
"""
Memory resource that uses ``malloc``/``free`` for
allocation/deallocation.
"""
pass


cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
def __cinit__(
self,
size_t headroom
):
self.c_obj.reset(new sam_headroom_memory_resource(headroom))

def __init__(
self,
size_t headroom
):
"""
        Memory resource that uses system memory (``malloc``/``free``) for
        allocation/deallocation, while reserving a headroom of GPU memory.
Parameters
----------
headroom : size_t
Size of the reserved GPU memory as headroom
"""
pass


cdef class PoolMemoryResource(UpstreamResourceAdaptor):

def __cinit__(
Expand Down
4 changes: 4 additions & 0 deletions python/rmm/rmm/mr.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
ManagedMemoryResource,
PoolMemoryResource,
PrefetchResourceAdaptor,
SamHeadroomMemoryResource,
StatisticsResourceAdaptor,
SystemMemoryResource,
TrackingResourceAdaptor,
UpstreamResourceAdaptor,
_flush_logs,
Expand Down Expand Up @@ -54,7 +56,9 @@
"ManagedMemoryResource",
"PoolMemoryResource",
"PrefetchResourceAdaptor",
"SamHeadroomMemoryResource",
"StatisticsResourceAdaptor",
"SystemMemoryResource",
"TrackingResourceAdaptor",
"FailureCallbackResourceAdaptor",
"UpstreamResourceAdaptor",
Expand Down
58 changes: 56 additions & 2 deletions python/rmm/rmm/tests/test_rmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@
_runtime_version >= 11020
)

_SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
rmm._cuda.gpu.getDevice(),
)


def array_tester(dtype, nelem, alloc):
# data
Expand Down Expand Up @@ -91,6 +96,39 @@ def test_rmm_modes(dtype, nelem, alloc, managed, pool):
array_tester(dtype, nelem, alloc)


@pytest.mark.skipif(
not _SYSTEM_MEMORY_SUPPORTED,
reason="System memory not supported",
)
@pytest.mark.parametrize("dtype", _dtypes)
@pytest.mark.parametrize("nelem", _nelems)
@pytest.mark.parametrize("alloc", _allocs)
@pytest.mark.parametrize(
"system, pool, headroom",
list(product([False, True], [False, True], [False, True])),
)
def test_rmm_modes_system_memory(dtype, nelem, alloc, system, pool, headroom):
assert rmm.is_initialized()
array_tester(dtype, nelem, alloc)

if system:
if headroom:
base_mr = rmm.mr.SamHeadroomMemoryResource(headroom=1 << 20)
else:
base_mr = rmm.mr.SystemMemoryResource()
else:
base_mr = rmm.mr.CudaMemoryResource()
if pool:
mr = rmm.mr.PoolMemoryResource(base_mr)
else:
mr = base_mr
rmm.mr.set_current_device_resource(mr)

assert rmm.is_initialized()

array_tester(dtype, nelem, alloc)


@pytest.mark.parametrize("dtype", _dtypes)
@pytest.mark.parametrize("nelem", _nelems)
@pytest.mark.parametrize("alloc", _allocs)
Expand Down Expand Up @@ -410,7 +448,15 @@ def test_pool_memory_resource(dtype, nelem, alloc):
[
lambda: rmm.mr.CudaMemoryResource(),
lambda: rmm.mr.ManagedMemoryResource(),
],
]
+ (
[
lambda: rmm.mr.SystemMemoryResource(),
lambda: rmm.mr.SamHeadroomMemoryResource(headroom=1 << 20),
]
if _SYSTEM_MEMORY_SUPPORTED
else []
),
)
def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
mr = rmm.mr.FixedSizeMemoryResource(
Expand All @@ -432,7 +478,15 @@ def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
lambda: rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(), 1 << 20
),
],
]
+ (
[
lambda: rmm.mr.SystemMemoryResource(),
lambda: rmm.mr.SamHeadroomMemoryResource(headroom=1 << 20),
]
if _SYSTEM_MEMORY_SUPPORTED
else []
),
)
def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr):
upstream = upstream_mr()
Expand Down
39 changes: 12 additions & 27 deletions tests/mr/device/system_mr_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include <rmm/cuda_device.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/sam_headroom_resource_adaptor.hpp>
#include <rmm/mr/device/sam_headroom_memory_resource.hpp>
#include <rmm/mr/device/system_memory_resource.hpp>

#include <gtest/gtest.h>
Expand Down Expand Up @@ -54,9 +54,9 @@ void touch_on_gpu(void* ptr, std::size_t size)
using system_mr = rmm::mr::system_memory_resource;
static_assert(cuda::mr::resource_with<system_mr, cuda::mr::device_accessible>);
static_assert(cuda::mr::async_resource_with<system_mr, cuda::mr::device_accessible>);
using headroom_adaptor = rmm::mr::sam_headroom_resource_adaptor<rmm::mr::system_memory_resource>;
static_assert(cuda::mr::resource_with<headroom_adaptor, cuda::mr::device_accessible>);
static_assert(cuda::mr::async_resource_with<headroom_adaptor, cuda::mr::device_accessible>);
using headroom_mr = rmm::mr::sam_headroom_memory_resource;
static_assert(cuda::mr::resource_with<headroom_mr, cuda::mr::device_accessible>);
static_assert(cuda::mr::async_resource_with<headroom_mr, cuda::mr::device_accessible>);

class SystemMRTest : public ::testing::Test {
protected:
Expand All @@ -79,19 +79,6 @@ TEST(SystemMRSimpleTest, ThrowIfNotSupported)
}
}

TEST(SAMHeadroomAdaptorTest, ThrowIfNotSupported)
{
auto construct_mr = []() {
system_mr mr;
headroom_adaptor adaptor{&mr, 0};
};
if (rmm::mr::detail::is_system_memory_supported(rmm::get_current_cuda_device())) {
EXPECT_NO_THROW(construct_mr());
} else {
EXPECT_THROW(construct_mr(), rmm::logic_error);
}
}

TEST_F(SystemMRTest, FirstTouchOnCPU)
{
auto const free = rmm::available_device_memory().first;
Expand All @@ -114,23 +101,21 @@ TEST_F(SystemMRTest, FirstTouchOnGPU)
mr.deallocate(ptr, size_mb);
}

TEST_F(SystemMRTest, AdaptorReserveAllFreeMemory)
TEST_F(SystemMRTest, HeadroomMRReserveAllFreeMemory)
{
auto const free = rmm::available_device_memory().first;
system_mr mr;
// All the free GPU memory is set as headroom, so allocation is only on the CPU.
headroom_adaptor adaptor{&mr, free + size_gb};
void* ptr = adaptor.allocate(size_mb);
headroom_mr mr{free + size_gb};
void* ptr = mr.allocate(size_mb);
touch_on_cpu(ptr, size_mb);
adaptor.deallocate(ptr, size_mb);
mr.deallocate(ptr, size_mb);
}

TEST_F(SystemMRTest, AdaptorDifferentParametersUnequal)
TEST_F(SystemMRTest, HeadroomMRDifferentParametersUnequal)
{
system_mr mr;
headroom_adaptor adaptor1{&mr, size_mb};
headroom_adaptor adaptor2{&mr, size_gb};
EXPECT_FALSE(adaptor1.is_equal(adaptor2));
headroom_mr mr1{size_mb};
headroom_mr mr2{size_gb};
EXPECT_FALSE(mr1.is_equal(mr2));
}
} // namespace
} // namespace rmm::test

0 comments on commit 67a78d6

Please sign in to comment.