From 397265f29e195a71e4fff32f242e8262f0cad1c9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 30 Aug 2024 10:55:28 +0200 Subject: [PATCH 01/16] failure_callback_resource_adaptor --- .../failure_alternate_resource_adaptor.hpp | 151 ++++++++++++++++++ python/rmm/rmm/_lib/memory_resource.pxd | 3 + python/rmm/rmm/_lib/memory_resource.pyx | 36 +++++ python/rmm/rmm/mr.py | 2 + python/rmm/rmm/tests/test_rmm.py | 44 +++++ 5 files changed, 236 insertions(+) create mode 100644 include/rmm/mr/device/failure_alternate_resource_adaptor.hpp diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp new file mode 100644 index 000000000..977ea6008 --- /dev/null +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace RMM_NAMESPACE { +namespace mr { +/** + * @addtogroup device_resource_adaptors + * @{ + * @file + */ + +template +class failure_alternate_resource_adaptor final : public device_memory_resource { + public: + using exception_type = ExceptionType; ///< The type of exception this object catches/throws + + /** + * @brief Construct a new `failure_alternate_resource_adaptor` using `upstream` to satisfy + * allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + * @param alternate_upstream The resource used for alternate allocating/deallocating device + * memory + */ + failure_alternate_resource_adaptor(Upstream* upstream, Upstream* alternate_upstream) + : upstream_{upstream}, alternate_upstream_{alternate_upstream} + { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + RMM_EXPECTS(nullptr != alternate_upstream, + "Unexpected null alternate upstream resource pointer."); + } + + failure_alternate_resource_adaptor() = delete; + ~failure_alternate_resource_adaptor() override = default; + failure_alternate_resource_adaptor(failure_alternate_resource_adaptor const&) = delete; + failure_alternate_resource_adaptor& operator=(failure_alternate_resource_adaptor const&) = delete; + failure_alternate_resource_adaptor(failure_alternate_resource_adaptor&&) noexcept = + default; ///< @default_move_constructor + failure_alternate_resource_adaptor& operator=(failure_alternate_resource_adaptor&&) noexcept = + default; ///< @default_move_assignment{failure_alternate_resource_adaptor} + + /** + * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + */ + [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + { + return upstream_; + } + + /** + * @briefreturn{Upstream* to the upstream memory resource} + */ + [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } + + private: + using lock_guard = std::lock_guard; + + /** + * @brief Allocates memory of size at least `bytes` using the upstream + * resource. + * + * @throws `exception_type` if the requested allocation could not be fulfilled + * by the upstream resource. + * + * @param bytes The size, in bytes, of the allocation + * @param stream Stream on which to perform the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, cuda_stream_view stream) override + { + void* ret{}; + try { + ret = upstream_->allocate(bytes, stream); + } catch (exception_type const& e) { + ret = alternate_upstream_->allocate(bytes, stream); + lock_guard lock(mtx_); + alternate_allocations_.insert(ret); + } + return ret; + } + + /** + * @brief Free allocation of size `bytes` pointed to by `ptr` + * + * @param ptr Pointer to be deallocated + * @param bytes Size of the allocation + * @param stream Stream on which to perform the deallocation + */ + void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override + { + std::size_t count{0}; + { + lock_guard lock(mtx_); + count = alternate_allocations_.erase(ptr); + } + if (count > 0) { + alternate_upstream_->deallocate(ptr, bytes, stream); + } else { + upstream_->deallocate(ptr, bytes, stream); + } + } + + /** + * @brief Compare the upstream resource to another. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + if (this == &other) { return true; } + auto cast = dynamic_cast const*>(&other); + if (cast == nullptr) { return upstream_->is_equal(other); } + return get_upstream_resource() == cast->get_upstream_resource(); + } + + Upstream* upstream_; // the upstream used for satisfying allocation requests + Upstream* alternate_upstream_; // the upstream used for satisfying alternate allocation requests + std::unordered_set alternate_allocations_; // set of alternate allocations + mutable std::mutex mtx_; // Mutex for exclusive lock. +}; + +/** @} */ // end of group +} // namespace mr +} // namespace RMM_NAMESPACE diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 000a3fe1e..ef638b90e 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -97,6 +97,9 @@ cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): cdef object _callback +cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): + pass + cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): pass diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 5030c5d2d..f68c6dd0a 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -229,6 +229,16 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ void* callback_arg ) except + +cdef extern from "rmm/mr/device/failure_alternate_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass failure_alternate_resource_adaptor[Upstream]( + device_memory_resource + ): + failure_alternate_resource_adaptor( + Upstream* upstream_mr, + Upstream* alternate_upstream_mr, + ) except + + cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ namespace "rmm::mr" nogil: cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): @@ -1039,6 +1049,32 @@ cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): """ pass + +cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): + + def __cinit__( + self, + DeviceMemoryResource upstream_mr, + DeviceMemoryResource alternate_upstream_mr, + ): + self.c_obj.reset( + new failure_alternate_resource_adaptor[device_memory_resource]( + upstream_mr.get_mr(), + alternate_upstream_mr.get_mr(), + ) + ) + + def __init__( + self, + DeviceMemoryResource upstream_mr, + DeviceMemoryResource alternate_upstream_mr, + ): + """ + TODO + """ + pass + + cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): def __cinit__( diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 6eb94da0f..5a10c4e62 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -17,6 +17,7 @@ CudaAsyncMemoryResource, CudaMemoryResource, DeviceMemoryResource, + FailureAlternateResourceAdaptor, FailureCallbackResourceAdaptor, FixedSizeMemoryResource, LimitingResourceAdaptor, @@ -61,6 +62,7 @@ "SystemMemoryResource", "TrackingResourceAdaptor", "FailureCallbackResourceAdaptor", + "FailureAlternateResourceAdaptor", "UpstreamResourceAdaptor", "_flush_logs", "_initialize", diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index c4fd90c45..74e26dbee 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -787,6 +787,50 @@ def callback(nbytes: int) -> bool: assert retried[0] +def test_failure_alternate_resource_adaptor(): + base = rmm.mr.CudaMemoryResource() + + def alloc_cb(size, stream, *, track: list[int], limit: int): + if size > limit: + raise MemoryError() + ret = base.allocate(size, stream) + track.append(ret) + return ret + + def dealloc_cb(ptr, size, stream, *, track: list[int]): + track.append(ptr) + return base.deallocate(ptr, size, stream) + + main_track = [] + main_mr = rmm.mr.CallbackMemoryResource( + functools.partial(alloc_cb, track=main_track, limit=200), + functools.partial(dealloc_cb, track=main_track), + ) + alternate_track = [] + alternate_mr = rmm.mr.CallbackMemoryResource( + functools.partial(alloc_cb, track=alternate_track, limit=1000), + functools.partial(dealloc_cb, track=alternate_track), + ) + mr = rmm.mr.FailureAlternateResourceAdaptor(main_mr, alternate_mr) + + # Buffer size within the limit of `main_mr` + rmm.DeviceBuffer(size=100, mr=mr) + # we expect an alloc and a dealloc of the same buffer in + # `main_track` and an empty `alternate_track` + assert len(main_track) == 2 + assert main_track[0] == main_track[1] + assert len(alternate_track) == 0 + + # Buffer size outside the limit of `main_mr` + rmm.DeviceBuffer(size=500, mr=mr) + # we expect an alloc and a dealloc of the same buffer in + # `alternate_track` and an unchanged `main_mr` + assert len(main_track) == 2 + assert main_track[0] == main_track[1] + assert len(alternate_track) == 2 + assert alternate_track[0] == alternate_track[1] + + @pytest.mark.parametrize("managed", [True, False]) def test_prefetch_resource_adaptor(managed): if managed: From 6d76ec6a84cd70f4bb7d7fa018a41baf5767a1f0 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 2 Sep 2024 14:03:39 +0200 Subject: [PATCH 02/16] alternate_upstream_mr --- python/rmm/rmm/_lib/memory_resource.pxd | 5 ++++- python/rmm/rmm/_lib/memory_resource.pyx | 11 +++++++++-- python/rmm/rmm/tests/test_rmm.py | 7 +++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index ef638b90e..139a45653 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -98,7 +98,10 @@ cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): cdef object _callback cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): - pass + cdef readonly DeviceMemoryResource alternate_upstream_mr + + cpdef DeviceMemoryResource get_alternate_upstream(self) + cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): pass diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index f68c6dd0a..d07c27e50 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -289,7 +289,6 @@ cdef class UpstreamResourceAdaptor(DeviceMemoryResource): """ def __cinit__(self, DeviceMemoryResource upstream_mr, *args, **kwargs): - if (upstream_mr is None): raise Exception("Argument `upstream_mr` must not be None") @@ -1057,6 +1056,10 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): DeviceMemoryResource upstream_mr, DeviceMemoryResource alternate_upstream_mr, ): + if (alternate_upstream_mr is None): + raise Exception("Argument `alternate_upstream_mr` must not be None") + self.alternate_upstream_mr = alternate_upstream_mr + self.c_obj.reset( new failure_alternate_resource_adaptor[device_memory_resource]( upstream_mr.get_mr(), @@ -1070,10 +1073,14 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): DeviceMemoryResource alternate_upstream_mr, ): """ - TODO + TODO: doc """ pass + cpdef DeviceMemoryResource get_alternate_upstream(self): + return self.alternate_upstream_mr + + cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 74e26dbee..bef94a1b6 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -812,6 +812,13 @@ def dealloc_cb(ptr, size, stream, *, track: list[int]): functools.partial(dealloc_cb, track=alternate_track), ) mr = rmm.mr.FailureAlternateResourceAdaptor(main_mr, alternate_mr) + assert main_mr is mr.get_upstream() + assert alternate_mr is mr.get_alternate_upstream() + + # Delete the upstream memory resources here to check that they are + # kept alive by `mr` + del main_mr + del alternate_mr # Buffer size within the limit of `main_mr` rmm.DeviceBuffer(size=100, mr=mr) From bcaef4c8aca897cf2445fa01a220f4c6db38b4ab Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 2 Sep 2024 15:02:53 +0200 Subject: [PATCH 03/16] AlternateUpstream --- .../failure_alternate_resource_adaptor.hpp | 78 +++++++++++++------ python/rmm/rmm/_lib/memory_resource.pyx | 15 ++-- 2 files changed, 64 insertions(+), 29 deletions(-) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp index 977ea6008..7057ae959 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,23 +32,40 @@ namespace mr { * @file */ -template +/** + * @brief A device memory resource that use an alternate upstream resource when the primary throw a + * specified exception type. + * + * An instance of this resource must be constructed with two existing upstream resource in order to + * satisfy allocation requests. + * + * @tparam PrimaryUpstream The type of the primary upstream resource used for + * allocation/deallocation. + * @tparam AlternateUpstream The type of the alternate upstream resource used for + * allocation/deallocation when the primary fails. + * @tparam ExceptionType The type of exception that this adaptor should respond to. + */ +template class failure_alternate_resource_adaptor final : public device_memory_resource { public: using exception_type = ExceptionType; ///< The type of exception this object catches/throws /** - * @brief Construct a new `failure_alternate_resource_adaptor` using `upstream` to satisfy - * allocation requests. + * @brief Construct a new `failure_alternate_resource_adaptor` using `upstream` as the + * primary resource to satisfy allocation requests and if that fails, use `alternate_upstream` + * as an alternate * - * @throws rmm::logic_error if `upstream == nullptr` + * @throws rmm::logic_error if `upstream == nullptr` or `alternate_upstream == nullptr` * * @param upstream The resource used for allocating/deallocating device memory * @param alternate_upstream The resource used for alternate allocating/deallocating device * memory */ - failure_alternate_resource_adaptor(Upstream* upstream, Upstream* alternate_upstream) - : upstream_{upstream}, alternate_upstream_{alternate_upstream} + failure_alternate_resource_adaptor(PrimaryUpstream* upstream, + AlternateUpstream* alternate_upstream) + : primary_upstream_{upstream}, alternate_upstream_{alternate_upstream} { RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); RMM_EXPECTS(nullptr != alternate_upstream, @@ -69,23 +86,37 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { */ [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { - return upstream_; + return primary_upstream_; } /** - * @briefreturn{Upstream* to the upstream memory resource} + * @briefreturn{rmm::device_async_resource_ref to the alternate upstream resource} */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } + [[nodiscard]] rmm::device_async_resource_ref get_alternate_upstream_resource() const noexcept + { + return alternate_upstream_; + } - private: - using lock_guard = std::lock_guard; + /** + * @briefreturn{PrimaryUpstream* to the upstream memory resource} + */ + [[nodiscard]] PrimaryUpstream* get_upstream() const noexcept { return primary_upstream_; } + /** + * @briefreturn{AlternateUpstream* to the alternate upstream memory resource} + */ + [[nodiscard]] AlternateUpstream* get_alternate_upstream() const noexcept + { + return alternate_upstream_; + } + + private: /** * @brief Allocates memory of size at least `bytes` using the upstream * resource. * * @throws `exception_type` if the requested allocation could not be fulfilled - * by the upstream resource. + * by the primary or the alternate upstream resource. * * @param bytes The size, in bytes, of the allocation * @param stream Stream on which to perform the allocation @@ -95,10 +126,10 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { { void* ret{}; try { - ret = upstream_->allocate(bytes, stream); + ret = primary_upstream_->allocate(bytes, stream); } catch (exception_type const& e) { ret = alternate_upstream_->allocate(bytes, stream); - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); alternate_allocations_.insert(ret); } return ret; @@ -115,13 +146,13 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { { std::size_t count{0}; { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); count = alternate_allocations_.erase(ptr); } if (count > 0) { alternate_upstream_->deallocate(ptr, bytes, stream); } else { - upstream_->deallocate(ptr, bytes, stream); + primary_upstream_->deallocate(ptr, bytes, stream); } } @@ -135,13 +166,16 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } - auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } - return get_upstream_resource() == cast->get_upstream_resource(); + auto cast = + dynamic_cast const*>( + &other); + if (cast == nullptr) { return primary_upstream_->is_equal(other); } + return get_upstream_resource() == cast->get_upstream_resource() && + get_alternate_upstream_resource() == cast->get_alternate_upstream_resource(); } - Upstream* upstream_; // the upstream used for satisfying allocation requests - Upstream* alternate_upstream_; // the upstream used for satisfying alternate allocation requests + PrimaryUpstream* primary_upstream_; // the primary upstream + AlternateUpstream* alternate_upstream_; // the alternate upstream std::unordered_set alternate_allocations_; // set of alternate allocations mutable std::mutex mtx_; // Mutex for exclusive lock. }; diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index d07c27e50..f35e4ba84 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -231,12 +231,12 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ cdef extern from "rmm/mr/device/failure_alternate_resource_adaptor.hpp" \ namespace "rmm::mr" nogil: - cdef cppclass failure_alternate_resource_adaptor[Upstream]( - device_memory_resource - ): + cdef cppclass failure_alternate_resource_adaptor[ + PrimaryUpstream, AlternateUpstream + ](device_memory_resource): failure_alternate_resource_adaptor( - Upstream* upstream_mr, - Upstream* alternate_upstream_mr, + PrimaryUpstream* upstream_mr, + AlternateUpstream* alternate_upstream_mr, ) except + cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ @@ -1061,7 +1061,9 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): self.alternate_upstream_mr = alternate_upstream_mr self.c_obj.reset( - new failure_alternate_resource_adaptor[device_memory_resource]( + new failure_alternate_resource_adaptor[ + device_memory_resource, device_memory_resource + ]( upstream_mr.get_mr(), alternate_upstream_mr.get_mr(), ) @@ -1081,7 +1083,6 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): return self.alternate_upstream_mr - cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): def __cinit__( From e42fc60c8679627bcee8707c0e55798d49f85025 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 2 Sep 2024 15:06:19 +0200 Subject: [PATCH 04/16] cleanup --- .../failure_alternate_resource_adaptor.hpp | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp index 7057ae959..559a3d6ee 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -36,7 +36,7 @@ namespace mr { * @brief A device memory resource that use an alternate upstream resource when the primary throw a * specified exception type. * - * An instance of this resource must be constructed with two existing upstream resource in order to + * An instance of this resource must be constructed with two existing upstream resource to * satisfy allocation requests. * * @tparam PrimaryUpstream The type of the primary upstream resource used for @@ -53,23 +53,22 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { using exception_type = ExceptionType; ///< The type of exception this object catches/throws /** - * @brief Construct a new `failure_alternate_resource_adaptor` using `upstream` as the + * @brief Construct a new `failure_alternate_resource_adaptor` using `primary_upstream` as the * primary resource to satisfy allocation requests and if that fails, use `alternate_upstream` * as an alternate * - * @throws rmm::logic_error if `upstream == nullptr` or `alternate_upstream == nullptr` + * @throws rmm::logic_error if `primary_upstream == nullptr` or `alternate_upstream == nullptr` * - * @param upstream The resource used for allocating/deallocating device memory - * @param alternate_upstream The resource used for alternate allocating/deallocating device + * @param primary_upstream The primary resource used for allocating/deallocating device memory + * @param alternate_upstream The alternate resource used for allocating/deallocating device memory * memory */ - failure_alternate_resource_adaptor(PrimaryUpstream* upstream, + failure_alternate_resource_adaptor(PrimaryUpstream* primary_upstream, AlternateUpstream* alternate_upstream) - : primary_upstream_{upstream}, alternate_upstream_{alternate_upstream} + : primary_upstream_{primary_upstream}, alternate_upstream_{alternate_upstream} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - RMM_EXPECTS(nullptr != alternate_upstream, - "Unexpected null alternate upstream resource pointer."); + RMM_EXPECTS(nullptr != primary_upstream, "Unexpected null upstream resource pointer."); + RMM_EXPECTS(nullptr != alternate_upstream, "Unexpected null upstream resource pointer."); } failure_alternate_resource_adaptor() = delete; @@ -157,7 +156,7 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { } /** - * @brief Compare the upstream resource to another. + * @brief Compare the resource to another. * * @param other The other resource to compare to * @return true If the two resources are equivalent From cbd7f43d5093438eaa6d0fde4a3f5fe7f1147f24 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 2 Sep 2024 21:30:24 +0200 Subject: [PATCH 05/16] use device_async_resource_ref --- .../failure_alternate_resource_adaptor.hpp | 47 +++++-------------- python/rmm/rmm/_lib/memory_resource.pyx | 25 ++++++---- 2 files changed, 27 insertions(+), 45 deletions(-) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp index 559a3d6ee..1757f4ce9 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -39,15 +39,9 @@ namespace mr { * An instance of this resource must be constructed with two existing upstream resource to * satisfy allocation requests. * - * @tparam PrimaryUpstream The type of the primary upstream resource used for - * allocation/deallocation. - * @tparam AlternateUpstream The type of the alternate upstream resource used for - * allocation/deallocation when the primary fails. * @tparam ExceptionType The type of exception that this adaptor should respond to. */ -template +template class failure_alternate_resource_adaptor final : public device_memory_resource { public: using exception_type = ExceptionType; ///< The type of exception this object catches/throws @@ -57,18 +51,14 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { * primary resource to satisfy allocation requests and if that fails, use `alternate_upstream` * as an alternate * - * @throws rmm::logic_error if `primary_upstream == nullptr` or `alternate_upstream == nullptr` - * * @param primary_upstream The primary resource used for allocating/deallocating device memory * @param alternate_upstream The alternate resource used for allocating/deallocating device memory * memory */ - failure_alternate_resource_adaptor(PrimaryUpstream* primary_upstream, - AlternateUpstream* alternate_upstream) + failure_alternate_resource_adaptor(device_async_resource_ref primary_upstream, + device_async_resource_ref alternate_upstream) : primary_upstream_{primary_upstream}, alternate_upstream_{alternate_upstream} { - RMM_EXPECTS(nullptr != primary_upstream, "Unexpected null upstream resource pointer."); - RMM_EXPECTS(nullptr != alternate_upstream, "Unexpected null upstream resource pointer."); } failure_alternate_resource_adaptor() = delete; @@ -96,19 +86,6 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { return alternate_upstream_; } - /** - * @briefreturn{PrimaryUpstream* to the upstream memory resource} - */ - [[nodiscard]] PrimaryUpstream* get_upstream() const noexcept { return primary_upstream_; } - - /** - * @briefreturn{AlternateUpstream* to the alternate upstream memory resource} - */ - [[nodiscard]] AlternateUpstream* get_alternate_upstream() const noexcept - { - return alternate_upstream_; - } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -125,9 +102,9 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { { void* ret{}; try { - ret = primary_upstream_->allocate(bytes, stream); + ret = primary_upstream_.allocate_async(bytes, stream); } catch (exception_type const& e) { - ret = alternate_upstream_->allocate(bytes, stream); + ret = alternate_upstream_.allocate_async(bytes, stream); std::lock_guard lock(mtx_); alternate_allocations_.insert(ret); } @@ -149,9 +126,9 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { count = alternate_allocations_.erase(ptr); } if (count > 0) { - alternate_upstream_->deallocate(ptr, bytes, stream); + alternate_upstream_.deallocate_async(ptr, bytes, stream); } else { - primary_upstream_->deallocate(ptr, bytes, stream); + primary_upstream_.deallocate_async(ptr, bytes, stream); } } @@ -165,16 +142,14 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } - auto cast = - dynamic_cast const*>( - &other); - if (cast == nullptr) { return primary_upstream_->is_equal(other); } + auto cast = dynamic_cast(&other); + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource() && get_alternate_upstream_resource() == cast->get_alternate_upstream_resource(); } - PrimaryUpstream* primary_upstream_; // the primary upstream - AlternateUpstream* alternate_upstream_; // the alternate upstream + device_async_resource_ref primary_upstream_; // the primary upstream + device_async_resource_ref alternate_upstream_; // the alternate upstream std::unordered_set alternate_allocations_; // set of alternate allocations mutable std::mutex mtx_; // Mutex for exclusive lock. }; diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index f35e4ba84..17c67c8c3 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -32,7 +32,9 @@ from libcpp.string cimport string from cuda.cudart import cudaError_t from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice + from rmm._cuda.stream cimport Stream + from rmm._cuda.stream import DEFAULT_STREAM from rmm._lib.cuda_stream_view cimport cuda_stream_view @@ -44,6 +46,7 @@ from rmm._lib.per_device_resource cimport ( cuda_device_id, set_per_device_resource as cpp_set_per_device_resource, ) + from rmm.statistics import Statistics # Transparent handle of a C++ exception @@ -84,6 +87,10 @@ cdef extern from *: # NOTE: Keep extern declarations in .pyx file as much as possible to avoid # leaking dependencies when importing RMM Cython .pxd files + +cdef extern from "rmm/error.hpp" namespace "rmm" nogil: + cdef cppclass out_of_memory + cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass cuda_memory_resource(device_memory_resource): @@ -125,7 +132,6 @@ cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ win32 win32_kmt - cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass pool_memory_resource[Upstream](device_memory_resource): @@ -231,12 +237,15 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ cdef extern from "rmm/mr/device/failure_alternate_resource_adaptor.hpp" \ namespace "rmm::mr" nogil: - cdef cppclass failure_alternate_resource_adaptor[ - PrimaryUpstream, AlternateUpstream - ](device_memory_resource): + cdef cppclass failure_alternate_resource_adaptor[ExceptionType]( + device_memory_resource + ): + # Notice, `failure_alternate_resource_adaptor` takes `device_async_resource_ref` + # as upstream arguments but we define them as `device_memory_resource*` and + # rely on implicit type conversion. failure_alternate_resource_adaptor( - PrimaryUpstream* upstream_mr, - AlternateUpstream* alternate_upstream_mr, + device_memory_resource* upstream_mr, + device_memory_resource* alternate_upstream_mr, ) except + cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ @@ -1061,9 +1070,7 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): self.alternate_upstream_mr = alternate_upstream_mr self.c_obj.reset( - new failure_alternate_resource_adaptor[ - device_memory_resource, device_memory_resource - ]( + new failure_alternate_resource_adaptor[out_of_memory]( upstream_mr.get_mr(), alternate_upstream_mr.get_mr(), ) From 451900d5d3191b5f74e9e51ec52df53e305e2498 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 2 Sep 2024 22:08:56 +0200 Subject: [PATCH 06/16] doc --- python/rmm/rmm/_lib/memory_resource.pyx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 17c67c8c3..ba0a7c659 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -1082,7 +1082,14 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): DeviceMemoryResource alternate_upstream_mr, ): """ - TODO: doc + A memory resource that use an alternate resource when memory allocation fails. + + Parameters + ---------- + upstream : DeviceMemoryResource + The primary resource used for allocating/deallocating device memory + alternate_upstream : DeviceMemoryResource + The alternate resource used when the premary fails """ pass From bbec2917f53d26c69567354b37ad2cc17a0a7776 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 2 Sep 2024 22:19:53 +0200 Subject: [PATCH 07/16] doc --- python/rmm/rmm/_lib/memory_resource.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index ba0a7c659..d330234b6 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -241,7 +241,7 @@ cdef extern from "rmm/mr/device/failure_alternate_resource_adaptor.hpp" \ device_memory_resource ): # Notice, `failure_alternate_resource_adaptor` takes `device_async_resource_ref` - # as upstream arguments but we define them as `device_memory_resource*` and + # as upstream arguments but we define them here as `device_memory_resource*` and # rely on implicit type conversion. failure_alternate_resource_adaptor( device_memory_resource* upstream_mr, From 0324f515a4b04f8d1b3277c65f9013ab6fa81e5f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 3 Sep 2024 10:30:47 +0200 Subject: [PATCH 08/16] doc Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com> --- .../device/failure_alternate_resource_adaptor.hpp | 13 ++++++------- python/rmm/rmm/_lib/memory_resource.pyx | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp index 1757f4ce9..360b22130 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -33,11 +33,11 @@ namespace mr { */ /** - * @brief A device memory resource that use an alternate upstream resource when the primary throw a - * specified exception type. + * @brief A device memory resource that uses an alternate upstream resource when the primary upstream + * resource throws a specified exception type. * - * An instance of this resource must be constructed with two existing upstream resource to - * satisfy allocation requests. + * An instance of this resource must be constructed with two upstream resources to satisfy allocation + * requests. * * @tparam ExceptionType The type of exception that this adaptor should respond to. */ @@ -47,9 +47,8 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { using exception_type = ExceptionType; ///< The type of exception this object catches/throws /** - * @brief Construct a new `failure_alternate_resource_adaptor` using `primary_upstream` as the - * primary resource to satisfy allocation requests and if that fails, use `alternate_upstream` - * as an alternate + * @brief Construct a new `failure_alternate_resource_adaptor` that uses `primary_upstream` + * to satisfy allocation requests and if that fails with `ExceptionType`, uses `alternate_upstream`. * * @param primary_upstream The primary resource used for allocating/deallocating device memory * @param alternate_upstream The alternate resource used for allocating/deallocating device memory diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index d330234b6..393446bb8 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -1082,14 +1082,14 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): DeviceMemoryResource alternate_upstream_mr, ): """ - A memory resource that use an alternate resource when memory allocation fails. + A memory resource that uses an alternate resource when memory allocation fails. Parameters ---------- upstream : DeviceMemoryResource The primary resource used for allocating/deallocating device memory alternate_upstream : DeviceMemoryResource - The alternate resource used when the premary fails + The alternate resource used when the primary fails to allocate """ pass From 263a76a4c3bf0f532806c18ac2edb7c1129cff91 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 3 Sep 2024 10:31:18 +0200 Subject: [PATCH 09/16] style --- .../mr/device/failure_alternate_resource_adaptor.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp index 360b22130..52334b33d 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -33,11 +33,11 @@ namespace mr { */ /** - * @brief A device memory resource that uses an alternate upstream resource when the primary upstream - * resource throws a specified exception type. + * @brief A device memory resource that uses an alternate upstream resource when the primary + * upstream resource throws a specified exception type. * - * An instance of this resource must be constructed with two upstream resources to satisfy allocation - * requests. + * An instance of this resource must be constructed with two upstream resources to satisfy + * allocation requests. * * @tparam ExceptionType The type of exception that this adaptor should respond to. */ @@ -48,7 +48,8 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { /** * @brief Construct a new `failure_alternate_resource_adaptor` that uses `primary_upstream` - * to satisfy allocation requests and if that fails with `ExceptionType`, uses `alternate_upstream`. + * to satisfy allocation requests and if that fails with `ExceptionType`, uses + * `alternate_upstream`. * * @param primary_upstream The primary resource used for allocating/deallocating device memory * @param alternate_upstream The alternate resource used for allocating/deallocating device memory From db596fa4317937457aa0d45107d092cf0da05bc2 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 3 Sep 2024 10:38:20 +0200 Subject: [PATCH 10/16] doc --- .../mr/device/failure_alternate_resource_adaptor.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp index 52334b33d..5b7bee337 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp @@ -91,8 +91,8 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { * @brief Allocates memory of size at least `bytes` using the upstream * resource. * - * @throws `exception_type` if the requested allocation could not be fulfilled - * by the primary or the alternate upstream resource. + * @throws any exceptions thrown from the upstream resources, only `exception_type` + * thrown by the primary upstream is caught. * * @param bytes The size, in bytes, of the allocation * @param stream Stream on which to perform the allocation @@ -148,10 +148,10 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { get_alternate_upstream_resource() == cast->get_alternate_upstream_resource(); } - device_async_resource_ref primary_upstream_; // the primary upstream - device_async_resource_ref alternate_upstream_; // the alternate upstream - std::unordered_set alternate_allocations_; // set of alternate allocations - mutable std::mutex mtx_; // Mutex for exclusive lock. + device_async_resource_ref primary_upstream_; + device_async_resource_ref alternate_upstream_; + std::unordered_set alternate_allocations_; + mutable std::mutex mtx_; }; /** @} */ // end of group From 72ada89caad2105b510193e350df83d971019fe1 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 3 Sep 2024 13:08:06 +0200 Subject: [PATCH 11/16] c++ tests --- tests/CMakeLists.txt | 5 +- .../mr/device/failure_alternate_mr_tests.cpp | 103 ++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 tests/mr/device/failure_alternate_mr_tests.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 75b15a90b..60b8e1a90 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -145,9 +145,12 @@ ConfigureTest(STATISTICS_TEST mr/device/statistics_mr_tests.cpp GPUS 1 PERCENT 1 # tracking adaptor tests ConfigureTest(TRACKING_TEST mr/device/tracking_mr_tests.cpp GPUS 1 PERCENT 100) -# out-of-memory callback adaptor tests +# failure callback adaptor tests ConfigureTest(FAILURE_CALLBACK_TEST mr/device/failure_callback_mr_tests.cpp) +# failure alternate adaptor tests +ConfigureTest(FAILURE_ALTERNATE_TEST mr/device/failure_alternate_mr_tests.cpp) + # prefetch adaptor tests ConfigureTest(PREFETCH_ADAPTOR_TEST mr/device/prefetch_resource_adaptor_tests.cpp) diff --git a/tests/mr/device/failure_alternate_mr_tests.cpp b/tests/mr/device/failure_alternate_mr_tests.cpp new file mode 100644 index 000000000..ad985dfe8 --- /dev/null +++ b/tests/mr/device/failure_alternate_mr_tests.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace rmm::test { +namespace { + +template +struct throw_at_limit_resource final : public mr::device_memory_resource { + throw_at_limit_resource(std::size_t limit) : limit{limit} {} + + void* do_allocate(std::size_t bytes, cuda_stream_view stream) override + { + if (bytes > limit) { throw ExceptionType{"foo"}; } + void* ptr{nullptr}; + RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes)); + allocs.insert(ptr); + return ptr; + } + + void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFree(ptr)); + allocs.erase(ptr); + } + + [[nodiscard]] bool do_is_equal(mr::device_memory_resource const& other) const noexcept override + { + return this == &other; + } + + const std::size_t limit; + std::unordered_set allocs{}; +}; + +TEST(FailureAlternateTest, TrackkBothUpstreams) +{ + throw_at_limit_resource primary_mr{100}; + throw_at_limit_resource alternate_mr{1000}; + rmm::mr::failure_alternate_resource_adaptor mr{primary_mr, alternate_mr}; + + // Check that a small allocation goes to the primary resource + { + void* a1 = mr.allocate(10); + EXPECT_EQ(primary_mr.allocs, std::unordered_set{{a1}}); + EXPECT_EQ(alternate_mr.allocs, std::unordered_set{}); + mr.deallocate(a1, 10); + EXPECT_EQ(primary_mr.allocs, std::unordered_set{}); + EXPECT_EQ(alternate_mr.allocs, std::unordered_set{}); + } + + // Check that a large allocation goes to the alternate resource + { + void* a1 = mr.allocate(200); + EXPECT_EQ(primary_mr.allocs, std::unordered_set{}); + EXPECT_EQ(alternate_mr.allocs, std::unordered_set{a1}); + mr.deallocate(a1, 200); + EXPECT_EQ(primary_mr.allocs, std::unordered_set{}); + EXPECT_EQ(alternate_mr.allocs, std::unordered_set{}); + } + + // Check that the exceptions raised by the alternate isn't caught + EXPECT_THROW(mr.allocate(2000), rmm::out_of_memory); +} + +TEST(FailureAlternateTest, DifferentExceptionTypes) +{ + throw_at_limit_resource primary_mr{100}; + throw_at_limit_resource alternate_mr{1000}; + rmm::mr::failure_alternate_resource_adaptor mr{primary_mr, alternate_mr}; + + // Check that only `rmm::out_of_memory` exceptions are caught + EXPECT_THROW(mr.allocate(200), std::invalid_argument); +} + +} // namespace +} // namespace rmm::test From ff3be661f3971f123a02886e3a1e4f2d4d24a09e Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 4 Sep 2024 08:24:18 +0200 Subject: [PATCH 12/16] typo Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com> --- tests/mr/device/failure_alternate_mr_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mr/device/failure_alternate_mr_tests.cpp b/tests/mr/device/failure_alternate_mr_tests.cpp index ad985dfe8..e74d12c97 100644 --- a/tests/mr/device/failure_alternate_mr_tests.cpp +++ b/tests/mr/device/failure_alternate_mr_tests.cpp @@ -59,7 +59,7 @@ struct throw_at_limit_resource final : public mr::device_memory_resource { std::unordered_set allocs{}; }; -TEST(FailureAlternateTest, TrackkBothUpstreams) +TEST(FailureAlternateTest, TrackBothUpstreams) { throw_at_limit_resource primary_mr{100}; throw_at_limit_resource alternate_mr{1000}; From 5345aed528a495aec0162de9834177e3e55070c6 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 6 Sep 2024 08:10:58 +0200 Subject: [PATCH 13/16] rename to fallback_resource_adapater --- ...tor.hpp => fallback_resource_adapater.hpp} | 24 +++++++++---------- python/rmm/rmm/_lib/memory_resource.pxd | 2 +- python/rmm/rmm/_lib/memory_resource.pyx | 12 +++++----- python/rmm/rmm/mr.py | 4 ++-- python/rmm/rmm/tests/test_rmm.py | 4 ++-- .../mr/device/failure_alternate_mr_tests.cpp | 6 ++--- 6 files changed, 26 insertions(+), 26 deletions(-) rename include/rmm/mr/device/{failure_alternate_resource_adaptor.hpp => fallback_resource_adapater.hpp} (81%) diff --git a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp b/include/rmm/mr/device/fallback_resource_adapater.hpp similarity index 81% rename from include/rmm/mr/device/failure_alternate_resource_adaptor.hpp rename to include/rmm/mr/device/fallback_resource_adapater.hpp index 5b7bee337..f6dc5d3c1 100644 --- a/include/rmm/mr/device/failure_alternate_resource_adaptor.hpp +++ b/include/rmm/mr/device/fallback_resource_adapater.hpp @@ -42,12 +42,12 @@ namespace mr { * @tparam ExceptionType The type of exception that this adaptor should respond to. */ template -class failure_alternate_resource_adaptor final : public device_memory_resource { +class fallback_resource_adapater final : public device_memory_resource { public: using exception_type = ExceptionType; ///< The type of exception this object catches/throws /** - * @brief Construct a new `failure_alternate_resource_adaptor` that uses `primary_upstream` + * @brief Construct a new `fallback_resource_adapater` that uses `primary_upstream` * to satisfy allocation requests and if that fails with `ExceptionType`, uses * `alternate_upstream`. * @@ -55,20 +55,20 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { * @param alternate_upstream The alternate resource used for allocating/deallocating device memory * memory */ - failure_alternate_resource_adaptor(device_async_resource_ref primary_upstream, - device_async_resource_ref alternate_upstream) + fallback_resource_adapater(device_async_resource_ref primary_upstream, + device_async_resource_ref alternate_upstream) : primary_upstream_{primary_upstream}, alternate_upstream_{alternate_upstream} { } - failure_alternate_resource_adaptor() = delete; - ~failure_alternate_resource_adaptor() override = default; - failure_alternate_resource_adaptor(failure_alternate_resource_adaptor const&) = delete; - failure_alternate_resource_adaptor& operator=(failure_alternate_resource_adaptor const&) = delete; - failure_alternate_resource_adaptor(failure_alternate_resource_adaptor&&) noexcept = + fallback_resource_adapater() = delete; + ~fallback_resource_adapater() override = default; + fallback_resource_adapater(fallback_resource_adapater const&) = delete; + fallback_resource_adapater& operator=(fallback_resource_adapater const&) = delete; + fallback_resource_adapater(fallback_resource_adapater&&) noexcept = default; ///< @default_move_constructor - failure_alternate_resource_adaptor& operator=(failure_alternate_resource_adaptor&&) noexcept = - default; ///< @default_move_assignment{failure_alternate_resource_adaptor} + fallback_resource_adapater& operator=(fallback_resource_adapater&&) noexcept = + default; ///< @default_move_assignment{fallback_resource_adapater} /** * @briefreturn{rmm::device_async_resource_ref to the upstream resource} @@ -142,7 +142,7 @@ class failure_alternate_resource_adaptor final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } - auto cast = dynamic_cast(&other); + auto cast = dynamic_cast(&other); if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource() && get_alternate_upstream_resource() == cast->get_alternate_upstream_resource(); diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 139a45653..d006b75c5 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -97,7 +97,7 @@ cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): cdef object _callback -cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): +cdef class FallbackResourceAdaptor(UpstreamResourceAdaptor): cdef readonly DeviceMemoryResource alternate_upstream_mr cpdef DeviceMemoryResource get_alternate_upstream(self) diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 393446bb8..50f835021 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -235,15 +235,15 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ void* callback_arg ) except + -cdef extern from "rmm/mr/device/failure_alternate_resource_adaptor.hpp" \ +cdef extern from "rmm/mr/device/fallback_resource_adapater.hpp" \ namespace "rmm::mr" nogil: - cdef cppclass failure_alternate_resource_adaptor[ExceptionType]( + cdef cppclass fallback_resource_adapater[ExceptionType]( device_memory_resource ): - # Notice, `failure_alternate_resource_adaptor` takes `device_async_resource_ref` + # Notice, `fallback_resource_adapater` takes `device_async_resource_ref` # as upstream arguments but we define them here as `device_memory_resource*` and # rely on implicit type conversion. - failure_alternate_resource_adaptor( + fallback_resource_adapater( device_memory_resource* upstream_mr, device_memory_resource* alternate_upstream_mr, ) except + @@ -1058,7 +1058,7 @@ cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): pass -cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): +cdef class FallbackResourceAdaptor(UpstreamResourceAdaptor): def __cinit__( self, @@ -1070,7 +1070,7 @@ cdef class FailureAlternateResourceAdaptor(UpstreamResourceAdaptor): self.alternate_upstream_mr = alternate_upstream_mr self.c_obj.reset( - new failure_alternate_resource_adaptor[out_of_memory]( + new fallback_resource_adapater[out_of_memory]( upstream_mr.get_mr(), alternate_upstream_mr.get_mr(), ) diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 5a10c4e62..baca7e041 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -17,8 +17,8 @@ CudaAsyncMemoryResource, CudaMemoryResource, DeviceMemoryResource, - FailureAlternateResourceAdaptor, FailureCallbackResourceAdaptor, + FallbackResourceAdaptor, FixedSizeMemoryResource, LimitingResourceAdaptor, LoggingResourceAdaptor, @@ -62,7 +62,7 @@ "SystemMemoryResource", "TrackingResourceAdaptor", "FailureCallbackResourceAdaptor", - "FailureAlternateResourceAdaptor", + "FallbackResourceAdaptor", "UpstreamResourceAdaptor", "_flush_logs", "_initialize", diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index bef94a1b6..2c88812d9 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -787,7 +787,7 @@ def callback(nbytes: int) -> bool: assert retried[0] -def test_failure_alternate_resource_adaptor(): +def test_fallback_resource_adapater(): base = rmm.mr.CudaMemoryResource() def alloc_cb(size, stream, *, track: list[int], limit: int): @@ -811,7 +811,7 @@ def dealloc_cb(ptr, size, stream, *, track: list[int]): functools.partial(alloc_cb, track=alternate_track, limit=1000), functools.partial(dealloc_cb, track=alternate_track), ) - mr = rmm.mr.FailureAlternateResourceAdaptor(main_mr, alternate_mr) + mr = rmm.mr.FallbackResourceAdaptor(main_mr, alternate_mr) assert main_mr is mr.get_upstream() assert alternate_mr is mr.get_alternate_upstream() diff --git a/tests/mr/device/failure_alternate_mr_tests.cpp b/tests/mr/device/failure_alternate_mr_tests.cpp index e74d12c97..6ad77cc7d 100644 --- a/tests/mr/device/failure_alternate_mr_tests.cpp +++ b/tests/mr/device/failure_alternate_mr_tests.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include @@ -63,7 +63,7 @@ TEST(FailureAlternateTest, TrackBothUpstreams) { throw_at_limit_resource primary_mr{100}; throw_at_limit_resource alternate_mr{1000}; - rmm::mr::failure_alternate_resource_adaptor mr{primary_mr, alternate_mr}; + rmm::mr::fallback_resource_adapater mr{primary_mr, alternate_mr}; // Check that a small allocation goes to the primary resource { @@ -93,7 +93,7 @@ TEST(FailureAlternateTest, DifferentExceptionTypes) { throw_at_limit_resource primary_mr{100}; throw_at_limit_resource alternate_mr{1000}; - rmm::mr::failure_alternate_resource_adaptor mr{primary_mr, alternate_mr}; + rmm::mr::fallback_resource_adapater mr{primary_mr, alternate_mr}; // Check that only `rmm::out_of_memory` exceptions are caught EXPECT_THROW(mr.allocate(200), std::invalid_argument); From f31d5886d80dbf999192d10fc123082d038693a5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 6 Sep 2024 08:12:05 +0200 Subject: [PATCH 14/16] style --- python/rmm/rmm/_lib/memory_resource.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index d006b75c5..d9178db11 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -102,7 +102,6 @@ cdef class FallbackResourceAdaptor(UpstreamResourceAdaptor): cpdef DeviceMemoryResource get_alternate_upstream(self) - cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): pass From d20dc82c8f95abf0adee6b4b7b4fc9c2b792eb9a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 6 Sep 2024 14:17:42 +0200 Subject: [PATCH 15/16] rename test file --- tests/CMakeLists.txt | 4 ++-- .../{failure_alternate_mr_tests.cpp => fallback_mr_tests.cpp} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename tests/mr/device/{failure_alternate_mr_tests.cpp => fallback_mr_tests.cpp} (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 60b8e1a90..d1195a070 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -148,8 +148,8 @@ ConfigureTest(TRACKING_TEST mr/device/tracking_mr_tests.cpp GPUS 1 PERCENT 100) # failure callback adaptor tests ConfigureTest(FAILURE_CALLBACK_TEST mr/device/failure_callback_mr_tests.cpp) -# failure alternate adaptor tests -ConfigureTest(FAILURE_ALTERNATE_TEST mr/device/failure_alternate_mr_tests.cpp) +# failure fallback adaptor tests +ConfigureTest(FAILURE_ALTERNATE_TEST mr/device/fallback_mr_tests.cpp) # prefetch adaptor tests ConfigureTest(PREFETCH_ADAPTOR_TEST mr/device/prefetch_resource_adaptor_tests.cpp) diff --git a/tests/mr/device/failure_alternate_mr_tests.cpp b/tests/mr/device/fallback_mr_tests.cpp similarity index 100% rename from tests/mr/device/failure_alternate_mr_tests.cpp rename to tests/mr/device/fallback_mr_tests.cpp From 865d0101682b52a972ff1395494bb7a89a1ee144 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 10 Sep 2024 09:54:55 +0200 Subject: [PATCH 16/16] adaptor --- ...ater.hpp => fallback_resource_adaptor.hpp} | 24 +++++++++---------- python/rmm/rmm/_lib/memory_resource.pyx | 10 ++++---- python/rmm/rmm/tests/test_rmm.py | 2 +- tests/mr/device/fallback_mr_tests.cpp | 6 ++--- 4 files changed, 21 insertions(+), 21 deletions(-) rename include/rmm/mr/device/{fallback_resource_adapater.hpp => fallback_resource_adaptor.hpp} (84%) diff --git a/include/rmm/mr/device/fallback_resource_adapater.hpp b/include/rmm/mr/device/fallback_resource_adaptor.hpp similarity index 84% rename from include/rmm/mr/device/fallback_resource_adapater.hpp rename to include/rmm/mr/device/fallback_resource_adaptor.hpp index f6dc5d3c1..8083181b5 100644 --- a/include/rmm/mr/device/fallback_resource_adapater.hpp +++ b/include/rmm/mr/device/fallback_resource_adaptor.hpp @@ -42,12 +42,12 @@ namespace mr { * @tparam ExceptionType The type of exception that this adaptor should respond to. */ template -class fallback_resource_adapater final : public device_memory_resource { +class fallback_resource_adaptor final : public device_memory_resource { public: using exception_type = ExceptionType; ///< The type of exception this object catches/throws /** - * @brief Construct a new `fallback_resource_adapater` that uses `primary_upstream` + * @brief Construct a new `fallback_resource_adaptor` that uses `primary_upstream` * to satisfy allocation requests and if that fails with `ExceptionType`, uses * `alternate_upstream`. * @@ -55,20 +55,20 @@ class fallback_resource_adapater final : public device_memory_resource { * @param alternate_upstream The alternate resource used for allocating/deallocating device memory * memory */ - fallback_resource_adapater(device_async_resource_ref primary_upstream, - device_async_resource_ref alternate_upstream) + fallback_resource_adaptor(device_async_resource_ref primary_upstream, + device_async_resource_ref alternate_upstream) : primary_upstream_{primary_upstream}, alternate_upstream_{alternate_upstream} { } - fallback_resource_adapater() = delete; - ~fallback_resource_adapater() override = default; - fallback_resource_adapater(fallback_resource_adapater const&) = delete; - fallback_resource_adapater& operator=(fallback_resource_adapater const&) = delete; - fallback_resource_adapater(fallback_resource_adapater&&) noexcept = + fallback_resource_adaptor() = delete; + ~fallback_resource_adaptor() override = default; + fallback_resource_adaptor(fallback_resource_adaptor const&) = delete; + fallback_resource_adaptor& operator=(fallback_resource_adaptor const&) = delete; + fallback_resource_adaptor(fallback_resource_adaptor&&) noexcept = default; ///< @default_move_constructor - fallback_resource_adapater& operator=(fallback_resource_adapater&&) noexcept = - default; ///< @default_move_assignment{fallback_resource_adapater} + fallback_resource_adaptor& operator=(fallback_resource_adaptor&&) noexcept = + default; ///< @default_move_assignment{fallback_resource_adaptor} /** * @briefreturn{rmm::device_async_resource_ref to the upstream resource} @@ -142,7 +142,7 @@ class fallback_resource_adapater final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } - auto cast = dynamic_cast(&other); + auto cast = dynamic_cast(&other); if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource() && get_alternate_upstream_resource() == cast->get_alternate_upstream_resource(); diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx index 50f835021..7cd8a05b7 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -235,15 +235,15 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ void* callback_arg ) except + -cdef extern from "rmm/mr/device/fallback_resource_adapater.hpp" \ +cdef extern from "rmm/mr/device/fallback_resource_adaptor.hpp" \ namespace "rmm::mr" nogil: - cdef cppclass fallback_resource_adapater[ExceptionType]( + cdef cppclass fallback_resource_adaptor[ExceptionType]( device_memory_resource ): - # Notice, `fallback_resource_adapater` takes `device_async_resource_ref` + # Notice, `fallback_resource_adaptor` takes `device_async_resource_ref` # as upstream arguments but we define them here as `device_memory_resource*` and # rely on implicit type conversion. - fallback_resource_adapater( + fallback_resource_adaptor( device_memory_resource* upstream_mr, device_memory_resource* alternate_upstream_mr, ) except + @@ -1070,7 +1070,7 @@ cdef class FallbackResourceAdaptor(UpstreamResourceAdaptor): self.alternate_upstream_mr = alternate_upstream_mr self.c_obj.reset( - new fallback_resource_adapater[out_of_memory]( + new fallback_resource_adaptor[out_of_memory]( upstream_mr.get_mr(), alternate_upstream_mr.get_mr(), ) diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 2c88812d9..9379c71e8 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -787,7 +787,7 @@ def callback(nbytes: int) -> bool: assert retried[0] -def test_fallback_resource_adapater(): +def test_fallback_resource_adaptor(): base = rmm.mr.CudaMemoryResource() def alloc_cb(size, stream, *, track: list[int], limit: int): diff --git a/tests/mr/device/fallback_mr_tests.cpp b/tests/mr/device/fallback_mr_tests.cpp index 6ad77cc7d..0d6aff726 100644 --- a/tests/mr/device/fallback_mr_tests.cpp +++ b/tests/mr/device/fallback_mr_tests.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include @@ -63,7 +63,7 @@ TEST(FailureAlternateTest, TrackBothUpstreams) { throw_at_limit_resource primary_mr{100}; throw_at_limit_resource alternate_mr{1000}; - rmm::mr::fallback_resource_adapater mr{primary_mr, alternate_mr}; + rmm::mr::fallback_resource_adaptor mr{primary_mr, alternate_mr}; // Check that a small allocation goes to the primary resource { @@ -93,7 +93,7 @@ TEST(FailureAlternateTest, DifferentExceptionTypes) { throw_at_limit_resource primary_mr{100}; throw_at_limit_resource alternate_mr{1000}; - rmm::mr::fallback_resource_adapater mr{primary_mr, alternate_mr}; + rmm::mr::fallback_resource_adaptor mr{primary_mr, alternate_mr}; // Check that only `rmm::out_of_memory` exceptions are caught EXPECT_THROW(mr.allocate(200), std::invalid_argument);