From e52c099d174bb89429a91106c70cbdd689aa40df Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 10:32:30 -0700 Subject: [PATCH 01/34] Fix and unify `waitRequests` in a template --- cpp/tests/include/utils.h | 23 +++++++++++++++++------ cpp/tests/request.cpp | 2 +- cpp/tests/utils.cpp | 24 ------------------------ cpp/tests/worker.cpp | 2 +- 4 files changed, 19 insertions(+), 32 deletions(-) diff --git a/cpp/tests/include/utils.h b/cpp/tests/include/utils.h index aeb7ef82..ca6cf36f 100644 --- a/cpp/tests/include/utils.h +++ b/cpp/tests/include/utils.h @@ -22,13 +22,24 @@ enum class ProgressMode { void createCudaContextCallback(void* callbackArg); +template void waitRequests(std::shared_ptr worker, - const std::vector>& requests, - const std::function& progressWorker); - -void waitRequestsTagMulti(std::shared_ptr worker, - const std::vector>& requests, - const std::function& progressWorker); + const std::vector>& requests, + const std::function& progressWorker) +{ + auto remainingRequests = requests; + while (!remainingRequests.empty()) { + auto updatedRequests = std::exchange(remainingRequests, decltype(remainingRequests)()); + remainingRequests.clear(); + for (auto& r : updatedRequests) { + if (progressWorker) progressWorker(); + if (!r->isCompleted()) + remainingRequests.push_back(r); + else + r->checkError(); + } + } +} std::function getProgressFunction(std::shared_ptr worker, ProgressMode progressMode); diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp index 182c3f23..7958a20b 100644 --- a/cpp/tests/request.cpp +++ b/cpp/tests/request.cpp @@ -185,7 +185,7 @@ TEST_P(RequestTest, ProgressTagMulti) std::vector> requests; requests.push_back(_ep->tagMultiSend(_sendPtr, multiSize, multiIsCUDA, 0, false)); requests.push_back(_ep->tagMultiRecv(0, false)); - waitRequestsTagMulti(_worker, requests, _progressWorker); + waitRequests(_worker, requests, _progressWorker); auto recvRequest = requests[1]; diff --git a/cpp/tests/utils.cpp b/cpp/tests/utils.cpp index 7c5a9a0f..daf4ab97 100644 --- a/cpp/tests/utils.cpp +++ b/cpp/tests/utils.cpp @@ -13,30 +13,6 @@ void createCudaContextCallback(void* callbackArg) cudaFree(0); } -void waitRequests(std::shared_ptr worker, - const std::vector>& requests, - const std::function& progressWorker) -{ - for (auto& r : requests) { - do { - if (progressWorker) progressWorker(); - } while (!r->isCompleted()); - r->checkError(); - } -} - -void waitRequestsTagMulti(std::shared_ptr worker, - const std::vector>& requests, - const std::function& progressWorker) -{ - for (auto& r : requests) { - do { - if (progressWorker) progressWorker(); - } while (!r->isCompleted()); - r->checkError(); - } -} - std::function getProgressFunction(std::shared_ptr worker, ProgressMode progressMode) { diff --git a/cpp/tests/worker.cpp b/cpp/tests/worker.cpp index 89a0ee32..e1e08842 100644 --- a/cpp/tests/worker.cpp +++ b/cpp/tests/worker.cpp @@ -121,7 +121,7 @@ TEST_P(WorkerProgressTest, ProgressTagMulti) std::vector> requests; requests.push_back(ep->tagMultiSend(multiBuffer, multiSize, multiIsCUDA, 0, false)); requests.push_back(ep->tagMultiRecv(0, false)); - waitRequestsTagMulti(_worker, requests, _progressWorker); + waitRequests(_worker, requests, _progressWorker); for (const auto& br : requests[1]->_bufferRequests) { // br->buffer == nullptr are headers From 2a3c0cb67214cfab2eedf1cc7f5ba9a6cc0131dc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 11:58:43 -0700 Subject: [PATCH 02/34] Add new `RequestAM` class to handle Active Messages --- cpp/CMakeLists.txt | 2 + cpp/include/ucxx/constructors.h | 13 ++ cpp/include/ucxx/internal/request_am.h | 60 ++++++ cpp/include/ucxx/request.h | 2 + cpp/include/ucxx/request_am.h | 73 ++++++++ cpp/include/ucxx/typedefs.h | 3 + cpp/src/internal/request_am.cpp | 38 ++++ cpp/src/request.cpp | 2 + cpp/src/request_am.cpp | 245 +++++++++++++++++++++++++ 9 files changed, 438 insertions(+) create mode 100644 cpp/include/ucxx/internal/request_am.h create mode 100644 cpp/include/ucxx/request_am.h create mode 100644 cpp/src/internal/request_am.cpp create mode 100644 cpp/src/request_am.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 91a3f212..3dab7b36 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -112,9 +112,11 @@ add_library( src/endpoint.cpp src/header.cpp src/inflight_requests.cpp + src/internal/request_am.cpp src/listener.cpp src/log.cpp src/request.cpp + src/request_am.cpp src/request_helper.cpp src/request_stream.cpp src/request_tag.cpp diff --git a/cpp/include/ucxx/constructors.h b/cpp/include/ucxx/constructors.h index bf0422be..e75a0ea1 100644 --- a/cpp/include/ucxx/constructors.h +++ b/cpp/include/ucxx/constructors.h @@ -19,6 +19,7 @@ class Future; class Listener; class Notifier; class Request; +class RequestAM; class RequestStream; class RequestTag; class RequestTagMulti; @@ -53,6 +54,18 @@ std::shared_ptr createWorker(std::shared_ptr context, const bool enableDelayedSubmission); // Transfers +std::shared_ptr createRequestAMSend(std::shared_ptr endpoint, + void* buffer, + size_t length, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData); + +std::shared_ptr createRequestAMRecv(std::shared_ptr endpointOrWorker, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData); + std::shared_ptr createRequestStream(std::shared_ptr endpoint, bool send, void* buffer, diff --git a/cpp/include/ucxx/internal/request_am.h b/cpp/include/ucxx/internal/request_am.h new file mode 100644 index 00000000..4fdbf0c5 --- /dev/null +++ b/cpp/include/ucxx/internal/request_am.h @@ -0,0 +1,60 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: BSD-3-Clause + */ +#include +#include +#include +#include +#include + +#include + +#include + +namespace ucxx { + +class Buffer; +class InflightRequests; +class RequestAM; +class Request; +class Worker; + +namespace internal { + +class AmData; + +class RecvAmMessage { + public: + internal::AmData* _amData{nullptr}; + ucp_ep_h _ep{nullptr}; + std::shared_ptr _request{nullptr}; + std::shared_ptr _buffer{nullptr}; + + RecvAmMessage(internal::AmData* amData, + ucp_ep_h ep, + std::shared_ptr request, + std::shared_ptr buffer); + + void setUcpRequest(void* request); + + void callback(void* request, ucs_status_t status); +}; + +typedef std::unordered_map>> AmPoolType; +typedef std::unordered_map> RecvAmMessageMapType; + +class AmData { + public: + std::shared_ptr _worker{nullptr}; + AmPoolType _recvPool{}; + AmPoolType _recvWait{}; + RecvAmMessageMapType _recvAmMessageMap{}; + std::mutex _mutex{}; + std::function)> _registerInflightRequest{}; + std::unordered_map _allocators{}; +}; + +} // namespace internal + +} // namespace ucxx diff --git a/cpp/include/ucxx/request.h b/cpp/include/ucxx/request.h index e9326403..87017668 100644 --- a/cpp/include/ucxx/request.h +++ b/cpp/include/ucxx/request.h @@ -192,6 +192,8 @@ class Request : public Component { * @returns the formatted string containing the owner type and its handle. */ const std::string& getOwnerString() const; + + virtual std::shared_ptr getRecvBuffer(); }; } // namespace ucxx diff --git a/cpp/include/ucxx/request_am.h b/cpp/include/ucxx/request_am.h new file mode 100644 index 00000000..76994e94 --- /dev/null +++ b/cpp/include/ucxx/request_am.h @@ -0,0 +1,73 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: BSD-3-Clause + */ +#pragma once +#include +#include + +#include + +#include +#include +#include + +namespace ucxx { + +class Buffer; + +namespace internal { +class RecvAmMessage; +} // namespace internal + +class RequestAM : public Request { + private: + friend class internal::RecvAmMessage; + + ucs_memory_type_t _sendHeader{}; ///< The header to send + std::shared_ptr _buffer{nullptr}; ///< The AM received message buffer + + RequestAM(std::shared_ptr endpoint, + void* buffer, + size_t length, + const bool enablePythonFuture = false, + RequestCallbackUserFunction callbackFunction = nullptr, + RequestCallbackUserData callbackData = nullptr); + + RequestAM(std::shared_ptr endpointOrWorker, + const bool enablePythonFuture = false, + RequestCallbackUserFunction callbackFunction = nullptr, + RequestCallbackUserData callbackData = nullptr); + + public: + friend std::shared_ptr createRequestAMSend( + std::shared_ptr endpoint, + void* buffer, + size_t length, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData); + + friend std::shared_ptr createRequestAMRecv( + std::shared_ptr endpointOrWorker, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData); + + virtual void populateDelayedSubmission(); + + void request(); + + static void amSendCallback(void* request, ucs_status_t status, void* user_data); + + static ucs_status_t recvCallback(void* arg, + const void* header, + size_t header_length, + void* data, + size_t length, + const ucp_am_recv_param_t* param); + + std::shared_ptr getRecvBuffer() override; +}; + +} // namespace ucxx diff --git a/cpp/include/ucxx/typedefs.h b/cpp/include/ucxx/typedefs.h index 6cea3576..1c8b1933 100644 --- a/cpp/include/ucxx/typedefs.h +++ b/cpp/include/ucxx/typedefs.h @@ -11,6 +11,7 @@ namespace ucxx { +class Buffer; class Request; // Logging levels @@ -36,4 +37,6 @@ typedef std::unordered_map ConfigMap; typedef std::function)> RequestCallbackUserFunction; typedef std::shared_ptr RequestCallbackUserData; +typedef std::function(size_t)> AmAllocatorType; + } // namespace ucxx diff --git a/cpp/src/internal/request_am.cpp b/cpp/src/internal/request_am.cpp new file mode 100644 index 00000000..5006a86a --- /dev/null +++ b/cpp/src/internal/request_am.cpp @@ -0,0 +1,38 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: BSD-3-Clause + */ +#include +#include +#include +#include + +namespace ucxx { + +namespace internal { + +RecvAmMessage::RecvAmMessage(internal::AmData* amData, + ucp_ep_h ep, + std::shared_ptr request, + std::shared_ptr buffer) + : _amData(amData), _ep(ep), _request(request), _buffer(buffer) +{ + _request->_delayedSubmission = + std::make_shared(false, _buffer->data(), _buffer->getSize()); +} + +void RecvAmMessage::setUcpRequest(void* request) { _request->_request = request; } + +void RecvAmMessage::callback(void* request, ucs_status_t status) +{ + _request->_buffer = _buffer; + _request->callback(request, status); + { + std::lock_guard lock(_amData->_mutex); + _amData->_recvAmMessageMap.erase(_request.get()); + } +} + +} // namespace internal + +} // namespace ucxx diff --git a/cpp/src/request.cpp b/cpp/src/request.cpp index 89c32abd..2e19c1ca 100644 --- a/cpp/src/request.cpp +++ b/cpp/src/request.cpp @@ -184,4 +184,6 @@ void Request::setStatus(ucs_status_t status) const std::string& Request::getOwnerString() const { return _ownerString; } +std::shared_ptr Request::getRecvBuffer() { return nullptr; } + } // namespace ucxx diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp new file mode 100644 index 00000000..e026b813 --- /dev/null +++ b/cpp/src/request_am.cpp @@ -0,0 +1,245 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: BSD-3-Clause + */ +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace ucxx { + +std::shared_ptr createRequestAMSend( + std::shared_ptr endpoint, + void* buffer, + size_t length, + const bool enablePythonFuture = false, + RequestCallbackUserFunction callbackFunction = nullptr, + RequestCallbackUserData callbackData = nullptr) +{ + return std::shared_ptr( + new RequestAM(endpoint, buffer, length, enablePythonFuture, callbackFunction, callbackData)); +} + +std::shared_ptr createRequestAMRecv( + std::shared_ptr endpointOrWorker, + const bool enablePythonFuture = false, + RequestCallbackUserFunction callbackFunction = nullptr, + RequestCallbackUserData callbackData = nullptr) +{ + auto endpoint = std::dynamic_pointer_cast(endpointOrWorker); + auto worker = endpoint->getWorker(endpoint->getParent()); + + auto createRequest = [endpointOrWorker, enablePythonFuture, callbackFunction, callbackData]() { + return std::shared_ptr( + new RequestAM(endpointOrWorker, enablePythonFuture, callbackFunction, callbackData)); + }; + return worker->getAmRecv(endpoint->getHandle(), createRequest); +} + +RequestAM::RequestAM(std::shared_ptr endpoint, + void* buffer, + size_t length, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData) + : Request(endpoint, + std::make_shared(true, buffer, length), + std::string("amSend"), + enablePythonFuture) +{ + if (_endpoint == nullptr) throw ucxx::Error("An endpoint is required to send active messages"); + + _callback = callbackFunction; + _callbackData = callbackData; + + // A delayed notification request is not populated immediately, instead it is + // delayed to allow the worker progress thread to set its status, and more + // importantly the Python future later on, so that we don't need the GIL here. + _worker->registerDelayedSubmission( + std::bind(std::mem_fn(&Request::populateDelayedSubmission), this)); +} + +RequestAM::RequestAM(std::shared_ptr endpointOrWorker, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData) + : Request(endpointOrWorker, nullptr, std::string("amRecv"), enablePythonFuture) +{ + _callback = callbackFunction; + _callbackData = callbackData; +} + +void RequestAM::amSendCallback(void* request, ucs_status_t status, void* user_data) +{ + Request* req = reinterpret_cast(user_data); + ucxx_trace_req_f(req->getOwnerString().c_str(), request, "amSend", "amSendCallback"); + req->callback(request, status); +} + +static void _recvCompletedCallback(void* request, + ucs_status_t status, + size_t length, + void* user_data) +{ + internal::RecvAmMessage* recvAmMessage = static_cast(user_data); + ucxx_trace_req_f( + recvAmMessage->_request->getOwnerString().c_str(), request, "amRecv", "amRecvCallback"); + recvAmMessage->callback(request, status); +} + +ucs_status_t RequestAM::recvCallback(void* arg, + const void* header, + size_t header_length, + void* data, + size_t length, + const ucp_am_recv_param_t* param) +{ + internal::AMData* amData = reinterpret_cast(arg); + auto worker = amData->_worker; + auto& recvPool = amData->_recvPool; + auto& recvWait = amData->_recvWait; + + if ((param->recv_attr & UCP_AM_RECV_ATTR_FIELD_REPLY_EP) == 0) + ucxx_error("UCP_AM_RECV_ATTR_FIELD_REPLY_EP not set"); + + ucp_ep_h ep = param->reply_ep; + recvPool.try_emplace(ep, std::queue>()); + + bool is_rndv = param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV; + + std::shared_ptr buf{nullptr}; + auto allocatorType = *reinterpret_cast(header); + + std::shared_ptr req{nullptr}; + + { + std::lock_guard lock(amData->_mutex); + + if (recvWait.find(ep) != recvWait.end() && !recvWait[ep].empty()) { + req = recvWait.at(ep).front(); + recvWait.at(ep).pop(); + ucxx_trace_req("amRecv recvWait: %p", req.get()); + } else { + req = std::shared_ptr( + new RequestAM(worker, worker->isFutureEnabled(), nullptr, nullptr)); + recvPool.at(ep).push(req); + ucxx_trace_req("amRecv recvPool: %p", req.get()); + } + } + + if (is_rndv) { + if (allocatorType == UCS_MEMORY_TYPE_HOST || allocatorType == UCS_MEMORY_TYPE_CUDA) { + buf = amData->_allocators.at(allocatorType)(length); + } else { + ucxx_debug("Unsupported memory type %d", allocatorType); + internal::RecvAmMessage recvAmMessage(amData, ep, req, nullptr); + recvAmMessage.callback(nullptr, UCS_ERR_UNSUPPORTED); + return UCS_ERR_UNSUPPORTED; + } + + auto recvAmMessage = std::make_shared(amData, ep, req, buf); + + ucp_request_param_t request_param = {.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL, + .cb = {.recv_am = _recvCompletedCallback}, + .user_data = recvAmMessage.get()}; + + ucs_status_ptr_t status = + ucp_am_recv_data_nbx(worker->getHandle(), data, buf->data(), length, &request_param); + + ucxx_debug("am rndv: ep %p len %lu", ep, length); + + if (req->isCompleted()) { + // The request completed/errored immediately + ucs_status_t s = UCS_PTR_STATUS(status); + recvAmMessage->callback(nullptr, s); + + return s; + } else { + // The request will be handled by the callback + recvAmMessage->setUcpRequest(status); + amData->_registerInflightRequest(req); + + { + std::lock_guard lock(amData->_mutex); + amData->_recvAmMessageMap.emplace(req.get(), recvAmMessage); + } + + return UCS_INPROGRESS; + } + } else { + ucxx_debug("am eager copying %lu bytes with ep %p", length, ep); + + buf = amData->_allocators.at(UCS_MEMORY_TYPE_HOST)(length); + if (length > 0) memcpy(buf->data(), data, length); + + internal::RecvAmMessage recvAmMessage(amData, ep, req, buf); + recvAmMessage.callback(nullptr, UCS_OK); + return UCS_OK; + } +} + +std::shared_ptr RequestAM::getRecvBuffer() { return _buffer; } + +void RequestAM::request() +{ + static const ucp_tag_t tagMask = -1; + + ucp_request_param_t param = {.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_FLAGS | + UCP_OP_ATTR_FIELD_USER_DATA, + .flags = UCP_AM_SEND_FLAG_REPLY, + .datatype = ucp_dt_make_contig(1), + .user_data = this}; + + // TODO: add CUDA support + _sendHeader = UCS_MEMORY_TYPE_HOST; + + if (_delayedSubmission->_send) { + param.cb.send = amSendCallback; + _request = ucp_am_send_nbx(_endpoint->getHandle(), + 0, + &_sendHeader, + sizeof(_sendHeader), + _delayedSubmission->_buffer, + _delayedSubmission->_length, + ¶m); + } else { + throw ucxx::UnsupportedError( + "Receiving active messages must be handled by the worker's callback"); + } +} + +void RequestAM::populateDelayedSubmission() +{ + request(); + + if (_enablePythonFuture) + ucxx_trace_req_f(_ownerString.c_str(), + _request, + _operationName.c_str(), + "buffer %p, size %lu, future %p, future handle %p, populateDelayedSubmission", + _delayedSubmission->_buffer, + _delayedSubmission->_length, + _future.get(), + _future->getHandle()); + else + ucxx_trace_req_f(_ownerString.c_str(), + _request, + _operationName.c_str(), + "buffer %p, size %lu, populateDelayedSubmission", + _delayedSubmission->_buffer, + _delayedSubmission->_length); + + process(); +} + +} // namespace ucxx From b34d383d76fd3e7cbbb88739b0724a653d37d80d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 12:01:39 -0700 Subject: [PATCH 03/34] Register AM handler in `ucxx::Worker` --- cpp/include/ucxx/worker.h | 60 +++++++++++++++++++++++++++++++++++++++ cpp/src/worker.cpp | 54 ++++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index bb1e3ac7..f66787f0 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -25,8 +25,14 @@ namespace ucxx { class Address; +class Buffer; class Endpoint; class Listener; +class RequestAM; + +namespace internal { +class AmData; +} // namespace internal class Worker : public Component { private: @@ -48,6 +54,12 @@ class Worker : public Component { std::shared_ptr _delayedSubmissionCollection{ nullptr}; ///< Collection of enqueued delayed submissions + friend std::shared_ptr createRequestAMRecv( + std::shared_ptr endpointOrWorker, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData); + protected: bool _enableFuture{ false}; ///< Boolean identifying whether the worker was created with future capability @@ -55,6 +67,8 @@ class Worker : public Component { std::queue> _futuresPool{}; ///< Futures pool to prevent running out of fresh futures std::shared_ptr _notifier{nullptr}; ///< Notifier object + std::shared_ptr + _amData; ///< Worker data made available to Active Messages callback private: /** @@ -65,6 +79,23 @@ class Worker : public Component { */ void drainWorkerTagRecv(); + /** + * @brief Get active message receive request. + * + * Returns an active message request from the pool if the worker has already begun + * handling a request with the active messages callback, otherwise creates a new request + * that is later populated with status and buffer by the active messages callback. + * + * @param[in] ep the endpoint handle where receiving the message, the same handle that + * will later be used to reply to the message. + * @param[in] createAmRecvRequestFunction function to create a new request if one is not + * already availale in the pool. + * + * @returns Request to be subsequently checked for the completion state and data. + */ + std::shared_ptr getAmRecv( + ucp_ep_h ep, std::function()> createAmRecvRequestFunction); + /** * @brief Stop the progress thread if running without raising warnings. * @@ -643,6 +674,35 @@ class Worker : public Component { std::shared_ptr createListener(uint16_t port, ucp_listener_conn_callback_t callback, void* callbackArgs); + + /** + * @brief Register allocator for active messages. + * + * Register a new allocator for active messages. By default, only one allocator is defined + * for host memory (`UCS_MEMORY_TYPE_HOST`), and is used as a fallback when an allocator + * for the source's memory type is unavailable. In many certain circumstances relying + * exclusively on the host allocator is undesirable, for example when transferring CUDA + * buffers the destination is always going to be a host buffer and prevent the use of + * transports like NVLink or InfiniBand+GPUDirectRDMA. For that reason it's important that + * the user defines those allocators that are important for the application. + * + * If the `memoryType` has already been registered, the previous allocator will be + * replaced by the new one. Be careful when doing this after transfers have started, there + * are no guarantees that inflight messages have not already been allocated with the old + * allocator for that type. + * + * @code{.cpp} + * // context is `std::shared_ptr` + * auto worker = context->createWorker(false); + * + * worker->registerAmAllocator(`UCS_MEMORY_TYPE_CUDA`, ucxx::RMMBuffer); + * @endcode + * + * @param[in] memoryType the memory type the allocator will be used for. + * @param[in] allocator the allocator callbable that will be used to allocate new + * active message buffers. + */ + void registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType allocator); }; } // namespace ucxx diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index 72cda21e..89fa133c 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -15,6 +15,9 @@ #include #include +#include +#include +#include #include #include #include @@ -34,6 +37,24 @@ Worker::Worker(std::shared_ptr context, const bool enableDelayedSubmiss if (enableDelayedSubmission) _delayedSubmissionCollection = std::make_shared(); + if (context->getFeatureFlags() & UCP_FEATURE_AM) { + unsigned int AM_MSG_ID = 0; + _amData = std::make_shared(); + _amData->_registerInflightRequest = [this](std::shared_ptr req) { + this->registerInflightRequest(req); + }; + registerAmAllocator(UCS_MEMORY_TYPE_HOST, + [](size_t length) { return std::make_shared(length); }); + + ucp_am_handler_param_t am_handler_param = {.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG, + .id = AM_MSG_ID, + .cb = RequestAM::recvCallback, + .arg = _amData.get()}; + utils::ucsErrorThrow(ucp_worker_set_am_recv_handler(_handle, &am_handler_param)); + } + ucxx_trace("Worker created: %p, enableDelayedSubmission: %d, enableFuture: %d", this, enableDelayedSubmission, @@ -81,10 +102,36 @@ void Worker::drainWorkerTagRecv() } } +std::shared_ptr Worker::getAmRecv( + ucp_ep_h ep, std::function()> createAmRecvRequestFunction) +{ + std::lock_guard lock(_amData->_mutex); + + auto& recvPool = _amData->_recvPool; + auto& recvWait = _amData->_recvWait; + + if (recvPool.find(ep) != recvPool.end() && !recvPool.at(ep).empty()) { + auto req = recvPool.at(ep).front(); + recvPool.at(ep).pop(); + return req; + } else { + auto req = createAmRecvRequestFunction(); + recvWait.try_emplace(ep, std::queue>()); + recvWait.at(ep).push(req); + return req; + } +} + std::shared_ptr createWorker(std::shared_ptr context, const bool enableDelayedSubmission) { - return std::shared_ptr(new Worker(context, enableDelayedSubmission)); + auto worker = std::shared_ptr(new Worker(context, enableDelayedSubmission)); + + // We can only get a `shared_ptr` for the Active Messages callback after it's + // been created, thus this cannot be in the constructor. + if (worker->_amData != nullptr) worker->_amData->_worker = worker; + + return worker; } Worker::~Worker() @@ -368,4 +415,9 @@ std::shared_ptr Worker::createListener(uint16_t port, return listener; } +void Worker::registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType allocator) +{ + _amData->_allocators.insert_or_assign(memoryType, allocator); +} + } // namespace ucxx From c9be9bdedd5ec2452634647f8aeef07334794c8a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 12:02:23 -0700 Subject: [PATCH 04/34] Expose Active Messages via `ucxx::Endpoint` --- cpp/include/ucxx/endpoint.h | 51 +++++++++++++++++++++++++++++++++++++ cpp/src/endpoint.cpp | 21 +++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/cpp/include/ucxx/endpoint.h b/cpp/include/ucxx/endpoint.h index a8ba3bd7..91feea9b 100644 --- a/cpp/include/ucxx/endpoint.h +++ b/cpp/include/ucxx/endpoint.h @@ -253,6 +253,57 @@ class Endpoint : public Component { */ void setCloseCallback(std::function closeCallback, void* closeCallbackArg); + /** + * @brief Enqueue an active message send operation. + * + * Enqueue an active message send operation, returning a `std::shared` that + * can be later awaited and checked for errors. This is a non-blocking operation, and the + * status of the transfer must be verified from the resulting request object before the + * data can be released. + * + * Using a Python future may be requested by specifying `enablePythonFuture`. If a + * Python future is requested, the Python application must then await on this future to + * ensure the transfer has completed. Requires UCXX Python support. + * + * @param[in] buffer a raw pointer to the data to be sent. + * @param[in] length the size in bytes of the tag message to be sent. + * @param[in] enablePythonFuture whether a python future should be created and + * subsequently notified. + * @param[in] callbackFunction user-defined callback function to call upon completion. + * @param[in] callbackData user-defined data to pass to the `callbackFunction`. + * + * @returns Request to be subsequently checked for the completion and its state. + */ + std::shared_ptr amSend(void* buffer, + size_t length, + const bool enablePythonFuture = false, + RequestCallbackUserFunction callbackFunction = nullptr, + RequestCallbackUserData callbackData = nullptr); + + /** + * @brief Enqueue an active message receive operation. + * + * Enqueue an active message receive operation, returning a `std::shared` + * that can be later awaited and checked for errors, making data available via the return + * value's `getRecvBuffer()` method once the operation completes successfully. This is a + * non-blocking operation, and the status of the transfer must be verified from the + * resulting request object before the data can be consumed. + * + * Using a Python future may be requested by specifying `enablePythonFuture`. If a + * Python future is requested, the Python application must then await on this future to + * ensure the transfer has completed. Requires UCXX Python support. + * + * @param[in] enablePythonFuture whether a python future should be created and + * subsequently notified. + * @param[in] callbackFunction user-defined callback function to call upon completion. + * @param[in] callbackData user-defined data to pass to the `callbackFunction`. + * + * @returns Request to be subsequently checked for the completion state and data. + */ + std::shared_ptr amRecv(const bool enablePythonFuture = false, + RequestCallbackUserFunction callbackFunction = nullptr, + RequestCallbackUserData callbackData = nullptr); + /** * @brief Enqueue a stream send operation. * diff --git a/cpp/src/endpoint.cpp b/cpp/src/endpoint.cpp index c2ec9e00..8c5b486b 100644 --- a/cpp/src/endpoint.cpp +++ b/cpp/src/endpoint.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -191,6 +192,26 @@ void Endpoint::removeInflightRequest(const Request* const request) size_t Endpoint::cancelInflightRequests() { return _inflightRequests->cancelAll(); } +std::shared_ptr Endpoint::amSend(void* buffer, + size_t length, + const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData) +{ + auto endpoint = std::dynamic_pointer_cast(shared_from_this()); + return registerInflightRequest(createRequestAMSend( + endpoint, buffer, length, enablePythonFuture, callbackFunction, callbackData)); +} + +std::shared_ptr Endpoint::amRecv(const bool enablePythonFuture, + RequestCallbackUserFunction callbackFunction, + RequestCallbackUserData callbackData) +{ + auto endpoint = std::dynamic_pointer_cast(shared_from_this()); + return registerInflightRequest( + createRequestAMRecv(endpoint, enablePythonFuture, callbackFunction, callbackData)); +} + std::shared_ptr Endpoint::streamSend(void* buffer, size_t length, const bool enablePythonFuture) From 8cfce9952c4b05dd08873e18f64cc98ae887e687 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 12:02:46 -0700 Subject: [PATCH 05/34] Add C++ tests for Active Messages --- cpp/tests/request.cpp | 23 +++++++++++++++++++++++ cpp/tests/worker.cpp | 27 +++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp index 7958a20b..2072773d 100644 --- a/cpp/tests/request.cpp +++ b/cpp/tests/request.cpp @@ -134,6 +134,29 @@ class RequestTest } }; +TEST_P(RequestTest, ProgressAm) +{ + if (_progressMode == ProgressMode::Wait) { + GTEST_SKIP() << "Interrupting UCP worker progress operation in wait mode is not possible"; + } + + allocate(1, false); + + // Submit and wait for transfers to complete + std::vector> requests; + requests.push_back(_ep->amSend(_sendPtr[0], _messageSize)); + requests.push_back(_ep->amRecv()); + waitRequests(_worker, requests, _progressWorker); + + auto recvReq = requests[1]; + _recvPtr[0] = recvReq->getRecvBuffer()->data(); + + copyResults(); + + // Assert data correctness + ASSERT_THAT(_recv[0], ContainerEq(_send[0])); +} + TEST_P(RequestTest, ProgressStream) { allocate(); diff --git a/cpp/tests/worker.cpp b/cpp/tests/worker.cpp index e1e08842..c008fb86 100644 --- a/cpp/tests/worker.cpp +++ b/cpp/tests/worker.cpp @@ -72,6 +72,33 @@ TEST_F(WorkerTest, TagProbe) ASSERT_TRUE(_worker->tagProbe(0)); } +TEST_P(WorkerProgressTest, ProgressAm) +{ + if (_progressMode == ProgressMode::Wait) { + // TODO: Is this the same reason as TagMulti? + GTEST_SKIP() << "Wait mode not supported"; + } + + auto ep = _worker->createEndpointFromWorkerAddress(_worker->getAddress()); + + std::vector send{123}; + + std::vector> requests; + requests.push_back(ep->amSend(send.data(), send.size() * sizeof(int))); + requests.push_back(ep->amRecv()); + waitRequests(_worker, requests, _progressWorker); + + auto recvReq = requests[1]; + auto recvBuffer = recvReq->getRecvBuffer(); + + ASSERT_EQ(recvBuffer->getType(), ucxx::BufferType::Host); + ASSERT_EQ(recvBuffer->getSize(), send.size() * sizeof(int)); + + std::vector recvAbstract(reinterpret_cast(recvBuffer->data()), + reinterpret_cast(recvBuffer->data()) + send.size()); + ASSERT_EQ(recvAbstract[0], send[0]); +} + TEST_P(WorkerProgressTest, ProgressStream) { auto ep = _worker->createEndpointFromWorkerAddress(_worker->getAddress()); From 7725db957baaff938ff6eabaf0d6fa4ff483b794 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 12:36:34 -0700 Subject: [PATCH 06/34] Document `ucxx::RequestAM` --- cpp/include/ucxx/constructors.h | 2 +- cpp/include/ucxx/internal/request_am.h | 55 ++++++++--- cpp/include/ucxx/request.h | 8 ++ cpp/include/ucxx/request_am.h | 128 ++++++++++++++++++++++++- cpp/include/ucxx/worker.h | 12 +-- cpp/src/request_am.cpp | 21 ++-- 6 files changed, 193 insertions(+), 33 deletions(-) diff --git a/cpp/include/ucxx/constructors.h b/cpp/include/ucxx/constructors.h index e75a0ea1..5dd0dd80 100644 --- a/cpp/include/ucxx/constructors.h +++ b/cpp/include/ucxx/constructors.h @@ -61,7 +61,7 @@ std::shared_ptr createRequestAMSend(std::shared_ptr endpoin RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); -std::shared_ptr createRequestAMRecv(std::shared_ptr endpointOrWorker, +std::shared_ptr createRequestAMRecv(std::shared_ptr endpoint, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); diff --git a/cpp/include/ucxx/internal/request_am.h b/cpp/include/ucxx/internal/request_am.h index 4fdbf0c5..9b18faee 100644 --- a/cpp/include/ucxx/internal/request_am.h +++ b/cpp/include/ucxx/internal/request_am.h @@ -26,18 +26,47 @@ class AmData; class RecvAmMessage { public: - internal::AmData* _amData{nullptr}; - ucp_ep_h _ep{nullptr}; - std::shared_ptr _request{nullptr}; - std::shared_ptr _buffer{nullptr}; + internal::AmData* _amData{nullptr}; ///< Active messages data + ucp_ep_h _ep{nullptr}; ///< Handle containing address of the reply endpoint + std::shared_ptr _request{ + nullptr}; ///< Request which will later be notified/delivered to user + std::shared_ptr _buffer{nullptr}; ///< Buffer containing the received data + /** + * @brief Constructor of `ucxx::RecvAmMessage`. + * + * Construct the object, setting attributes that are later needed by the callback. + * + * @param[in] amData active messages worker data. + * @param[in] ep handle containing address of the reply endpoint (i.e., endpoint + * where user is requesting to receive). + * @param[in] request request to be later notified/delivered to user. + * @param[in] buffer buffer containing the received data + */ RecvAmMessage(internal::AmData* amData, ucp_ep_h ep, std::shared_ptr request, std::shared_ptr buffer); + /** + * @brief Set the UCP request. + * + * Set the underlying UCP request (`_request` attribute) of the `RequestAM`. + * + * @param[in] request the UCP request associated to the active message receive operation. + */ void setUcpRequest(void* request); + /** + * @brief Execute the `ucxx::Request::callback()`. + * + * Execute the `ucxx::Request::callback()` method to set the status of the request, the + * buffer containing the data received and release the reference to this object from + * `AmData`. + * + * @param[in] request the UCP request associated to the active message receive operation. + * @param[in] status the completion status of the UCP request. + */ void callback(void* request, ucs_status_t status); }; @@ -46,13 +75,17 @@ typedef std::unordered_map> RecvAmMes class AmData { public: - std::shared_ptr _worker{nullptr}; - AmPoolType _recvPool{}; - AmPoolType _recvWait{}; - RecvAmMessageMapType _recvAmMessageMap{}; - std::mutex _mutex{}; - std::function)> _registerInflightRequest{}; - std::unordered_map _allocators{}; + std::shared_ptr _worker{ + nullptr}; ///< The worker to which the Active Message callback belongs + AmPoolType _recvPool{}; ///< The pool of completed receive requests (waiting for user request) + AmPoolType _recvWait{}; ///< The pool of user receive requests (waiting for message arrival) + RecvAmMessageMapType + _recvAmMessageMap{}; ///< The active messages waiting to be handled by callback + std::mutex _mutex{}; ///< Mutex to provide access to pools/maps + std::function)> + _registerInflightRequest{}; ///< Worker function to register inflight requests with + std::unordered_map + _allocators{}; ///< Default and user-defined active message allocators }; } // namespace internal diff --git a/cpp/include/ucxx/request.h b/cpp/include/ucxx/request.h index 87017668..763582a0 100644 --- a/cpp/include/ucxx/request.h +++ b/cpp/include/ucxx/request.h @@ -193,6 +193,14 @@ class Request : public Component { */ const std::string& getOwnerString() const; + /** + * @brief Get the received buffer. + * + * This method is used to get the received buffer for applicable derived classes (e.g., + * `RequestAM` receive operations), in all other cases this will return `nullptr`. + * + * @return The received buffer (if applicable) or `nullptr`. + */ virtual std::shared_ptr getRecvBuffer(); }; diff --git a/cpp/include/ucxx/request_am.h b/cpp/include/ucxx/request_am.h index 76994e94..171d6bf6 100644 --- a/cpp/include/ucxx/request_am.h +++ b/cpp/include/ucxx/request_am.h @@ -27,6 +27,28 @@ class RequestAM : public Request { ucs_memory_type_t _sendHeader{}; ///< The header to send std::shared_ptr _buffer{nullptr}; ///< The AM received message buffer + /** + * @brief Private constructor of `ucxx::RequestAM` send. + * + * This is the internal implementation of `ucxx::RequestAM` send constructor, made private + * not to be called directly. This constructor is made private to ensure all UCXX objects + * are shared pointers and the correct lifetime management of each one. + * + * Instead the user should use one of the following: + * + * - `ucxx::Endpoint::amSend()` + * - `ucxx::createRequestAMSend()` + * + * @throws ucxx::Error if `endpoint` is not a valid `std::shared_ptr`. + * + * @param[in] endpoint the parent endpoint. + * @param[in] buffer a raw pointer to the data to be sent. + * @param[in] length the size in bytes of the active message to be sent. + * @param[in] enablePythonFuture whether a python future should be created and + * subsequently notified. + * @param[in] callbackFunction user-defined callback function to call upon completion. + * @param[in] callbackData user-defined data to pass to the `callbackFunction`. + */ RequestAM(std::shared_ptr endpoint, void* buffer, size_t length, @@ -34,12 +56,58 @@ class RequestAM : public Request { RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr); + /** + * @brief Private constructor of `ucxx::RequestAM` receive. + * + * This is the internal implementation of `ucxx::RequestAM` receive constructor, made + * private not to be called directly. This constructor is made private to ensure all UCXX + * objects are shared pointers and the correct lifetime management of each one. + * + * Instead the user should use one of the following: + * + * - `ucxx::Endpoint::amRecv()` + * - `ucxx::createRequestAMRecv()` + * + * @throws ucxx::Error if `endpointOrWorker` is not a valid + * `std::shared_ptr` or + * `std::shared_ptr`. + * + * @param[in] endpointOrWorker the parent component, which may either be a + * `std::shared_ptr` or + * `std::shared_ptr`. + * @param[in] enablePythonFuture whether a python future should be created and + * subsequently notified. + * @param[in] callbackFunction user-defined callback function to call upon completion. + * @param[in] callbackData user-defined data to pass to the `callbackFunction`. + */ RequestAM(std::shared_ptr endpointOrWorker, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr); public: + /** + * @brief Constructor for `std::shared_ptr` send. + * + * The constructor for a `std::shared_ptr` object, creating a send active + * message request, returning a pointer to a request object that can be later awaited and + * checked for errors. This is a non-blocking operation, and the status of the transfer + * must be verified from the resulting request object before the data can be + * released. + * + * @throws ucxx::Error if `endpoint` is not a valid + * `std::shared_ptr`. + * + * @param[in] endpoint the parent endpoint. + * @param[in] buffer a raw pointer to the data to be transferred. + * @param[in] length the size in bytes of the tag message to be transferred. + * @param[in] enablePythonFuture whether a python future should be created and + * subsequently notified. + * @param[in] callbackFunction user-defined callback function to call upon completion. + * @param[in] callbackData user-defined data to pass to the `callbackFunction`. + * + * @returns The `shared_ptr` object + */ friend std::shared_ptr createRequestAMSend( std::shared_ptr endpoint, void* buffer, @@ -48,18 +116,72 @@ class RequestAM : public Request { RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); + /** + * @brief Constructor for `std::shared_ptr` send. + * + * The constructor for a `std::shared_ptr` object, creating a receive + * active message request, returning a pointer to a request object that can be later + * awaited and checked for errors. This is a non-blocking operation, and the status of + * the transfer must be verified from the resulting request object before the data can be + * consumed, the data is available via the `getRecvBuffer()` method if the transfer + * completed successfully. + * + * @throws ucxx::Error if `endpoint` is not a valid + * `std::shared_ptr`. + * + * @param[in] endpoint the parent endpoint. + * @param[in] enablePythonFuture whether a python future should be created and + * subsequently notified. + * @param[in] callbackFunction user-defined callback function to call upon completion. + * @param[in] callbackData user-defined data to pass to the `callbackFunction`. + * + * @returns The `shared_ptr` object + */ friend std::shared_ptr createRequestAMRecv( - std::shared_ptr endpointOrWorker, + std::shared_ptr endpoint, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); virtual void populateDelayedSubmission(); + /** + * @brief Create and submit an active message send request. + * + * This is the method that should be called to actually submit an active message send + * request. It is meant to be called from `populateDelayedSubmission()`, which is decided + * at the discretion of `std::shared_ptr`. See `populateDelayedSubmission()` + * for more details. + */ void request(); - static void amSendCallback(void* request, ucs_status_t status, void* user_data); - + /** + * @brief Receive callback registered by `ucxx::Worker`. + * + * This is the receive callback registered by the `ucxx::Worker` to handle incoming active + * messages. For each incoming active message, a proper buffer will be allocated based on + * the header sent by the remote endpoint using the default allocator or one registered by + * the user via `ucxx::Worker::registerAmAllocator()`. Following that, the message is + * immediately received onto the buffer and a `UCS_OK` or the proper error status is set + * onto the `RequestAM` that is returned to the user, or will be later handled by another + * callback when the message is ready. If the callback is executed when a user has already + * requested received of the active message, the buffer and status will be set on the + * earliest request, otherwise a new request is created and saved in a pool that will be + * already populated and ready for consumption or waiting for the internal callback when + * requested. + * + * This is always called by `ucp_worker_progress()`, and thus will happen in the same + * thread that is called from, when using the worker progress thread, this is called from + * the progress thread. + * + * param[in,out] arg pointer to the `AmData` object held by the `ucxx::Worker` who + * registered this callback. + * param[in] header pointer to the header containing the sender buffer's memory type. + * param[in] header_length length in bytes of the receive header. + * param[in] data pointer to the buffer containing the remote endpoint's send data. + * param[in] length the length in bytes of the message to be received. + * param[in] param UCP parameters of the active message being received. + */ static ucs_status_t recvCallback(void* arg, const void* header, size_t header_length, diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index f66787f0..48a4f34f 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -55,7 +55,7 @@ class Worker : public Component { nullptr}; ///< Collection of enqueued delayed submissions friend std::shared_ptr createRequestAMRecv( - std::shared_ptr endpointOrWorker, + std::shared_ptr endpoint, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); @@ -680,11 +680,11 @@ class Worker : public Component { * * Register a new allocator for active messages. By default, only one allocator is defined * for host memory (`UCS_MEMORY_TYPE_HOST`), and is used as a fallback when an allocator - * for the source's memory type is unavailable. In many certain circumstances relying - * exclusively on the host allocator is undesirable, for example when transferring CUDA - * buffers the destination is always going to be a host buffer and prevent the use of - * transports like NVLink or InfiniBand+GPUDirectRDMA. For that reason it's important that - * the user defines those allocators that are important for the application. + * for the source's memory type is unavailable. In many circumstances relying exclusively + * on the host allocator is undesirable, for example when transferring CUDA buffers the + * destination is always going to be a host buffer and prevent the use of transports such + * as NVLink or InfiniBand+GPUDirectRDMA. For that reason it's important that the user + * defines those allocators that are important for the application. * * If the `memoryType` has already been registered, the previous allocator will be * replaced by the new one. Be careful when doing this after transfers have started, there diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index e026b813..028c4307 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -10,7 +10,7 @@ #include #include -#include +#include #include namespace ucxx { @@ -28,17 +28,16 @@ std::shared_ptr createRequestAMSend( } std::shared_ptr createRequestAMRecv( - std::shared_ptr endpointOrWorker, + std::shared_ptr endpoint, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - auto endpoint = std::dynamic_pointer_cast(endpointOrWorker); - auto worker = endpoint->getWorker(endpoint->getParent()); + auto worker = endpoint->getWorker(endpoint->getParent()); - auto createRequest = [endpointOrWorker, enablePythonFuture, callbackFunction, callbackData]() { + auto createRequest = [endpoint, enablePythonFuture, callbackFunction, callbackData]() { return std::shared_ptr( - new RequestAM(endpointOrWorker, enablePythonFuture, callbackFunction, callbackData)); + new RequestAM(endpoint, enablePythonFuture, callbackFunction, callbackData)); }; return worker->getAmRecv(endpoint->getHandle(), createRequest); } @@ -54,8 +53,6 @@ RequestAM::RequestAM(std::shared_ptr endpoint, std::string("amSend"), enablePythonFuture) { - if (_endpoint == nullptr) throw ucxx::Error("An endpoint is required to send active messages"); - _callback = callbackFunction; _callbackData = callbackData; @@ -76,10 +73,10 @@ RequestAM::RequestAM(std::shared_ptr endpointOrWorker, _callbackData = callbackData; } -void RequestAM::amSendCallback(void* request, ucs_status_t status, void* user_data) +static void _amSendCallback(void* request, ucs_status_t status, void* user_data) { Request* req = reinterpret_cast(user_data); - ucxx_trace_req_f(req->getOwnerString().c_str(), request, "amSend", "amSendCallback"); + ucxx_trace_req_f(req->getOwnerString().c_str(), request, "amSend", "_amSendCallback"); req->callback(request, status); } @@ -101,7 +98,7 @@ ucs_status_t RequestAM::recvCallback(void* arg, size_t length, const ucp_am_recv_param_t* param) { - internal::AMData* amData = reinterpret_cast(arg); + internal::AmData* amData = reinterpret_cast(arg); auto worker = amData->_worker; auto& recvPool = amData->_recvPool; auto& recvWait = amData->_recvWait; @@ -204,7 +201,7 @@ void RequestAM::request() _sendHeader = UCS_MEMORY_TYPE_HOST; if (_delayedSubmission->_send) { - param.cb.send = amSendCallback; + param.cb.send = _amSendCallback; _request = ucp_am_send_nbx(_endpoint->getHandle(), 0, &_sendHeader, From d771d05fb3ed602ccbb9d8593d22510c30e8b044 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 16 May 2023 09:48:12 -0700 Subject: [PATCH 07/34] Use `shared_ptr` for `AmData::_worker`` --- cpp/include/ucxx/internal/request_am.h | 3 +-- cpp/src/request_am.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/ucxx/internal/request_am.h b/cpp/include/ucxx/internal/request_am.h index 9b18faee..fc00427f 100644 --- a/cpp/include/ucxx/internal/request_am.h +++ b/cpp/include/ucxx/internal/request_am.h @@ -75,8 +75,7 @@ typedef std::unordered_map> RecvAmMes class AmData { public: - std::shared_ptr _worker{ - nullptr}; ///< The worker to which the Active Message callback belongs + std::weak_ptr _worker{}; ///< The worker to which the Active Message callback belongs AmPoolType _recvPool{}; ///< The pool of completed receive requests (waiting for user request) AmPoolType _recvWait{}; ///< The pool of user receive requests (waiting for message arrival) RecvAmMessageMapType diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 028c4307..995e6f96 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -99,7 +99,7 @@ ucs_status_t RequestAM::recvCallback(void* arg, const ucp_am_recv_param_t* param) { internal::AmData* amData = reinterpret_cast(arg); - auto worker = amData->_worker; + auto worker = amData->_worker.lock(); auto& recvPool = amData->_recvPool; auto& recvWait = amData->_recvWait; From 6bcb5d47f770482ff8066e5be6cdece6c9b286bc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 May 2023 04:12:07 -0700 Subject: [PATCH 08/34] Add Python interface for Active Messages --- cpp/python/src/worker.cpp | 11 ++- python/ucxx/_lib/libucxx.pyx | 44 +++++++++++ python/ucxx/_lib/tests/test_server_client.py | 25 +++++-- python/ucxx/_lib/ucxx_api.pxd | 7 ++ python/ucxx/_lib_async/endpoint.py | 77 +++++++++++++++++++- 5 files changed, 155 insertions(+), 9 deletions(-) diff --git a/cpp/python/src/worker.cpp b/cpp/python/src/worker.cpp index c62fbafd..41fe1b32 100644 --- a/cpp/python/src/worker.cpp +++ b/cpp/python/src/worker.cpp @@ -9,6 +9,7 @@ #include +#include #include #include #include @@ -31,8 +32,14 @@ std::shared_ptr<::ucxx::Worker> createWorker(std::shared_ptr context, const bool enableDelayedSubmission, const bool enableFuture) { - return std::shared_ptr<::ucxx::Worker>( - new ::ucxx::python::Worker(context, enableDelayedSubmission, enableFuture)); + auto worker = std::shared_ptr<::ucxx::python::Worker>( + new ::ucxx::python::Worker(context, enableDelayedSubmission)); + + // We can only get a `shared_ptr` for the Active Messages callback after it's + // been created, thus this cannot be in the constructor. + if (worker->_amData != nullptr) worker->_amData->_worker = worker; + + return worker; } void Worker::populateFuturesPool() diff --git a/python/ucxx/_lib/libucxx.pyx b/python/ucxx/_lib/libucxx.pyx index d4b010ec..17fd0544 100644 --- a/python/ucxx/_lib/libucxx.pyx +++ b/python/ucxx/_lib/libucxx.pyx @@ -652,6 +652,22 @@ cdef class UCXRequest(): else: await self.wait_yield() + def get_recv_buffer(self): + cdef shared_ptr[Buffer] buf + cdef BufferType bufType + + with nogil: + buf = self._request.get().getRecvBuffer() + bufType = buf.get().getType() + + # If buf == NULL, it's not allocated by the request but rather the user + if buf == NULL: + return None + elif bufType == BufferType.RMM: + return _get_rmm_buffer(buf.get()) + else: + return _get_host_buffer(buf.get()) + cdef class UCXBufferRequest: cdef: @@ -930,6 +946,34 @@ cdef class UCXEndpoint(): with nogil: self._endpoint.get().close() + def am_send(self, Array arr): + cdef void* buf = arr.ptr + cdef size_t nbytes = arr.nbytes + cdef shared_ptr[Request] req + + if not self._context_feature_flags & Feature.AM.value: + raise ValueError("UCXContext must be created with `Feature.AM`") + + with nogil: + req = self._endpoint.get().amSend( + buf, + nbytes, + self._enable_python_future + ) + + return UCXRequest(&req, self._enable_python_future) + + def am_recv(self): + cdef shared_ptr[Request] req + + if not self._context_feature_flags & Feature.AM.value: + raise ValueError("UCXContext must be created with `Feature.AM`") + + with nogil: + req = self._endpoint.get().amRecv(self._enable_python_future) + + return UCXRequest(&req, self._enable_python_future) + def stream_send(self, Array arr): cdef void* buf = arr.ptr cdef size_t nbytes = arr.nbytes diff --git a/python/ucxx/_lib/tests/test_server_client.py b/python/ucxx/_lib/tests/test_server_client.py index d7cbcf52..79723723 100644 --- a/python/ucxx/_lib/tests/test_server_client.py +++ b/python/ucxx/_lib/tests/test_server_client.py @@ -17,14 +17,18 @@ def _send(ep, api, message): - if api == "stream": + if api == "am": + return ep.am_send(message) + elif api == "stream": return ep.stream_send(message) else: return ep.tag_send(message, tag=0) def _recv(ep, api, message): - if api == "stream": + if api == "am": + return ep.am_recv() + elif api == "stream": return ep.stream_recv(message) else: return ep.tag_recv(message, tag=0) @@ -38,7 +42,9 @@ def _echo_server(get_queue, put_queue, transfer_api, msg_size, progress_mode): outside of the callback function. """ feature_flags = [ucx_api.Feature.WAKEUP] - if transfer_api == "stream": + if transfer_api == "am": + feature_flags.append(ucx_api.Feature.AM) + elif transfer_api == "stream": feature_flags.append(ucx_api.Feature.STREAM) else: feature_flags.append(ucx_api.Feature.TAG) @@ -78,6 +84,8 @@ def _listener_handler(conn_request): # it back again. requests = [_recv(ep[0], transfer_api, msg)] wait_requests(worker, progress_mode, requests) + if transfer_api == "am": + msg = Array(requests[0].get_recv_buffer()) requests = [_send(ep[0], transfer_api, msg)] wait_requests(worker, progress_mode, requests) @@ -92,6 +100,8 @@ def _listener_handler(conn_request): def _echo_client(transfer_api, msg_size, progress_mode, port): feature_flags = [ucx_api.Feature.WAKEUP] + if transfer_api == "am": + feature_flags.append(ucx_api.Feature.AM) if transfer_api == "stream": feature_flags.append(ucx_api.Feature.STREAM) else: @@ -127,10 +137,15 @@ def _echo_client(transfer_api, msg_size, progress_mode, port): ] wait_requests(worker, progress_mode, requests) - assert recv_msg == send_msg + if transfer_api == "am": + recv_msg = requests[1].get_recv_buffer() + + assert bytes(recv_msg) == send_msg + else: + assert recv_msg == send_msg -@pytest.mark.parametrize("transfer_api", ["stream", "tag"]) +@pytest.mark.parametrize("transfer_api", ["am", "stream", "tag"]) @pytest.mark.parametrize("msg_size", [10, 2**24]) @pytest.mark.parametrize("progress_mode", ["blocking", "thread"]) def test_server_client(transfer_api, msg_size, progress_mode): diff --git a/python/ucxx/_lib/ucxx_api.pxd b/python/ucxx/_lib/ucxx_api.pxd index 1c4918fe..6f8c0046 100644 --- a/python/ucxx/_lib/ucxx_api.pxd +++ b/python/ucxx/_lib/ucxx_api.pxd @@ -244,6 +244,12 @@ cdef extern from "" namespace "ucxx" nogil: cdef cppclass Endpoint(Component): ucp_ep_h getHandle() void close() + shared_ptr[Request] amSend( + void* buffer, size_t length, bint enable_python_future + ) except +raise_py_error + shared_ptr[Request] amRecv( + bint enable_python_future + ) except +raise_py_error shared_ptr[Request] streamSend( void* buffer, size_t length, bint enable_python_future ) except +raise_py_error @@ -289,6 +295,7 @@ cdef extern from "" namespace "ucxx" nogil: ucs_status_t getStatus() void checkError() except +raise_py_error void* getFuture() except +raise_py_error + shared_ptr[Buffer] getRecvBuffer() except +raise_py_error cdef extern from "" namespace "ucxx" nogil: diff --git a/python/ucxx/_lib_async/endpoint.py b/python/ucxx/_lib_async/endpoint.py index e224f9c4..c7e00505 100644 --- a/python/ucxx/_lib_async/endpoint.py +++ b/python/ucxx/_lib_async/endpoint.py @@ -85,6 +85,43 @@ async def close(self): await asyncio.sleep(0) self.abort() + async def am_send(self, buffer): + """Send `buffer` to connected peer via active messages. + + Parameters + ---------- + buffer: exposing the buffer protocol or array/cuda interface + The buffer to send. Raise ValueError if buffer is smaller + than nbytes. + """ + self._ep.raise_on_error() + if self.closed(): + raise UCXCloseError("Endpoint closed") + if not isinstance(buffer, Array): + buffer = Array(buffer) + + # Optimization to eliminate producing logger string overhead + if logger.isEnabledFor(logging.DEBUG): + nbytes = buffer.nbytes + log = "[AM Send #%03d] ep: %s, nbytes: %d, type: %s" % ( + self._send_count, + hex(self.uid), + nbytes, + type(buffer.obj), + ) + logger.debug(log) + + self._send_count += 1 + + try: + request = self._ep.am_send(buffer) + return await request.wait() + except UCXCanceled as e: + # If self._ep has already been closed and destroyed, we reraise the + # UCXCanceled exception. + if self._ep is None: + raise e + # @ucx_api.nvtx_annotate("UCXPY_SEND", color="green", domain="ucxpy") async def send(self, buffer, tag=None, force_tag=False): """Send `buffer` to connected peer. @@ -94,7 +131,6 @@ async def send(self, buffer, tag=None, force_tag=False): buffer: exposing the buffer protocol or array/cuda interface The buffer to send. Raise ValueError if buffer is smaller than nbytes. - tag: hashable, optional tag: hashable, optional Set a tag that the receiver must match. Currently the tag is hashed together with the internal Endpoint tag that is @@ -146,7 +182,6 @@ async def send_multi(self, buffers, tag=None, force_tag=False): buffer: exposing the buffer protocol or array/cuda interface The buffer to send. Raise ValueError if buffer is smaller than nbytes. - tag: hashable, optional tag: hashable, optional Set a tag that the receiver must match. Currently the tag is hashed together with the internal Endpoint tag that is @@ -214,6 +249,44 @@ async def send_obj(self, obj, tag=None): await self.send(nbytes, tag=tag) await self.send(obj, tag=tag) + async def am_recv(self): + """Receive from connected peer via active messages.""" + # if not self._ctx.worker.tag_probe(tag): + # self._ep.raise_on_error() + # if self.closed(): + # raise UCXCloseError("Endpoint closed") + + # Optimization to eliminate producing logger string overhead + if logger.isEnabledFor(logging.DEBUG): + log = "[AM Recv #%03d] ep: %s" % ( + self._recv_count, + hex(self.uid), + ) + logger.debug(log) + + self._recv_count += 1 + + req = self._ep.am_recv() + await req.wait() + buffer = req.get_recv_buffer() + + if logger.isEnabledFor(logging.DEBUG): + log = "[AM Recv Completed #%03d] ep: %s, nbytes: %d, type: %s" % ( + self._recv_count, + hex(self.uid), + buffer.nbytes, + type(buffer), + ) + logger.debug(log) + + self._finished_recv_count += 1 + if ( + self._close_after_n_recv is not None + and self._finished_recv_count >= self._close_after_n_recv + ): + self.abort() + return buffer + # @ucx_api.nvtx_annotate("UCXPY_RECV", color="red", domain="ucxpy") async def recv(self, buffer, tag=None, force_tag=False): """Receive from connected peer into `buffer`. From d6d03c570241c7c6149bb8e095df10c40e411e25 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 May 2023 05:26:35 -0700 Subject: [PATCH 09/34] Add `ucxx::Worker::amProbe` --- cpp/include/ucxx/worker.h | 23 ++++++++++++++++++++++- cpp/src/worker.cpp | 7 ++++++- cpp/tests/worker.cpp | 20 ++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index 48a4f34f..22a6a1c5 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -550,7 +550,7 @@ class Worker : public Component { * * @returns `true` if any uncaught messages were received, `false` otherwise. */ - bool tagProbe(ucp_tag_t tag); + bool tagProbe(const ucp_tag_t tag) const; /** * @brief Enqueue a tag receive operation. @@ -703,6 +703,27 @@ class Worker : public Component { * active message buffers. */ void registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType allocator); + + /** + * @brief Check for uncaught active messages. + * + * Checks the worker for any uncaught active messages. An uncaught active message is any + * active message that has been fully or partially received by the worker, but not matched + * by a corresponding `createRequestAMRecv()` call. + * + * @code{.cpp} + * // `worker` is `std::shared_ptr` + * // `ep` is a remote `std::shared_ptramProbe(ep->getHandle())); + * + * ep->amSend(buffer, length); + * + * assert(worker->amProbe(0)); + * @endcode + * + * @returns `true` if any uncaught messages were received, `false` otherwise. + */ + bool amProbe(const ucp_ep_h endpointHandle) const; }; } // namespace ucxx diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index 89fa133c..6bc0a4a0 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -360,7 +360,7 @@ void Worker::removeInflightRequest(const Request* const request) } } -bool Worker::tagProbe(ucp_tag_t tag) +bool Worker::tagProbe(const ucp_tag_t tag) const { ucp_tag_recv_info_t info; ucp_tag_message_h tag_message = ucp_tag_probe_nb(_handle, tag, -1, 0, &info); @@ -420,4 +420,9 @@ void Worker::registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType a _amData->_allocators.insert_or_assign(memoryType, allocator); } +bool Worker::amProbe(const ucp_ep_h endpointHandle) const +{ + return _amData->_recvPool.find(endpointHandle) != _amData->_recvPool.end(); +} + } // namespace ucxx diff --git a/cpp/tests/worker.cpp b/cpp/tests/worker.cpp index c008fb86..9424fe67 100644 --- a/cpp/tests/worker.cpp +++ b/cpp/tests/worker.cpp @@ -72,6 +72,26 @@ TEST_F(WorkerTest, TagProbe) ASSERT_TRUE(_worker->tagProbe(0)); } +TEST_F(WorkerTest, AmProbe) +{ + auto progressWorker = getProgressFunction(_worker, ProgressMode::Polling); + auto ep = _worker->createEndpointFromWorkerAddress(_worker->getAddress()); + + ASSERT_FALSE(_worker->amProbe(ep->getHandle())); + + std::vector buf{123}; + std::vector> requests; + requests.push_back(ep->amSend(buf.data(), buf.size() * sizeof(int))); + waitRequests(_worker, requests, progressWorker); + + // Attempt to progress worker 10 times (arbitrarily defined). + // TODO: Maybe a timeout would fit best. + for (size_t i = 0; i < 10 && !_worker->tagProbe(0); ++i) + progressWorker(); + + ASSERT_TRUE(_worker->amProbe(ep->getHandle())); +} + TEST_P(WorkerProgressTest, ProgressAm) { if (_progressMode == ProgressMode::Wait) { From 89ed4f2a34499f86535f743624a600e3fb7cf041 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 May 2023 07:26:51 -0700 Subject: [PATCH 10/34] Add Python `amProbe` support --- python/ucxx/_lib/libucxx.pyx | 14 +++++ python/ucxx/_lib/tests/test_probe.py | 60 ++++++++++++++++------ python/ucxx/_lib/ucxx_api.pxd | 4 +- python/ucxx/_lib_async/endpoint.py | 8 +-- python/ucxx/_lib_async/tests/test_probe.py | 5 +- 5 files changed, 65 insertions(+), 26 deletions(-) diff --git a/python/ucxx/_lib/libucxx.pyx b/python/ucxx/_lib/libucxx.pyx index 17fd0544..2f4b6bec 100644 --- a/python/ucxx/_lib/libucxx.pyx +++ b/python/ucxx/_lib/libucxx.pyx @@ -946,6 +946,20 @@ cdef class UCXEndpoint(): with nogil: self._endpoint.get().close() + def am_probe(self): + cdef ucp_ep_h handle + cdef shared_ptr[Component] parent + cdef shared_ptr[Worker] worker + cdef bint ep_matched + + with nogil: + handle = self._endpoint.get().getHandle() + parent = self._endpoint.get().getParent() + worker = self._endpoint.get().getWorker(parent) + ep_matched = worker.get().amProbe(handle) + + return ep_matched + def am_send(self, Array arr): cdef void* buf = arr.ptr cdef size_t nbytes = arr.nbytes diff --git a/python/ucxx/_lib/tests/test_probe.py b/python/ucxx/_lib/tests/test_probe.py index 5cb74223..929720e4 100644 --- a/python/ucxx/_lib/tests/test_probe.py +++ b/python/ucxx/_lib/tests/test_probe.py @@ -3,6 +3,7 @@ import multiprocessing as mp +import pytest from ucxx._lib import libucxx as ucx_api from ucxx._lib.arr import Array from ucxx.testing import terminate_process, wait_requests @@ -13,14 +14,17 @@ DataMessage = bytearray(b"0" * 10) -def _server_probe(queue): +def _server_probe(queue, transfer_api): """Server that probes and receives message after client disconnected. Note that since it is illegal to call progress() in callback functions, we keep a reference to the endpoint after the listener callback has terminated, this way we can progress even after Python blocking calls. """ - ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) + feature_flags = ( + ucx_api.Feature.AM if transfer_api == "am" else ucx_api.Feature.TAG, + ) + ctx = ucx_api.UCXContext(feature_flags=feature_flags) worker = ucx_api.UCXWorker(ctx) # Keep endpoint to be used from outside the listener callback @@ -42,8 +46,13 @@ def _listener_handler(conn_request): ep = ep[0] # Ensure wireup and inform client before it can disconnect - wireup = bytearray(len(WireupMessage)) - wait_requests(worker, "blocking", ep.tag_recv(Array(wireup), tag=0)) + if transfer_api == "am": + wireup_req = ep.am_recv() + wait_requests(worker, "blocking", wireup_req) + wireup = bytes(wireup_req.get_recv_buffer()) + else: + wireup = bytearray(len(WireupMessage)) + wait_requests(worker, "blocking", ep.tag_recv(Array(wireup), tag=0)) queue.put("wireup completed") # Ensure client has disconnected -- endpoint is not alive anymore @@ -51,17 +60,27 @@ def _listener_handler(conn_request): worker.progress() # Probe/receive message even after the remote endpoint has disconnected - while worker.tag_probe(0) is False: - worker.progress() - received = bytearray(len(DataMessage)) - wait_requests(worker, "blocking", ep.tag_recv(Array(received), tag=0)) + if transfer_api == "am": + while ep.am_probe() is False: + worker.progress() + recv_req = ep.am_recv() + wait_requests(worker, "blocking", recv_req) + received = bytes(recv_req.get_recv_buffer()) + else: + while worker.tag_probe(0) is False: + worker.progress() + received = bytearray(len(DataMessage)) + wait_requests(worker, "blocking", ep.tag_recv(Array(received), tag=0)) assert wireup == WireupMessage assert received == DataMessage -def _client_probe(queue): - ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) +def _client_probe(queue, transfer_api): + feature_flags = ( + ucx_api.Feature.AM if transfer_api == "am" else ucx_api.Feature.TAG, + ) + ctx = ucx_api.UCXContext(feature_flags=feature_flags) worker = ucx_api.UCXWorker(ctx) port = queue.get() ep = ucx_api.UCXEndpoint.create( @@ -71,21 +90,28 @@ def _client_probe(queue): endpoint_error_handling=True, ) - requests = [ - ep.tag_send(Array(WireupMessage), tag=0), - ep.tag_send(Array(DataMessage), tag=0), - ] + if transfer_api == "am": + requests = [ + ep.am_send(Array(WireupMessage)), + ep.am_send(Array(DataMessage)), + ] + else: + requests = [ + ep.tag_send(Array(WireupMessage), tag=0), + ep.tag_send(Array(DataMessage), tag=0), + ] wait_requests(worker, "blocking", requests) # Wait for wireup before disconnecting assert queue.get() == "wireup completed" -def test_message_probe(): +@pytest.mark.parametrize("transfer_api", ["am", "tag"]) +def test_message_probe(transfer_api): queue = mp.Queue() - server = mp.Process(target=_server_probe, args=(queue,)) + server = mp.Process(target=_server_probe, args=(queue, transfer_api)) server.start() - client = mp.Process(target=_client_probe, args=(queue,)) + client = mp.Process(target=_client_probe, args=(queue, transfer_api)) client.start() client.join(timeout=10) server.join(timeout=10) diff --git a/python/ucxx/_lib/ucxx_api.pxd b/python/ucxx/_lib/ucxx_api.pxd index 6f8c0046..e8a6e6d2 100644 --- a/python/ucxx/_lib/ucxx_api.pxd +++ b/python/ucxx/_lib/ucxx_api.pxd @@ -226,7 +226,7 @@ cdef extern from "" namespace "ucxx" nogil: ) except +raise_py_error void stopProgressThread() except +raise_py_error size_t cancelInflightRequests() except +raise_py_error - bint tagProbe(ucp_tag_t) + bint tagProbe(ucp_tag_t) const void setProgressThreadStartCallback( function[void(void*)] callback, void* callbackArg ) @@ -240,6 +240,7 @@ cdef extern from "" namespace "ucxx" nogil: void* buffer, size_t length, ucp_tag_t tag, bint enable_python_future ) except +raise_py_error bint isFutureEnabled() const + bint amProbe(ucp_ep_h) const cdef cppclass Endpoint(Component): ucp_ep_h getHandle() @@ -277,6 +278,7 @@ cdef extern from "" namespace "ucxx" nogil: void setCloseCallback( function[void(void*)] close_callback, void* close_callback_arg ) + shared_ptr[Worker] getWorker(shared_ptr[Component] worker_or_listener) cdef cppclass Listener(Component): shared_ptr[Endpoint] createEndpointFromConnRequest( diff --git a/python/ucxx/_lib_async/endpoint.py b/python/ucxx/_lib_async/endpoint.py index c7e00505..1dff0c78 100644 --- a/python/ucxx/_lib_async/endpoint.py +++ b/python/ucxx/_lib_async/endpoint.py @@ -251,10 +251,10 @@ async def send_obj(self, obj, tag=None): async def am_recv(self): """Receive from connected peer via active messages.""" - # if not self._ctx.worker.tag_probe(tag): - # self._ep.raise_on_error() - # if self.closed(): - # raise UCXCloseError("Endpoint closed") + if not self._ep.am_probe(): + self._ep.raise_on_error() + if self.closed(): + raise UCXCloseError("Endpoint closed") # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): diff --git a/python/ucxx/_lib_async/tests/test_probe.py b/python/ucxx/_lib_async/tests/test_probe.py index a6e524b6..c56d06cc 100644 --- a/python/ucxx/_lib_async/tests/test_probe.py +++ b/python/ucxx/_lib_async/tests/test_probe.py @@ -12,9 +12,6 @@ @pytest.mark.parametrize("transfer_api", ["am", "tag"]) @pytest.mark.xfail(reason="https://github.com/rapidsai/ucxx/issues/19") async def test_message_probe(transfer_api): - if transfer_api == "am": - pytest.skip("AM not implemented yet") - msg = bytearray(b"0" * 10) async def server_node(ep): @@ -25,7 +22,7 @@ async def server_node(ep): if transfer_api == "am": assert ep._ep.am_probe() is True - received = await ep.am_recv() + received = bytes(await ep.am_recv()) else: assert ep._ctx.worker.tag_probe(ep._tags["msg_recv"]) is True received = bytearray(10) From 09040e7493b872023a95f48976d0a82957ea3394 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 May 2023 09:45:13 -0700 Subject: [PATCH 11/34] Add different memory types support for Active Messages --- cpp/include/ucxx/constructors.h | 1 + cpp/include/ucxx/delayed_submission.h | 17 +++++++--- cpp/include/ucxx/endpoint.h | 2 ++ cpp/include/ucxx/request_am.h | 4 +++ cpp/src/delayed_submission.cpp | 5 +-- cpp/src/endpoint.cpp | 3 +- cpp/src/request_am.cpp | 31 ++++++++++------- cpp/tests/request.cpp | 48 +++++++++++++++++++++------ cpp/tests/worker.cpp | 4 +-- 9 files changed, 82 insertions(+), 33 deletions(-) diff --git a/cpp/include/ucxx/constructors.h b/cpp/include/ucxx/constructors.h index 5dd0dd80..4596e322 100644 --- a/cpp/include/ucxx/constructors.h +++ b/cpp/include/ucxx/constructors.h @@ -57,6 +57,7 @@ std::shared_ptr createWorker(std::shared_ptr context, std::shared_ptr createRequestAMSend(std::shared_ptr endpoint, void* buffer, size_t length, + ucs_memory_type_t memoryType, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); diff --git a/cpp/include/ucxx/delayed_submission.h b/cpp/include/ucxx/delayed_submission.h index 9b6da2be..a65e428b 100644 --- a/cpp/include/ucxx/delayed_submission.h +++ b/cpp/include/ucxx/delayed_submission.h @@ -25,6 +25,7 @@ class DelayedSubmission { void* _buffer{nullptr}; ///< Raw pointer to data buffer size_t _length{0}; ///< Length of the message in bytes ucp_tag_t _tag{0}; ///< Tag to match + ucs_memory_type_t _memoryType{UCS_MEMORY_TYPE_UNKNOWN}; ///< Buffer memory type DelayedSubmission() = delete; @@ -42,12 +43,18 @@ class DelayedSubmission { * a multi-threaded application for blocking while waiting for the UCX spinlock, since * all transfer operations may be pushed to the worker progress thread. * - * @param[in] send whether this is a send (`true`) or receive (`false`) operation. - * @param[in] buffer a raw pointer to the data being transferred. - * @param[in] length the size in bytes of the message being transfer. - * @param[in] tag tag to match for this operation (only applies for tag operations). + * @param[in] send whether this is a send (`true`) or receive (`false`) operation. + * @param[in] buffer a raw pointer to the data being transferred. + * @param[in] length the size in bytes of the message being transfer. + * @param[in] tag tag to match for this operation (only applies for tag + * operations). + * @param[in] memoryType the memory type of the buffer. */ - DelayedSubmission(const bool send, void* buffer, const size_t length, const ucp_tag_t tag = 0); + DelayedSubmission(const bool send, + void* buffer, + const size_t length, + const ucp_tag_t tag = 0, + const ucs_memory_type_t memoryType = UCS_MEMORY_TYPE_UNKNOWN); }; class DelayedSubmissionCollection { diff --git a/cpp/include/ucxx/endpoint.h b/cpp/include/ucxx/endpoint.h index 91feea9b..b93d9dc4 100644 --- a/cpp/include/ucxx/endpoint.h +++ b/cpp/include/ucxx/endpoint.h @@ -267,6 +267,7 @@ class Endpoint : public Component { * * @param[in] buffer a raw pointer to the data to be sent. * @param[in] length the size in bytes of the tag message to be sent. + * @param[in] memoryType the memory type of the buffer. * @param[in] enablePythonFuture whether a python future should be created and * subsequently notified. * @param[in] callbackFunction user-defined callback function to call upon completion. @@ -276,6 +277,7 @@ class Endpoint : public Component { */ std::shared_ptr amSend(void* buffer, size_t length, + ucs_memory_type_t memoryType, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr); diff --git a/cpp/include/ucxx/request_am.h b/cpp/include/ucxx/request_am.h index 171d6bf6..cb61cb05 100644 --- a/cpp/include/ucxx/request_am.h +++ b/cpp/include/ucxx/request_am.h @@ -44,6 +44,7 @@ class RequestAM : public Request { * @param[in] endpoint the parent endpoint. * @param[in] buffer a raw pointer to the data to be sent. * @param[in] length the size in bytes of the active message to be sent. + * @param[in] memoryType the memory type of the buffer. * @param[in] enablePythonFuture whether a python future should be created and * subsequently notified. * @param[in] callbackFunction user-defined callback function to call upon completion. @@ -52,6 +53,7 @@ class RequestAM : public Request { RequestAM(std::shared_ptr endpoint, void* buffer, size_t length, + ucs_memory_type_t memoryType, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr); @@ -101,6 +103,7 @@ class RequestAM : public Request { * @param[in] endpoint the parent endpoint. * @param[in] buffer a raw pointer to the data to be transferred. * @param[in] length the size in bytes of the tag message to be transferred. + * @param[in] memoryType the memory type of the buffer. * @param[in] enablePythonFuture whether a python future should be created and * subsequently notified. * @param[in] callbackFunction user-defined callback function to call upon completion. @@ -112,6 +115,7 @@ class RequestAM : public Request { std::shared_ptr endpoint, void* buffer, size_t length, + ucs_memory_type_t memoryType, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); diff --git a/cpp/src/delayed_submission.cpp b/cpp/src/delayed_submission.cpp index f7eb57c4..b809a0d7 100644 --- a/cpp/src/delayed_submission.cpp +++ b/cpp/src/delayed_submission.cpp @@ -16,8 +16,9 @@ namespace ucxx { DelayedSubmission::DelayedSubmission(const bool send, void* buffer, const size_t length, - const ucp_tag_t tag) - : _send(send), _buffer(buffer), _length(length), _tag(tag) + const ucp_tag_t tag, + const ucs_memory_type_t memoryType) + : _send(send), _buffer(buffer), _length(length), _tag(tag), _memoryType(memoryType) { } diff --git a/cpp/src/endpoint.cpp b/cpp/src/endpoint.cpp index 8c5b486b..2b4d1733 100644 --- a/cpp/src/endpoint.cpp +++ b/cpp/src/endpoint.cpp @@ -194,13 +194,14 @@ size_t Endpoint::cancelInflightRequests() { return _inflightRequests->cancelAll( std::shared_ptr Endpoint::amSend(void* buffer, size_t length, + ucs_memory_type_t memoryType, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData) { auto endpoint = std::dynamic_pointer_cast(shared_from_this()); return registerInflightRequest(createRequestAMSend( - endpoint, buffer, length, enablePythonFuture, callbackFunction, callbackData)); + endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); } std::shared_ptr Endpoint::amRecv(const bool enablePythonFuture, diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 995e6f96..5fc86a71 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -19,12 +19,13 @@ std::shared_ptr createRequestAMSend( std::shared_ptr endpoint, void* buffer, size_t length, + ucs_memory_type_t memoryType = UCS_MEMORY_TYPE_HOST, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - return std::shared_ptr( - new RequestAM(endpoint, buffer, length, enablePythonFuture, callbackFunction, callbackData)); + return std::shared_ptr(new RequestAM( + endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); } std::shared_ptr createRequestAMRecv( @@ -45,11 +46,12 @@ std::shared_ptr createRequestAMRecv( RequestAM::RequestAM(std::shared_ptr endpoint, void* buffer, size_t length, + ucs_memory_type_t memoryType, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData) : Request(endpoint, - std::make_shared(true, buffer, length), + std::make_shared(true, buffer, length, 0, memoryType), std::string("amSend"), enablePythonFuture) { @@ -132,15 +134,20 @@ ucs_status_t RequestAM::recvCallback(void* arg, } if (is_rndv) { - if (allocatorType == UCS_MEMORY_TYPE_HOST || allocatorType == UCS_MEMORY_TYPE_CUDA) { - buf = amData->_allocators.at(allocatorType)(length); - } else { - ucxx_debug("Unsupported memory type %d", allocatorType); - internal::RecvAmMessage recvAmMessage(amData, ep, req, nullptr); - recvAmMessage.callback(nullptr, UCS_ERR_UNSUPPORTED); - return UCS_ERR_UNSUPPORTED; + if (amData->_allocators.find(allocatorType) == amData->_allocators.end()) { + // TODO: Is a hard failure better? + // ucxx_debug("Unsupported memory type %d", allocatorType); + // internal::RecvAmMessage recvAmMessage(amData, ep, req, nullptr); + // recvAmMessage.callback(nullptr, UCS_ERR_UNSUPPORTED); + // return UCS_ERR_UNSUPPORTED; + + ucxx_trace_req("No allocator registered for memory type %d, falling back to host memory.", + allocatorType); + allocatorType = UCS_MEMORY_TYPE_HOST; } + buf = amData->_allocators.at(allocatorType)(length); + auto recvAmMessage = std::make_shared(amData, ep, req, buf); ucp_request_param_t request_param = {.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | @@ -197,8 +204,8 @@ void RequestAM::request() .datatype = ucp_dt_make_contig(1), .user_data = this}; - // TODO: add CUDA support - _sendHeader = UCS_MEMORY_TYPE_HOST; + _sendHeader = _delayedSubmission->_memoryType; + ucxx_debug("RequestAM::request memtype: %d", _sendHeader); if (_delayedSubmission->_send) { param.cb.send = _amSendCallback; diff --git a/cpp/tests/request.cpp b/cpp/tests/request.cpp index 2072773d..2aafde20 100644 --- a/cpp/tests/request.cpp +++ b/cpp/tests/request.cpp @@ -21,20 +21,22 @@ using ::testing::Combine; using ::testing::ContainerEq; using ::testing::Values; -class RequestTest - : public ::testing::TestWithParam> { +class RequestTest : public ::testing::TestWithParam< + std::tuple> { protected: - std::shared_ptr _context{ - ucxx::createContext({}, ucxx::Context::defaultFeatureFlags)}; + std::shared_ptr _context{nullptr}; std::shared_ptr _worker{nullptr}; std::shared_ptr _ep{nullptr}; std::function _progressWorker; ucxx::BufferType _bufferType; + ucs_memory_type_t _memoryType; + bool _registerCustomAmAllocator; bool _enableDelayedSubmission; ProgressMode _progressMode; size_t _messageLength; size_t _messageSize; + size_t _rndvThresh{8192}; size_t _numBuffers{0}; std::vector> _send; @@ -52,10 +54,18 @@ class RequestTest #endif } - std::tie(_bufferType, _enableDelayedSubmission, _progressMode, _messageLength) = GetParam(); + std::tie(_bufferType, + _registerCustomAmAllocator, + _enableDelayedSubmission, + _progressMode, + _messageLength) = GetParam(); + _memoryType = + (_bufferType == ucxx::BufferType::RMM) ? UCS_MEMORY_TYPE_CUDA : UCS_MEMORY_TYPE_HOST; _messageSize = _messageLength * sizeof(int); - _worker = _context->createWorker(_enableDelayedSubmission); + _context = ucxx::createContext({{"RNDV_THRESH", std::to_string(_rndvThresh)}}, + ucxx::Context::defaultFeatureFlags); + _worker = _context->createWorker(_enableDelayedSubmission); if (_progressMode == ProgressMode::Blocking) { _worker->initBlockingProgressMode(); @@ -140,17 +150,29 @@ TEST_P(RequestTest, ProgressAm) GTEST_SKIP() << "Interrupting UCP worker progress operation in wait mode is not possible"; } + if (_registerCustomAmAllocator && _memoryType == UCS_MEMORY_TYPE_CUDA) { + _worker->registerAmAllocator(UCS_MEMORY_TYPE_CUDA, [](size_t length) { + return std::make_shared(length); + }); + } + allocate(1, false); // Submit and wait for transfers to complete std::vector> requests; - requests.push_back(_ep->amSend(_sendPtr[0], _messageSize)); + requests.push_back(_ep->amSend(_sendPtr[0], _messageSize, _memoryType)); requests.push_back(_ep->amRecv()); waitRequests(_worker, requests, _progressWorker); auto recvReq = requests[1]; _recvPtr[0] = recvReq->getRecvBuffer()->data(); + // Messages larger than `_rndvThresh` are rendezvous and will use custom allocator, + // smaller messages are eager and will always be host-allocated. + ASSERT_THAT(recvReq->getRecvBuffer()->getType(), + (_registerCustomAmAllocator && _messageSize >= _rndvThresh) ? _bufferType + : ucxx::BufferType::Host); + copyResults(); // Assert data correctness @@ -269,39 +291,43 @@ TEST_P(RequestTest, TagUserCallback) INSTANTIATE_TEST_SUITE_P(ProgressModes, RequestTest, Combine(Values(ucxx::BufferType::Host), + Values(false), Values(false), Values(ProgressMode::Polling, ProgressMode::Blocking, // ProgressMode::Wait, // Hangs on Stream ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), - Values(1, 1024, 1048576))); + Values(1, 1024, 2048, 1048576))); INSTANTIATE_TEST_SUITE_P(DelayedSubmission, RequestTest, Combine(Values(ucxx::BufferType::Host), + Values(false), Values(true), Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), - Values(1, 1024, 1048576))); + Values(1, 1024, 2048, 1048576))); #if UCXX_ENABLE_RMM INSTANTIATE_TEST_SUITE_P(RMMProgressModes, RequestTest, Combine(Values(ucxx::BufferType::RMM), + Values(false, true), Values(false), Values(ProgressMode::Polling, ProgressMode::Blocking, // ProgressMode::Wait, // Hangs on Stream ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), - Values(1, 1024, 1048576))); + Values(1, 1024, 2048, 1048576))); INSTANTIATE_TEST_SUITE_P(RMMDelayedSubmission, RequestTest, Combine(Values(ucxx::BufferType::RMM), + Values(false, true), Values(true), Values(ProgressMode::ThreadPolling, ProgressMode::ThreadBlocking), - Values(1, 1024, 1048576))); + Values(1, 1024, 2048, 1048576))); #endif } // namespace diff --git a/cpp/tests/worker.cpp b/cpp/tests/worker.cpp index 9424fe67..508b2006 100644 --- a/cpp/tests/worker.cpp +++ b/cpp/tests/worker.cpp @@ -81,7 +81,7 @@ TEST_F(WorkerTest, AmProbe) std::vector buf{123}; std::vector> requests; - requests.push_back(ep->amSend(buf.data(), buf.size() * sizeof(int))); + requests.push_back(ep->amSend(buf.data(), buf.size() * sizeof(int), UCS_MEMORY_TYPE_HOST)); waitRequests(_worker, requests, progressWorker); // Attempt to progress worker 10 times (arbitrarily defined). @@ -104,7 +104,7 @@ TEST_P(WorkerProgressTest, ProgressAm) std::vector send{123}; std::vector> requests; - requests.push_back(ep->amSend(send.data(), send.size() * sizeof(int))); + requests.push_back(ep->amSend(send.data(), send.size() * sizeof(int), UCS_MEMORY_TYPE_HOST)); requests.push_back(ep->amRecv()); waitRequests(_worker, requests, _progressWorker); From e6d96b93ba8ecc4643a21135feb901c98925cc62 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 18 May 2023 05:59:26 -0700 Subject: [PATCH 12/34] Add RMM AM allocator to Python --- cpp/src/request.cpp | 5 +- cpp/src/worker.cpp | 2 + python/ucxx/_lib/libucxx.pyx | 16 +++- python/ucxx/_lib/ucxx_api.pxd | 18 +++- .../_lib_async/tests/test_send_recv_am.py | 96 +++++++++++++++++++ 5 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 python/ucxx/_lib_async/tests/test_send_recv_am.py diff --git a/cpp/src/request.cpp b/cpp/src/request.cpp index 2e19c1ca..e0ad1c16 100644 --- a/cpp/src/request.cpp +++ b/cpp/src/request.cpp @@ -69,7 +69,7 @@ void Request::cancel() ucs_status_string(status)); } else { ucxx_trace_req_f(_ownerString.c_str(), _request, _operationName.c_str(), "canceling"); - ucp_request_cancel(_worker->getHandle(), _request); + if (_request != nullptr) ucp_request_cancel(_worker->getHandle(), _request); } } else { auto status = _status.load(); @@ -105,10 +105,11 @@ void Request::callback(void* request, ucs_status_t status) _callback.target()); if (_callback) _callback(status, _callbackData); - ucp_request_free(request); + if (request != nullptr) ucp_request_free(request); ucxx_trace("Request completed: %p, handle: %p", this, request); setStatus(status); + ucxx_trace("Request %p, isCompleted: %d", this, isCompleted()); } void Request::process() diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index 6bc0a4a0..2266d8cf 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -417,6 +417,8 @@ std::shared_ptr Worker::createListener(uint16_t port, void Worker::registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType allocator) { + if (_amData == nullptr) + throw std::runtime_error("Active Messages wasn not enabled during context creation"); _amData->_allocators.insert_or_assign(memoryType, allocator); } diff --git a/python/ucxx/_lib/libucxx.pyx b/python/ucxx/_lib/libucxx.pyx index 2f4b6bec..a7c760ae 100644 --- a/python/ucxx/_lib/libucxx.pyx +++ b/python/ucxx/_lib/libucxx.pyx @@ -18,6 +18,7 @@ from libcpp.functional cimport function from libcpp.map cimport map as cpp_map from libcpp.memory cimport ( dynamic_pointer_cast, + make_shared, make_unique, shared_ptr, unique_ptr, @@ -60,6 +61,11 @@ def _get_host_buffer(uintptr_t recv_buffer_ptr): return ptr_to_ndarray(host_buffer.release(), size) +cdef shared_ptr[Buffer] _rmm_am_allocator(size_t length): + cdef shared_ptr[RMMBuffer] rmm_buffer = make_shared[RMMBuffer](length) + return dynamic_pointer_cast[Buffer, RMMBuffer](rmm_buffer) + + ############################################################################### # Exceptions # ############################################################################### @@ -434,6 +440,10 @@ cdef class UCXWorker(): ): cdef bint ucxx_enable_delayed_submission = enable_delayed_submission cdef bint ucxx_enable_python_future = enable_python_future + cdef AmAllocatorType rmm_am_allocator + + self._context_feature_flags = (context.feature_flags) + with nogil: self._worker = createPythonWorker( context._context, @@ -442,7 +452,9 @@ cdef class UCXWorker(): ) self._enable_python_future = self._worker.get().isFutureEnabled() - self._context_feature_flags = (context.feature_flags) + if self._context_feature_flags & UCP_FEATURE_AM: + rmm_am_allocator = (&_rmm_am_allocator) + self._worker.get().registerAmAllocator(UCS_MEMORY_TYPE_CUDA, rmm_am_allocator) @property def handle(self): @@ -963,6 +975,7 @@ cdef class UCXEndpoint(): def am_send(self, Array arr): cdef void* buf = arr.ptr cdef size_t nbytes = arr.nbytes + cdef bint cuda_array = arr.cuda cdef shared_ptr[Request] req if not self._context_feature_flags & Feature.AM.value: @@ -972,6 +985,7 @@ cdef class UCXEndpoint(): req = self._endpoint.get().amSend( buf, nbytes, + UCS_MEMORY_TYPE_CUDA if cuda_array else UCS_MEMORY_TYPE_HOST, self._enable_python_future ) diff --git a/python/ucxx/_lib/ucxx_api.pxd b/python/ucxx/_lib/ucxx_api.pxd index e8a6e6d2..cde89643 100644 --- a/python/ucxx/_lib/ucxx_api.pxd +++ b/python/ucxx/_lib/ucxx_api.pxd @@ -71,9 +71,15 @@ cdef extern from "ucp/api/ucp.h" nogil: ctypedef enum ucs_status_t: pass + ctypedef enum ucs_memory_type_t: + pass + # Constants ucs_status_t UCS_OK + ucs_memory_type_t UCS_MEMORY_TYPE_HOST + ucs_memory_type_t UCS_MEMORY_TYPE_CUDA + int UCP_FEATURE_TAG int UCP_FEATURE_WAKEUP int UCP_FEATURE_STREAM @@ -152,16 +158,19 @@ cdef extern from "" namespace "ucxx" nogil: Invalid "ucxx::BufferType::Invalid" cdef cppclass Buffer: + Buffer(const BufferType bufferType, const size_t size_t) BufferType getType() size_t getSize() cdef cppclass HostBuffer: + HostBuffer(const size_t size_t) BufferType getType() size_t getSize() void* release() except +raise_py_error void* data() except +raise_py_error cdef cppclass RMMBuffer: + RMMBuffer(const size_t size_t) BufferType getType() size_t getSize() unique_ptr[device_buffer] release() except +raise_py_error @@ -177,6 +186,12 @@ cdef extern from "" namespace "ucxx" nogil: cdef extern from "" namespace "ucxx" nogil: + # Using function[Buffer] here doesn't seem possible due to Cython bugs/limitations. The + # workaround is to use a raw C function pointer and let it be parsed by the compiler. + # See https://github.com/cython/cython/issues/2041 and + # https://github.com/cython/cython/issues/3193 + ctypedef shared_ptr[Buffer] (*AmAllocatorType)(size_t) + ctypedef cpp_unordered_map[string, string] ConfigMap shared_ptr[Context] createContext( @@ -241,12 +256,13 @@ cdef extern from "" namespace "ucxx" nogil: ) except +raise_py_error bint isFutureEnabled() const bint amProbe(ucp_ep_h) const + void registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType allocator) cdef cppclass Endpoint(Component): ucp_ep_h getHandle() void close() shared_ptr[Request] amSend( - void* buffer, size_t length, bint enable_python_future + void* buffer, size_t length, ucs_memory_type_t memory_type, bint enable_python_future ) except +raise_py_error shared_ptr[Request] amRecv( bint enable_python_future diff --git a/python/ucxx/_lib_async/tests/test_send_recv_am.py b/python/ucxx/_lib_async/tests/test_send_recv_am.py new file mode 100644 index 00000000..25ac4efc --- /dev/null +++ b/python/ucxx/_lib_async/tests/test_send_recv_am.py @@ -0,0 +1,96 @@ +import asyncio +from functools import partial + +import numpy as np +import pytest +from utils import wait_listener_client_handlers + +import ucxx + +msg_sizes = [0] + [2**i for i in range(0, 25, 4)] + + +def _bytearray_assert_equal(a, b): + assert a == b + + +def get_data(): + ret = [ + { + "allocator": bytearray, + "generator": lambda n: bytearray(b"m" * n), + "validator": lambda recv, exp: _bytearray_assert_equal(bytes(recv), exp), + "memory_type": "host", + }, + { + "allocator": partial(np.ones, dtype=np.uint8), + "generator": partial(np.arange, dtype=np.int64), + "validator": lambda recv, exp: np.testing.assert_equal( + recv.view(np.int64), exp + ), + "memory_type": "host", + }, + ] + + try: + import cupy as cp + + ret.append( + { + "allocator": partial(cp.ones, dtype=np.uint8), + "generator": partial(cp.arange, dtype=np.int64), + "validator": lambda recv, exp: cp.testing.assert_array_equal( + cp.asarray(recv).view(np.int64), exp + ), + "memory_type": "cuda", + } + ) + except ImportError: + pass + + return ret + + +def simple_server(size, recv): + async def server(ep): + recv.append(await ep.am_recv()) + await ep.close() + + return server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("size", msg_sizes) +@pytest.mark.parametrize("progress_mode", ["polling", "thread", "thread-polling"]) +@pytest.mark.parametrize("recv_wait", [True, False]) +@pytest.mark.parametrize("data", get_data()) +async def test_send_recv_am(size, progress_mode, recv_wait, data): + rndv_thresh = 8192 + ucxx.init(options={"RNDV_THRESH": str(rndv_thresh)}, progress_mode=progress_mode) + + msg = data["generator"](size) + + recv = [] + listener = ucxx.create_listener(simple_server(size, recv)) + num_clients = 1 + clients = [ + await ucxx.create_endpoint(ucxx.get_address(), listener.port) + for i in range(num_clients) + ] + for c in clients: + if recv_wait: + # By sleeping here we ensure that the listener's + # ep.am_recv call will have to wait, rather than return + # immediately as receive data is already available. + await asyncio.sleep(1) + await c.am_send(msg) + for c in clients: + await c.close() + await wait_listener_client_handlers(listener) + + if data["memory_type"] == "cuda" and msg.nbytes < rndv_thresh: + # Eager messages are always received on the host, if no custom host + # allocator is registered, UCXX defaults to `np.array`. + np.testing.assert_equal(recv[0].view(np.int64), msg.get()) + else: + data["validator"](recv[0], msg) From 457c454f27a7897dfc0fac6d83b93ced85ca740f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 May 2023 05:28:08 -0700 Subject: [PATCH 13/34] Make class name consistent --- cpp/include/ucxx/constructors.h | 6 ++--- cpp/include/ucxx/internal/request_am.h | 12 ++++----- cpp/include/ucxx/request.h | 2 +- cpp/include/ucxx/request_am.h | 34 +++++++++++++------------- cpp/include/ucxx/worker.h | 10 ++++---- cpp/src/endpoint.cpp | 4 +-- cpp/src/internal/request_am.cpp | 2 +- cpp/src/request_am.cpp | 31 ++++++++++++----------- cpp/src/worker.cpp | 8 +++--- 9 files changed, 54 insertions(+), 55 deletions(-) diff --git a/cpp/include/ucxx/constructors.h b/cpp/include/ucxx/constructors.h index 4596e322..9cc5ad40 100644 --- a/cpp/include/ucxx/constructors.h +++ b/cpp/include/ucxx/constructors.h @@ -19,7 +19,7 @@ class Future; class Listener; class Notifier; class Request; -class RequestAM; +class RequestAm; class RequestStream; class RequestTag; class RequestTagMulti; @@ -54,7 +54,7 @@ std::shared_ptr createWorker(std::shared_ptr context, const bool enableDelayedSubmission); // Transfers -std::shared_ptr createRequestAMSend(std::shared_ptr endpoint, +std::shared_ptr createRequestAmSend(std::shared_ptr endpoint, void* buffer, size_t length, ucs_memory_type_t memoryType, @@ -62,7 +62,7 @@ std::shared_ptr createRequestAMSend(std::shared_ptr endpoin RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); -std::shared_ptr createRequestAMRecv(std::shared_ptr endpoint, +std::shared_ptr createRequestAmRecv(std::shared_ptr endpoint, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData); diff --git a/cpp/include/ucxx/internal/request_am.h b/cpp/include/ucxx/internal/request_am.h index fc00427f..a568e0ef 100644 --- a/cpp/include/ucxx/internal/request_am.h +++ b/cpp/include/ucxx/internal/request_am.h @@ -16,7 +16,7 @@ namespace ucxx { class Buffer; class InflightRequests; -class RequestAM; +class RequestAm; class Request; class Worker; @@ -28,7 +28,7 @@ class RecvAmMessage { public: internal::AmData* _amData{nullptr}; ///< Active messages data ucp_ep_h _ep{nullptr}; ///< Handle containing address of the reply endpoint - std::shared_ptr _request{ + std::shared_ptr _request{ nullptr}; ///< Request which will later be notified/delivered to user std::shared_ptr _buffer{nullptr}; ///< Buffer containing the received data @@ -45,13 +45,13 @@ class RecvAmMessage { */ RecvAmMessage(internal::AmData* amData, ucp_ep_h ep, - std::shared_ptr request, + std::shared_ptr request, std::shared_ptr buffer); /** * @brief Set the UCP request. * - * Set the underlying UCP request (`_request` attribute) of the `RequestAM`. + * Set the underlying UCP request (`_request` attribute) of the `RequestAm`. * * @param[in] request the UCP request associated to the active message receive operation. */ @@ -70,8 +70,8 @@ class RecvAmMessage { void callback(void* request, ucs_status_t status); }; -typedef std::unordered_map>> AmPoolType; -typedef std::unordered_map> RecvAmMessageMapType; +typedef std::unordered_map>> AmPoolType; +typedef std::unordered_map> RecvAmMessageMapType; class AmData { public: diff --git a/cpp/include/ucxx/request.h b/cpp/include/ucxx/request.h index 763582a0..c6c70571 100644 --- a/cpp/include/ucxx/request.h +++ b/cpp/include/ucxx/request.h @@ -197,7 +197,7 @@ class Request : public Component { * @brief Get the received buffer. * * This method is used to get the received buffer for applicable derived classes (e.g., - * `RequestAM` receive operations), in all other cases this will return `nullptr`. + * `RequestAm` receive operations), in all other cases this will return `nullptr`. * * @return The received buffer (if applicable) or `nullptr`. */ diff --git a/cpp/include/ucxx/request_am.h b/cpp/include/ucxx/request_am.h index cb61cb05..ecf3f845 100644 --- a/cpp/include/ucxx/request_am.h +++ b/cpp/include/ucxx/request_am.h @@ -20,7 +20,7 @@ namespace internal { class RecvAmMessage; } // namespace internal -class RequestAM : public Request { +class RequestAm : public Request { private: friend class internal::RecvAmMessage; @@ -28,16 +28,16 @@ class RequestAM : public Request { std::shared_ptr _buffer{nullptr}; ///< The AM received message buffer /** - * @brief Private constructor of `ucxx::RequestAM` send. + * @brief Private constructor of `ucxx::RequestAm` send. * - * This is the internal implementation of `ucxx::RequestAM` send constructor, made private + * This is the internal implementation of `ucxx::RequestAm` send constructor, made private * not to be called directly. This constructor is made private to ensure all UCXX objects * are shared pointers and the correct lifetime management of each one. * * Instead the user should use one of the following: * * - `ucxx::Endpoint::amSend()` - * - `ucxx::createRequestAMSend()` + * - `ucxx::createRequestAmSend()` * * @throws ucxx::Error if `endpoint` is not a valid `std::shared_ptr`. * @@ -50,7 +50,7 @@ class RequestAM : public Request { * @param[in] callbackFunction user-defined callback function to call upon completion. * @param[in] callbackData user-defined data to pass to the `callbackFunction`. */ - RequestAM(std::shared_ptr endpoint, + RequestAm(std::shared_ptr endpoint, void* buffer, size_t length, ucs_memory_type_t memoryType, @@ -59,16 +59,16 @@ class RequestAM : public Request { RequestCallbackUserData callbackData = nullptr); /** - * @brief Private constructor of `ucxx::RequestAM` receive. + * @brief Private constructor of `ucxx::RequestAm` receive. * - * This is the internal implementation of `ucxx::RequestAM` receive constructor, made + * This is the internal implementation of `ucxx::RequestAm` receive constructor, made * private not to be called directly. This constructor is made private to ensure all UCXX * objects are shared pointers and the correct lifetime management of each one. * * Instead the user should use one of the following: * * - `ucxx::Endpoint::amRecv()` - * - `ucxx::createRequestAMRecv()` + * - `ucxx::createRequestAmRecv()` * * @throws ucxx::Error if `endpointOrWorker` is not a valid * `std::shared_ptr` or @@ -82,16 +82,16 @@ class RequestAM : public Request { * @param[in] callbackFunction user-defined callback function to call upon completion. * @param[in] callbackData user-defined data to pass to the `callbackFunction`. */ - RequestAM(std::shared_ptr endpointOrWorker, + RequestAm(std::shared_ptr endpointOrWorker, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr); public: /** - * @brief Constructor for `std::shared_ptr` send. + * @brief Constructor for `std::shared_ptr` send. * - * The constructor for a `std::shared_ptr` object, creating a send active + * The constructor for a `std::shared_ptr` object, creating a send active * message request, returning a pointer to a request object that can be later awaited and * checked for errors. This is a non-blocking operation, and the status of the transfer * must be verified from the resulting request object before the data can be @@ -109,9 +109,9 @@ class RequestAM : public Request { * @param[in] callbackFunction user-defined callback function to call upon completion. * @param[in] callbackData user-defined data to pass to the `callbackFunction`. * - * @returns The `shared_ptr` object + * @returns The `shared_ptr` object */ - friend std::shared_ptr createRequestAMSend( + friend std::shared_ptr createRequestAmSend( std::shared_ptr endpoint, void* buffer, size_t length, @@ -121,9 +121,9 @@ class RequestAM : public Request { RequestCallbackUserData callbackData); /** - * @brief Constructor for `std::shared_ptr` send. + * @brief Constructor for `std::shared_ptr` receive. * - * The constructor for a `std::shared_ptr` object, creating a receive + * The constructor for a `std::shared_ptr` object, creating a receive * active message request, returning a pointer to a request object that can be later * awaited and checked for errors. This is a non-blocking operation, and the status of * the transfer must be verified from the resulting request object before the data can be @@ -141,7 +141,7 @@ class RequestAM : public Request { * * @returns The `shared_ptr` object */ - friend std::shared_ptr createRequestAMRecv( + friend std::shared_ptr createRequestAmRecv( std::shared_ptr endpoint, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, @@ -167,7 +167,7 @@ class RequestAM : public Request { * the header sent by the remote endpoint using the default allocator or one registered by * the user via `ucxx::Worker::registerAmAllocator()`. Following that, the message is * immediately received onto the buffer and a `UCS_OK` or the proper error status is set - * onto the `RequestAM` that is returned to the user, or will be later handled by another + * onto the `RequestAm` that is returned to the user, or will be later handled by another * callback when the message is ready. If the callback is executed when a user has already * requested received of the active message, the buffer and status will be set on the * earliest request, otherwise a new request is created and saved in a pool that will be diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index 22a6a1c5..159249b3 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -28,7 +28,7 @@ class Address; class Buffer; class Endpoint; class Listener; -class RequestAM; +class RequestAm; namespace internal { class AmData; @@ -54,7 +54,7 @@ class Worker : public Component { std::shared_ptr _delayedSubmissionCollection{ nullptr}; ///< Collection of enqueued delayed submissions - friend std::shared_ptr createRequestAMRecv( + friend std::shared_ptr createRequestAmRecv( std::shared_ptr endpoint, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, @@ -93,8 +93,8 @@ class Worker : public Component { * * @returns Request to be subsequently checked for the completion state and data. */ - std::shared_ptr getAmRecv( - ucp_ep_h ep, std::function()> createAmRecvRequestFunction); + std::shared_ptr getAmRecv( + ucp_ep_h ep, std::function()> createAmRecvRequestFunction); /** * @brief Stop the progress thread if running without raising warnings. @@ -709,7 +709,7 @@ class Worker : public Component { * * Checks the worker for any uncaught active messages. An uncaught active message is any * active message that has been fully or partially received by the worker, but not matched - * by a corresponding `createRequestAMRecv()` call. + * by a corresponding `createRequestAmRecv()` call. * * @code{.cpp} * // `worker` is `std::shared_ptr` diff --git a/cpp/src/endpoint.cpp b/cpp/src/endpoint.cpp index 2b4d1733..d074ff57 100644 --- a/cpp/src/endpoint.cpp +++ b/cpp/src/endpoint.cpp @@ -200,7 +200,7 @@ std::shared_ptr Endpoint::amSend(void* buffer, RequestCallbackUserData callbackData) { auto endpoint = std::dynamic_pointer_cast(shared_from_this()); - return registerInflightRequest(createRequestAMSend( + return registerInflightRequest(createRequestAmSend( endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); } @@ -210,7 +210,7 @@ std::shared_ptr Endpoint::amRecv(const bool enablePythonFuture, { auto endpoint = std::dynamic_pointer_cast(shared_from_this()); return registerInflightRequest( - createRequestAMRecv(endpoint, enablePythonFuture, callbackFunction, callbackData)); + createRequestAmRecv(endpoint, enablePythonFuture, callbackFunction, callbackData)); } std::shared_ptr Endpoint::streamSend(void* buffer, diff --git a/cpp/src/internal/request_am.cpp b/cpp/src/internal/request_am.cpp index 5006a86a..873d22e2 100644 --- a/cpp/src/internal/request_am.cpp +++ b/cpp/src/internal/request_am.cpp @@ -13,7 +13,7 @@ namespace internal { RecvAmMessage::RecvAmMessage(internal::AmData* amData, ucp_ep_h ep, - std::shared_ptr request, + std::shared_ptr request, std::shared_ptr buffer) : _amData(amData), _ep(ep), _request(request), _buffer(buffer) { diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 5fc86a71..c13dd9de 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -15,7 +15,7 @@ namespace ucxx { -std::shared_ptr createRequestAMSend( +std::shared_ptr createRequestAmSend( std::shared_ptr endpoint, void* buffer, size_t length, @@ -24,11 +24,11 @@ std::shared_ptr createRequestAMSend( RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - return std::shared_ptr(new RequestAM( + return std::shared_ptr(new RequestAm( endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); } -std::shared_ptr createRequestAMRecv( +std::shared_ptr createRequestAmRecv( std::shared_ptr endpoint, const bool enablePythonFuture = false, RequestCallbackUserFunction callbackFunction = nullptr, @@ -37,13 +37,13 @@ std::shared_ptr createRequestAMRecv( auto worker = endpoint->getWorker(endpoint->getParent()); auto createRequest = [endpoint, enablePythonFuture, callbackFunction, callbackData]() { - return std::shared_ptr( - new RequestAM(endpoint, enablePythonFuture, callbackFunction, callbackData)); + return std::shared_ptr( + new RequestAm(endpoint, enablePythonFuture, callbackFunction, callbackData)); }; return worker->getAmRecv(endpoint->getHandle(), createRequest); } -RequestAM::RequestAM(std::shared_ptr endpoint, +RequestAm::RequestAm(std::shared_ptr endpoint, void* buffer, size_t length, ucs_memory_type_t memoryType, @@ -65,7 +65,7 @@ RequestAM::RequestAM(std::shared_ptr endpoint, std::bind(std::mem_fn(&Request::populateDelayedSubmission), this)); } -RequestAM::RequestAM(std::shared_ptr endpointOrWorker, +RequestAm::RequestAm(std::shared_ptr endpointOrWorker, const bool enablePythonFuture, RequestCallbackUserFunction callbackFunction, RequestCallbackUserData callbackData) @@ -93,7 +93,7 @@ static void _recvCompletedCallback(void* request, recvAmMessage->callback(request, status); } -ucs_status_t RequestAM::recvCallback(void* arg, +ucs_status_t RequestAm::recvCallback(void* arg, const void* header, size_t header_length, void* data, @@ -109,14 +109,14 @@ ucs_status_t RequestAM::recvCallback(void* arg, ucxx_error("UCP_AM_RECV_ATTR_FIELD_REPLY_EP not set"); ucp_ep_h ep = param->reply_ep; - recvPool.try_emplace(ep, std::queue>()); + recvPool.try_emplace(ep, std::queue>()); bool is_rndv = param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV; std::shared_ptr buf{nullptr}; auto allocatorType = *reinterpret_cast(header); - std::shared_ptr req{nullptr}; + std::shared_ptr req{nullptr}; { std::lock_guard lock(amData->_mutex); @@ -126,8 +126,8 @@ ucs_status_t RequestAM::recvCallback(void* arg, recvWait.at(ep).pop(); ucxx_trace_req("amRecv recvWait: %p", req.get()); } else { - req = std::shared_ptr( - new RequestAM(worker, worker->isFutureEnabled(), nullptr, nullptr)); + req = std::shared_ptr( + new RequestAm(worker, worker->isFutureEnabled(), nullptr, nullptr)); recvPool.at(ep).push(req); ucxx_trace_req("amRecv recvPool: %p", req.get()); } @@ -191,9 +191,9 @@ ucs_status_t RequestAM::recvCallback(void* arg, } } -std::shared_ptr RequestAM::getRecvBuffer() { return _buffer; } +std::shared_ptr RequestAm::getRecvBuffer() { return _buffer; } -void RequestAM::request() +void RequestAm::request() { static const ucp_tag_t tagMask = -1; @@ -205,7 +205,6 @@ void RequestAM::request() .user_data = this}; _sendHeader = _delayedSubmission->_memoryType; - ucxx_debug("RequestAM::request memtype: %d", _sendHeader); if (_delayedSubmission->_send) { param.cb.send = _amSendCallback; @@ -222,7 +221,7 @@ void RequestAM::request() } } -void RequestAM::populateDelayedSubmission() +void RequestAm::populateDelayedSubmission() { request(); diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index 2266d8cf..61535b2e 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -50,7 +50,7 @@ Worker::Worker(std::shared_ptr context, const bool enableDelayedSubmiss UCP_AM_HANDLER_PARAM_FIELD_CB | UCP_AM_HANDLER_PARAM_FIELD_ARG, .id = AM_MSG_ID, - .cb = RequestAM::recvCallback, + .cb = RequestAm::recvCallback, .arg = _amData.get()}; utils::ucsErrorThrow(ucp_worker_set_am_recv_handler(_handle, &am_handler_param)); } @@ -102,8 +102,8 @@ void Worker::drainWorkerTagRecv() } } -std::shared_ptr Worker::getAmRecv( - ucp_ep_h ep, std::function()> createAmRecvRequestFunction) +std::shared_ptr Worker::getAmRecv( + ucp_ep_h ep, std::function()> createAmRecvRequestFunction) { std::lock_guard lock(_amData->_mutex); @@ -116,7 +116,7 @@ std::shared_ptr Worker::getAmRecv( return req; } else { auto req = createAmRecvRequestFunction(); - recvWait.try_emplace(ep, std::queue>()); + recvWait.try_emplace(ep, std::queue>()); recvWait.at(ep).push(req); return req; } From dece623ff828971fb5a069358b1e4e4815614ae5 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 May 2023 06:37:59 -0700 Subject: [PATCH 14/34] Fix `ucxx::RequestAm` trace logging --- cpp/include/ucxx/internal/request_am.h | 2 ++ cpp/python/src/worker.cpp | 9 ++++++++- cpp/src/request_am.cpp | 26 +++++++++++++++++++++++--- cpp/src/worker.cpp | 9 ++++++++- 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/cpp/include/ucxx/internal/request_am.h b/cpp/include/ucxx/internal/request_am.h index a568e0ef..0645ef44 100644 --- a/cpp/include/ucxx/internal/request_am.h +++ b/cpp/include/ucxx/internal/request_am.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,7 @@ typedef std::unordered_map> RecvAmMes class AmData { public: std::weak_ptr _worker{}; ///< The worker to which the Active Message callback belongs + std::string _ownerString{}; ///< The owner string used for logging AmPoolType _recvPool{}; ///< The pool of completed receive requests (waiting for user request) AmPoolType _recvWait{}; ///< The pool of user receive requests (waiting for message arrival) RecvAmMessageMapType diff --git a/cpp/python/src/worker.cpp b/cpp/python/src/worker.cpp index 41fe1b32..3ec8a92e 100644 --- a/cpp/python/src/worker.cpp +++ b/cpp/python/src/worker.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -37,7 +38,13 @@ std::shared_ptr<::ucxx::Worker> createWorker(std::shared_ptr context, // We can only get a `shared_ptr` for the Active Messages callback after it's // been created, thus this cannot be in the constructor. - if (worker->_amData != nullptr) worker->_amData->_worker = worker; + if (worker->_amData != nullptr) { + worker->_amData->_worker = worker; + + std::stringstream ownerStream; + ownerStream << "worker " << worker->getHandle(); + worker->_amData->_ownerString = ownerStream.str(); + } return worker; } diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index c13dd9de..50ca3913 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -4,6 +4,7 @@ */ #include #include +#include #include #include @@ -102,6 +103,7 @@ ucs_status_t RequestAm::recvCallback(void* arg, { internal::AmData* amData = reinterpret_cast(arg); auto worker = amData->_worker.lock(); + auto& ownerString = amData->_ownerString; auto& recvPool = amData->_recvPool; auto& recvWait = amData->_recvWait; @@ -159,7 +161,16 @@ ucs_status_t RequestAm::recvCallback(void* arg, ucs_status_ptr_t status = ucp_am_recv_data_nbx(worker->getHandle(), data, buf->data(), length, &request_param); - ucxx_debug("am rndv: ep %p len %lu", ep, length); + ucxx_trace_req_f( + ownerString.c_str(), + status, + "amRecv rndv", + "ep %p, buffer %p, size %lu, future %p, future handle %p, populateDelayedSubmission", + ep, + buf->data(), + length, + req->_future.get(), + req->_future->getHandle()); if (req->isCompleted()) { // The request completed/errored immediately @@ -180,11 +191,20 @@ ucs_status_t RequestAm::recvCallback(void* arg, return UCS_INPROGRESS; } } else { - ucxx_debug("am eager copying %lu bytes with ep %p", length, ep); - buf = amData->_allocators.at(UCS_MEMORY_TYPE_HOST)(length); if (length > 0) memcpy(buf->data(), data, length); + ucxx_trace_req_f( + ownerString.c_str(), + nullptr, + "amRecv eager", + "ep: %p, buffer %p, size %lu, future %p, future handle %p, populateDelayedSubmission", + ep, + buf->data(), + length, + req->_future.get(), + req->_future->getHandle()); + internal::RecvAmMessage recvAmMessage(amData, ep, req, buf); recvAmMessage.callback(nullptr, UCS_OK); return UCS_OK; diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index 61535b2e..867d7e1e 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -129,7 +130,13 @@ std::shared_ptr createWorker(std::shared_ptr context, // We can only get a `shared_ptr` for the Active Messages callback after it's // been created, thus this cannot be in the constructor. - if (worker->_amData != nullptr) worker->_amData->_worker = worker; + if (worker->_amData != nullptr) { + worker->_amData->_worker = worker; + + std::stringstream ownerStream; + ownerStream << "worker " << worker->getHandle(); + worker->_amData->_ownerString = ownerStream.str(); + } return worker; } From bbf6e27c014efea57e6cb0484d3b663dbb1e258e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 1 Jun 2023 13:52:13 -0700 Subject: [PATCH 15/34] Docstrings fixes --- cpp/include/ucxx/endpoint.h | 19 ++++++++++--------- cpp/include/ucxx/worker.h | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/include/ucxx/endpoint.h b/cpp/include/ucxx/endpoint.h index b93d9dc4..94764d9b 100644 --- a/cpp/include/ucxx/endpoint.h +++ b/cpp/include/ucxx/endpoint.h @@ -256,10 +256,10 @@ class Endpoint : public Component { /** * @brief Enqueue an active message send operation. * - * Enqueue an active message send operation, returning a `std::shared` that - * can be later awaited and checked for errors. This is a non-blocking operation, and the - * status of the transfer must be verified from the resulting request object before the - * data can be released. + * Enqueue an active message send operation, returning a `std::shared_ptr` + * that can be later awaited and checked for errors. This is a non-blocking operation, and + * the status of the transfer must be verified from the resulting request object before + * the data can be released. * * Using a Python future may be requested by specifying `enablePythonFuture`. If a * Python future is requested, the Python application must then await on this future to @@ -285,11 +285,12 @@ class Endpoint : public Component { /** * @brief Enqueue an active message receive operation. * - * Enqueue an active message receive operation, returning a `std::shared` - * that can be later awaited and checked for errors, making data available via the return - * value's `getRecvBuffer()` method once the operation completes successfully. This is a - * non-blocking operation, and the status of the transfer must be verified from the - * resulting request object before the data can be consumed. + * Enqueue an active message receive operation, returning a + * `std::shared_ptr` that can be later awaited and checked for errors, + * making data available via the return value's `getRecvBuffer()` method once the + * operation completes successfully. This is a non-blocking operation, and the status of + * the transfer must be verified from the resulting request object before the data can be + * consumed. * * Using a Python future may be requested by specifying `enablePythonFuture`. If a * Python future is requested, the Python application must then await on this future to diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index 159249b3..e185b8a2 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -699,7 +699,7 @@ class Worker : public Component { * @endcode * * @param[in] memoryType the memory type the allocator will be used for. - * @param[in] allocator the allocator callbable that will be used to allocate new + * @param[in] allocator the allocator callable that will be used to allocate new * active message buffers. */ void registerAmAllocator(ucs_memory_type_t memoryType, AmAllocatorType allocator); From ffdac108417c3ff73ff9314cb3f304989273d932 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 1 Jun 2023 13:54:16 -0700 Subject: [PATCH 16/34] Delete `ucxx::internal::RecvAmMessage` default/move/copy constructors --- cpp/include/ucxx/internal/request_am.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/include/ucxx/internal/request_am.h b/cpp/include/ucxx/internal/request_am.h index 0645ef44..73c6d8dd 100644 --- a/cpp/include/ucxx/internal/request_am.h +++ b/cpp/include/ucxx/internal/request_am.h @@ -33,6 +33,12 @@ class RecvAmMessage { nullptr}; ///< Request which will later be notified/delivered to user std::shared_ptr _buffer{nullptr}; ///< Buffer containing the received data + RecvAmMessage() = delete; + RecvAmMessage(const RecvAmMessage&) = delete; + RecvAmMessage& operator=(RecvAmMessage const&) = delete; + RecvAmMessage(RecvAmMessage&& o) = delete; + RecvAmMessage& operator=(RecvAmMessage&& o) = delete; + /** * @brief Constructor of `ucxx::RecvAmMessage`. * From bb4ff771285a4a0f8f3f784cd2d0d62e1478c73a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Jun 2023 12:24:44 -0700 Subject: [PATCH 17/34] Use more appropriate typecasting --- cpp/src/request_am.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 50ca3913..ac31b689 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -101,7 +101,7 @@ ucs_status_t RequestAm::recvCallback(void* arg, size_t length, const ucp_am_recv_param_t* param) { - internal::AmData* amData = reinterpret_cast(arg); + internal::AmData* amData = static_cast(arg); auto worker = amData->_worker.lock(); auto& ownerString = amData->_ownerString; auto& recvPool = amData->_recvPool; @@ -116,7 +116,7 @@ ucs_status_t RequestAm::recvCallback(void* arg, bool is_rndv = param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV; std::shared_ptr buf{nullptr}; - auto allocatorType = *reinterpret_cast(header); + auto allocatorType = *static_cast(header); std::shared_ptr req{nullptr}; From cbd99946090ccb42a839db6ec3cd7c660d11c6b2 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Jun 2023 12:27:02 -0700 Subject: [PATCH 18/34] Move declaration to initialization --- cpp/src/request_am.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index ac31b689..94fd1124 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -148,7 +148,7 @@ ucs_status_t RequestAm::recvCallback(void* arg, allocatorType = UCS_MEMORY_TYPE_HOST; } - buf = amData->_allocators.at(allocatorType)(length); + std::shared_ptr buf = amData->_allocators.at(allocatorType)(length); auto recvAmMessage = std::make_shared(amData, ep, req, buf); @@ -191,7 +191,7 @@ ucs_status_t RequestAm::recvCallback(void* arg, return UCS_INPROGRESS; } } else { - buf = amData->_allocators.at(UCS_MEMORY_TYPE_HOST)(length); + std::shared_ptr buf = amData->_allocators.at(UCS_MEMORY_TYPE_HOST)(length); if (length > 0) memcpy(buf->data(), data, length); ucxx_trace_req_f( From 5f6a207d441841f6c8b58e2cc278336db16362aa Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Jun 2023 12:38:05 -0700 Subject: [PATCH 19/34] Improve handling of `recvPool` and `recvWait` --- cpp/src/request_am.cpp | 11 ++++++----- cpp/src/worker.cpp | 13 +++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 94fd1124..9a80e450 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -111,7 +111,6 @@ ucs_status_t RequestAm::recvCallback(void* arg, ucxx_error("UCP_AM_RECV_ATTR_FIELD_REPLY_EP not set"); ucp_ep_h ep = param->reply_ep; - recvPool.try_emplace(ep, std::queue>()); bool is_rndv = param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV; @@ -123,14 +122,16 @@ ucs_status_t RequestAm::recvCallback(void* arg, { std::lock_guard lock(amData->_mutex); - if (recvWait.find(ep) != recvWait.end() && !recvWait[ep].empty()) { - req = recvWait.at(ep).front(); - recvWait.at(ep).pop(); + auto reqs = recvWait.find(ep); + if (reqs != recvWait.end() && !reqs->second.empty()) { + req = reqs->second.front(); + reqs->second.pop(); ucxx_trace_req("amRecv recvWait: %p", req.get()); } else { req = std::shared_ptr( new RequestAm(worker, worker->isFutureEnabled(), nullptr, nullptr)); - recvPool.at(ep).push(req); + auto [queue, _] = recvPool.try_emplace(ep, std::queue>()); + queue->second.push(req); ucxx_trace_req("amRecv recvPool: %p", req.get()); } } diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index 54d37f4f..c94329fa 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -114,14 +114,15 @@ std::shared_ptr Worker::getAmRecv( auto& recvPool = _amData->_recvPool; auto& recvWait = _amData->_recvWait; - if (recvPool.find(ep) != recvPool.end() && !recvPool.at(ep).empty()) { - auto req = recvPool.at(ep).front(); - recvPool.at(ep).pop(); + auto reqs = recvPool.find(ep); + if (reqs != recvPool.end() && !reqs->second.empty()) { + auto req = reqs->second.front(); + reqs->second.pop(); return req; } else { - auto req = createAmRecvRequestFunction(); - recvWait.try_emplace(ep, std::queue>()); - recvWait.at(ep).push(req); + auto req = createAmRecvRequestFunction(); + auto [queue, _] = recvWait.try_emplace(ep, std::queue>()); + queue->second.push(req); return req; } } From 29a5516ae4ccbec9708f96201c6c999121144490 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Jun 2023 12:54:03 -0700 Subject: [PATCH 20/34] Improve `waitRequests` C++ test util template --- cpp/tests/include/utils.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/tests/include/utils.h b/cpp/tests/include/utils.h index ca6cf36f..9110af50 100644 --- a/cpp/tests/include/utils.h +++ b/cpp/tests/include/utils.h @@ -23,15 +23,14 @@ enum class ProgressMode { void createCudaContextCallback(void* callbackArg); template -void waitRequests(std::shared_ptr worker, - const std::vector>& requests, - const std::function& progressWorker) +inline void waitRequests(std::shared_ptr worker, + const std::vector>& requests, + const std::function& progressWorker) { auto remainingRequests = requests; while (!remainingRequests.empty()) { auto updatedRequests = std::exchange(remainingRequests, decltype(remainingRequests)()); - remainingRequests.clear(); - for (auto& r : updatedRequests) { + for (auto const& r : updatedRequests) { if (progressWorker) progressWorker(); if (!r->isCompleted()) remainingRequests.push_back(r); From bd1787f0b17d505f6e4ec60037fc605d057bcd01 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Jun 2023 12:59:08 -0700 Subject: [PATCH 21/34] Improve Python logger string formatting --- python/ucxx/_lib_async/endpoint.py | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/python/ucxx/_lib_async/endpoint.py b/python/ucxx/_lib_async/endpoint.py index 1dff0c78..9097f981 100644 --- a/python/ucxx/_lib_async/endpoint.py +++ b/python/ucxx/_lib_async/endpoint.py @@ -59,7 +59,7 @@ def abort(self): To do that, use `Endpoint.close()` """ if self._ep is not None: - logger.debug("Endpoint.abort(): %s" % hex(self.uid)) + logger.debug("Endpoint.abort(): 0x%x" % self.uid) self._ep.close() self._ep = None self._ctx = None @@ -103,9 +103,9 @@ async def am_send(self, buffer): # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): nbytes = buffer.nbytes - log = "[AM Send #%03d] ep: %s, nbytes: %d, type: %s" % ( + log = "[AM Send #%03d] ep: 0x%x, nbytes: %d, type: %s" % ( self._send_count, - hex(self.uid), + self.uid, nbytes, type(buffer.obj), ) @@ -154,10 +154,10 @@ async def send(self, buffer, tag=None, force_tag=False): # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): nbytes = buffer.nbytes - log = "[Send #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( + log = "[Send #%03d] ep: 0x%x, tag: 0x%x, nbytes: %d, type: %s" % ( self._send_count, - hex(self.uid), - hex(tag), + self.uid, + tag, nbytes, type(buffer.obj), ) @@ -205,10 +205,10 @@ async def send_multi(self, buffers, tag=None, force_tag=False): # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): - log = "[Send Multi #%03d] ep: %s, tag: %s, nbytes: %s, type: %s" % ( + log = "[Send Multi #%03d] ep: 0x%x, tag: 0x%x, nbytes: %s, type: %s" % ( self._send_count, - hex(self.uid), - hex(tag), + self.uid, + tag, tuple([b.nbytes for b in buffers]), # nbytes, tuple([type(b.obj) for b in buffers]), ) @@ -258,9 +258,9 @@ async def am_recv(self): # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): - log = "[AM Recv #%03d] ep: %s" % ( + log = "[AM Recv #%03d] ep: 0x%x" % ( self._recv_count, - hex(self.uid), + self.uid, ) logger.debug(log) @@ -271,9 +271,9 @@ async def am_recv(self): buffer = req.get_recv_buffer() if logger.isEnabledFor(logging.DEBUG): - log = "[AM Recv Completed #%03d] ep: %s, nbytes: %d, type: %s" % ( + log = "[AM Recv Completed #%03d] ep: 0x%x, nbytes: %d, type: %s" % ( self._recv_count, - hex(self.uid), + self.uid, buffer.nbytes, type(buffer), ) @@ -323,10 +323,10 @@ async def recv(self, buffer, tag=None, force_tag=False): # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): nbytes = buffer.nbytes - log = "[Recv #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( + log = "[Recv #%03d] ep: 0x%x, tag: 0x%x, nbytes: %d, type: %s" % ( self._recv_count, - hex(self.uid), - hex(tag), + self.uid, + tag, nbytes, type(buffer.obj), ) @@ -373,10 +373,10 @@ async def recv_multi(self, tag=None, force_tag=False): # Optimization to eliminate producing logger string overhead if logger.isEnabledFor(logging.DEBUG): - log = "[Recv Multi #%03d] ep: %s, tag: %s" % ( + log = "[Recv Multi #%03d] ep: 0x%x, tag: 0x%x" % ( self._recv_count, - hex(self.uid), - hex(tag), + self.uid, + tag, ) logger.debug(log) From d7dcea00b25a3ce04b0f389f5ec633b43e11303a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 5 Jun 2023 07:35:08 -0700 Subject: [PATCH 22/34] Inherit `ucxx::Component` by `ucxx::DelayedSubmission` --- cpp/include/ucxx/delayed_submission.h | 3 ++- cpp/src/internal/request_am.cpp | 1 + cpp/src/request_am.cpp | 5 ++++- cpp/src/request_stream.cpp | 5 ++++- cpp/src/request_tag.cpp | 19 +++++++++++-------- 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/cpp/include/ucxx/delayed_submission.h b/cpp/include/ucxx/delayed_submission.h index a65e428b..3948b8d3 100644 --- a/cpp/include/ucxx/delayed_submission.h +++ b/cpp/include/ucxx/delayed_submission.h @@ -11,6 +11,7 @@ #include +#include #include namespace ucxx { @@ -19,7 +20,7 @@ typedef std::function DelayedSubmissionCallbackType; typedef std::shared_ptr DelayedSubmissionCallbackPtrType; -class DelayedSubmission { +class DelayedSubmission : public Component { public: bool _send{false}; ///< Whether this is a send (`true`) operation or recv (`false`) void* _buffer{nullptr}; ///< Raw pointer to data buffer diff --git a/cpp/src/internal/request_am.cpp b/cpp/src/internal/request_am.cpp index 873d22e2..442871b9 100644 --- a/cpp/src/internal/request_am.cpp +++ b/cpp/src/internal/request_am.cpp @@ -19,6 +19,7 @@ RecvAmMessage::RecvAmMessage(internal::AmData* amData, { _request->_delayedSubmission = std::make_shared(false, _buffer->data(), _buffer->getSize()); + _request->_delayedSubmission->setParent(_request); } void RecvAmMessage::setUcpRequest(void* request) { _request->_request = request; } diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 9a80e450..8a86c389 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -25,8 +25,11 @@ std::shared_ptr createRequestAmSend( RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - return std::shared_ptr(new RequestAm( + auto req = std::shared_ptr(new RequestAm( endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); + + req->_delayedSubmission->setParent(req); + return req; } std::shared_ptr createRequestAmRecv( diff --git a/cpp/src/request_stream.cpp b/cpp/src/request_stream.cpp index fd9ecffe..61bcf16d 100644 --- a/cpp/src/request_stream.cpp +++ b/cpp/src/request_stream.cpp @@ -38,8 +38,11 @@ std::shared_ptr createRequestStream(std::shared_ptr end size_t length, const bool enablePythonFuture = false) { - return std::shared_ptr( + auto req = std::shared_ptr( new RequestStream(endpoint, send, buffer, length, enablePythonFuture)); + + req->_delayedSubmission->setParent(req); + return req; } void RequestStream::request() diff --git a/cpp/src/request_tag.cpp b/cpp/src/request_tag.cpp index 12130c9f..cf747783 100644 --- a/cpp/src/request_tag.cpp +++ b/cpp/src/request_tag.cpp @@ -22,14 +22,17 @@ std::shared_ptr createRequestTag(std::shared_ptr endpoint RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - return std::shared_ptr(new RequestTag(endpointOrWorker, - send, - buffer, - length, - tag, - enablePythonFuture, - callbackFunction, - callbackData)); + auto req = std::shared_ptr(new RequestTag(endpointOrWorker, + send, + buffer, + length, + tag, + enablePythonFuture, + callbackFunction, + callbackData)); + + req->_delayedSubmission->setParent(req); + return req; } RequestTag::RequestTag(std::shared_ptr endpointOrWorker, From 6eb5f95b48c9f407d32a9a29267082ecaf1884bd Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 5 Jun 2023 07:45:25 -0700 Subject: [PATCH 23/34] Clarify the status of `ucxx::Request::getRecvBuffer()` --- cpp/include/ucxx/request.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/include/ucxx/request.h b/cpp/include/ucxx/request.h index c6c70571..3f4e6eb2 100644 --- a/cpp/include/ucxx/request.h +++ b/cpp/include/ucxx/request.h @@ -197,7 +197,11 @@ class Request : public Component { * @brief Get the received buffer. * * This method is used to get the received buffer for applicable derived classes (e.g., - * `RequestAm` receive operations), in all other cases this will return `nullptr`. + * `RequestAm` receive operations), in all other cases this will return `nullptr`. Before + * getting the received buffer it's necessary to check that the request completed + * successfully either by validating `getStatus() == UCS_OK` or by checking the request + * completed with `isCompleted() == true` and that it did not error with `checkError()`, + * if any of those is unsuccessful this call returns `nullptr`. * * @return The received buffer (if applicable) or `nullptr`. */ From 877004522267f3cf0c0c7df5b4c3eb62a53f550c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 6 Jun 2023 03:12:09 -0700 Subject: [PATCH 24/34] Fix for API changes --- cpp/src/request_am.cpp | 2 +- python/ucxx/_lib/libucxx.pyx | 4 +--- python/ucxx/_lib/ucxx_api.pxd | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 8a86c389..00f025a8 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -38,7 +38,7 @@ std::shared_ptr createRequestAmRecv( RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - auto worker = endpoint->getWorker(endpoint->getParent()); + auto worker = endpoint->getWorker(); auto createRequest = [endpoint, enablePythonFuture, callbackFunction, callbackData]() { return std::shared_ptr( diff --git a/python/ucxx/_lib/libucxx.pyx b/python/ucxx/_lib/libucxx.pyx index 6053112d..ec94fc25 100644 --- a/python/ucxx/_lib/libucxx.pyx +++ b/python/ucxx/_lib/libucxx.pyx @@ -981,14 +981,12 @@ cdef class UCXEndpoint(): def am_probe(self): cdef ucp_ep_h handle - cdef shared_ptr[Component] parent cdef shared_ptr[Worker] worker cdef bint ep_matched with nogil: handle = self._endpoint.get().getHandle() - parent = self._endpoint.get().getParent() - worker = self._endpoint.get().getWorker(parent) + worker = self._endpoint.get().getWorker() ep_matched = worker.get().amProbe(handle) return ep_matched diff --git a/python/ucxx/_lib/ucxx_api.pxd b/python/ucxx/_lib/ucxx_api.pxd index 053453c2..2bd5e1a7 100644 --- a/python/ucxx/_lib/ucxx_api.pxd +++ b/python/ucxx/_lib/ucxx_api.pxd @@ -243,7 +243,7 @@ cdef extern from "" namespace "ucxx" nogil: ) except +raise_py_error void stopProgressThread() except +raise_py_error size_t cancelInflightRequests() except +raise_py_error - bint tagProbe(ucp_tag_t) const + bint tagProbe(const ucp_tag_t) const void setProgressThreadStartCallback( function[void(void*)] callback, void* callbackArg ) @@ -297,7 +297,7 @@ cdef extern from "" namespace "ucxx" nogil: void setCloseCallback( function[void(void*)] close_callback, void* close_callback_arg ) - shared_ptr[Worker] getWorker(shared_ptr[Component] worker_or_listener) + shared_ptr[Worker] getWorker() cdef cppclass Listener(Component): shared_ptr[Endpoint] createEndpointFromConnRequest( From f52f145431b193d2d10cf4b3da15d4dd625ba383 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 6 Jun 2023 03:12:28 -0700 Subject: [PATCH 25/34] Pass missing `enableFuture` argument to `ucxx::Python::createWorker()` --- cpp/python/src/worker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/python/src/worker.cpp b/cpp/python/src/worker.cpp index 887d557e..9b8233df 100644 --- a/cpp/python/src/worker.cpp +++ b/cpp/python/src/worker.cpp @@ -34,7 +34,7 @@ std::shared_ptr<::ucxx::Worker> createWorker(std::shared_ptr context, const bool enableFuture) { auto worker = std::shared_ptr<::ucxx::python::Worker>( - new ::ucxx::python::Worker(context, enableDelayedSubmission)); + new ::ucxx::python::Worker(context, enableDelayedSubmission, enableFuture)); // We can only get a `shared_ptr` for the Active Messages callback after it's // been created, thus this cannot be in the constructor. From e7e1290520341e68c0d4995030951b4bf6560007 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 7 Jun 2023 01:35:48 -0700 Subject: [PATCH 26/34] Replace `posix` transport with `self` in `test_check_transport` For some reason the `posix` transport isn't available in CI, thus replace it with `self`, given the exact transport is not the object of `test_check_transport`. --- python/ucxx/_lib_async/tests/test_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ucxx/_lib_async/tests/test_info.py b/python/ucxx/_lib_async/tests/test_info.py index f95c4f47..d3ab8b59 100644 --- a/python/ucxx/_lib_async/tests/test_info.py +++ b/python/ucxx/_lib_async/tests/test_info.py @@ -25,11 +25,11 @@ def test_worker_info(): @pytest.mark.parametrize( "transports", - ["posix", "tcp", "posix,tcp"], + ["self", "tcp", "self,tcp"], ) def test_check_transport(transports): transports_list = transports.split(",") - inactive_transports = list(set(["posix", "tcp"]) - set(transports_list)) + inactive_transports = list(set(["self", "tcp"]) - set(transports_list)) ucxx.reset() options = {"TLS": transports, "NET_DEVICES": "all"} From 373802ed6b1fb8d841fda2035deca10be5975b60 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 7 Jun 2023 02:54:55 -0700 Subject: [PATCH 27/34] Fix null pointer dereferencing in log --- cpp/src/request_am.cpp | 56 +++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 00f025a8..39c9a40e 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -165,16 +165,24 @@ ucs_status_t RequestAm::recvCallback(void* arg, ucs_status_ptr_t status = ucp_am_recv_data_nbx(worker->getHandle(), data, buf->data(), length, &request_param); - ucxx_trace_req_f( - ownerString.c_str(), - status, - "amRecv rndv", - "ep %p, buffer %p, size %lu, future %p, future handle %p, populateDelayedSubmission", - ep, - buf->data(), - length, - req->_future.get(), - req->_future->getHandle()); + if (req->_enablePythonFuture) + ucxx_trace_req_f(ownerString.c_str(), + status, + "amRecv rndv", + "ep %p, buffer %p, size %lu, future %p, future handle %p, recvCallback", + ep, + buf->data(), + length, + req->_future.get(), + req->_future->getHandle()); + else + ucxx_trace_req_f(ownerString.c_str(), + status, + "amRecv rndv", + "ep %p, buffer %p, size %lu, recvCallback", + ep, + buf->data(), + length); if (req->isCompleted()) { // The request completed/errored immediately @@ -198,16 +206,24 @@ ucs_status_t RequestAm::recvCallback(void* arg, std::shared_ptr buf = amData->_allocators.at(UCS_MEMORY_TYPE_HOST)(length); if (length > 0) memcpy(buf->data(), data, length); - ucxx_trace_req_f( - ownerString.c_str(), - nullptr, - "amRecv eager", - "ep: %p, buffer %p, size %lu, future %p, future handle %p, populateDelayedSubmission", - ep, - buf->data(), - length, - req->_future.get(), - req->_future->getHandle()); + if (req->_enablePythonFuture) + ucxx_trace_req_f(ownerString.c_str(), + nullptr, + "amRecv eager", + "ep: %p, buffer %p, size %lu, future %p, future handle %p, recvCallback", + ep, + buf->data(), + length, + req->_future.get(), + req->_future->getHandle()); + else + ucxx_trace_req_f(ownerString.c_str(), + nullptr, + "amRecv eager", + "ep: %p, buffer %p, size %lu, recvCallback", + ep, + buf->data(), + length); internal::RecvAmMessage recvAmMessage(amData, ep, req, buf); recvAmMessage.callback(nullptr, UCS_OK); From 5c55f82cc1f52641f7fb2d7622aae25d26f6e9de Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 7 Jun 2023 05:03:28 -0700 Subject: [PATCH 28/34] Revert "Inherit `ucxx::Component` by `ucxx::DelayedSubmission`" This reverts commit d7dcea00b25a3ce04b0f389f5ec633b43e11303a. --- cpp/include/ucxx/delayed_submission.h | 3 +-- cpp/src/internal/request_am.cpp | 1 - cpp/src/request_am.cpp | 5 +---- cpp/src/request_stream.cpp | 5 +---- cpp/src/request_tag.cpp | 19 ++++++++----------- 5 files changed, 11 insertions(+), 22 deletions(-) diff --git a/cpp/include/ucxx/delayed_submission.h b/cpp/include/ucxx/delayed_submission.h index 3948b8d3..a65e428b 100644 --- a/cpp/include/ucxx/delayed_submission.h +++ b/cpp/include/ucxx/delayed_submission.h @@ -11,7 +11,6 @@ #include -#include #include namespace ucxx { @@ -20,7 +19,7 @@ typedef std::function DelayedSubmissionCallbackType; typedef std::shared_ptr DelayedSubmissionCallbackPtrType; -class DelayedSubmission : public Component { +class DelayedSubmission { public: bool _send{false}; ///< Whether this is a send (`true`) operation or recv (`false`) void* _buffer{nullptr}; ///< Raw pointer to data buffer diff --git a/cpp/src/internal/request_am.cpp b/cpp/src/internal/request_am.cpp index 442871b9..873d22e2 100644 --- a/cpp/src/internal/request_am.cpp +++ b/cpp/src/internal/request_am.cpp @@ -19,7 +19,6 @@ RecvAmMessage::RecvAmMessage(internal::AmData* amData, { _request->_delayedSubmission = std::make_shared(false, _buffer->data(), _buffer->getSize()); - _request->_delayedSubmission->setParent(_request); } void RecvAmMessage::setUcpRequest(void* request) { _request->_request = request; } diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index 39c9a40e..aa435be8 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -25,11 +25,8 @@ std::shared_ptr createRequestAmSend( RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - auto req = std::shared_ptr(new RequestAm( + return std::shared_ptr(new RequestAm( endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); - - req->_delayedSubmission->setParent(req); - return req; } std::shared_ptr createRequestAmRecv( diff --git a/cpp/src/request_stream.cpp b/cpp/src/request_stream.cpp index c5baca3e..6cfb5f61 100644 --- a/cpp/src/request_stream.cpp +++ b/cpp/src/request_stream.cpp @@ -38,11 +38,8 @@ std::shared_ptr createRequestStream(std::shared_ptr end size_t length, const bool enablePythonFuture = false) { - auto req = std::shared_ptr( + return std::shared_ptr( new RequestStream(endpoint, send, buffer, length, enablePythonFuture)); - - req->_delayedSubmission->setParent(req); - return req; } void RequestStream::request() diff --git a/cpp/src/request_tag.cpp b/cpp/src/request_tag.cpp index cf747783..12130c9f 100644 --- a/cpp/src/request_tag.cpp +++ b/cpp/src/request_tag.cpp @@ -22,17 +22,14 @@ std::shared_ptr createRequestTag(std::shared_ptr endpoint RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - auto req = std::shared_ptr(new RequestTag(endpointOrWorker, - send, - buffer, - length, - tag, - enablePythonFuture, - callbackFunction, - callbackData)); - - req->_delayedSubmission->setParent(req); - return req; + return std::shared_ptr(new RequestTag(endpointOrWorker, + send, + buffer, + length, + tag, + enablePythonFuture, + callbackFunction, + callbackData)); } RequestTag::RequestTag(std::shared_ptr endpointOrWorker, From 88e8b349406468bda9d68aae893d430f72e75c4e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 5 Jun 2023 04:42:47 -0700 Subject: [PATCH 29/34] Ensure `ucxx::Request` lifetime for delayed submission --- cpp/include/ucxx/delayed_submission.h | 9 ++++---- cpp/include/ucxx/worker.h | 5 ++++- cpp/src/delayed_submission.cpp | 17 +++++++-------- cpp/src/request_am.cpp | 16 +++++++------- cpp/src/request_stream.cpp | 17 ++++++++------- cpp/src/request_tag.cpp | 30 ++++++++++++++------------- cpp/src/worker.cpp | 5 +++-- 7 files changed, 54 insertions(+), 45 deletions(-) diff --git a/cpp/include/ucxx/delayed_submission.h b/cpp/include/ucxx/delayed_submission.h index a65e428b..01595913 100644 --- a/cpp/include/ucxx/delayed_submission.h +++ b/cpp/include/ucxx/delayed_submission.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -17,8 +18,6 @@ namespace ucxx { typedef std::function DelayedSubmissionCallbackType; -typedef std::shared_ptr DelayedSubmissionCallbackPtrType; - class DelayedSubmission { public: bool _send{false}; ///< Whether this is a send (`true`) operation or recv (`false`) @@ -59,7 +58,7 @@ class DelayedSubmission { class DelayedSubmissionCollection { private: - std::vector + std::vector, DelayedSubmissionCallbackType>> _collection{}; ///< The collection of all known delayed submission operations. std::mutex _mutex{}; ///< Mutex to provide access to the collection. @@ -94,10 +93,12 @@ class DelayedSubmissionCollection { * Register a request for delayed submission with a callback that will be executed when * the request is in fact submitted when `process()` is called. * + * @param[in] request the request to which the callback belongs, ensuring it remains + * alive until the callback is invoked. * @param[in] callback the callback that will be executed by `process()` when the * operation is submitted. */ - void registerRequest(DelayedSubmissionCallbackType callback); + void registerRequest(std::shared_ptr request, DelayedSubmissionCallbackType callback); }; } // namespace ucxx diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index 922e4e74..b5fee215 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -386,10 +386,13 @@ class Worker : public Component { * thread, thus decreasing computation on the caller thread, but potentially increasing * transfer latency. * + * @param[in] request the request to which the callback belongs, ensuring it remains + * alive until the callback is invoked. * @param[in] callback the callback set to execute the UCP transfer routine during the * worker thread loop. */ - void registerDelayedSubmission(DelayedSubmissionCallbackType callback); + void registerDelayedSubmission(std::shared_ptr request, + DelayedSubmissionCallbackType callback); /** * @brief Inquire if worker has been created with delayed submission enabled. diff --git a/cpp/src/delayed_submission.cpp b/cpp/src/delayed_submission.cpp index b809a0d7..aa962464 100644 --- a/cpp/src/delayed_submission.cpp +++ b/cpp/src/delayed_submission.cpp @@ -35,26 +35,25 @@ void DelayedSubmissionCollection::process() toProcess = std::move(_collection); } - for (auto& callbackPtr : toProcess) { - auto& callback = *callbackPtr; + for (auto& pair : toProcess) { + auto& req = pair.first; + auto& callback = pair.second; - ucxx_trace_req("Submitting request: %p", callback.target)>()); + ucxx_trace_req("Submitting request: %p", req.get()); if (callback) callback(); } } } -void DelayedSubmissionCollection::registerRequest(DelayedSubmissionCallbackType callback) +void DelayedSubmissionCollection::registerRequest(std::shared_ptr request, + DelayedSubmissionCallbackType callback) { - auto r = std::make_shared(callback); - { std::lock_guard lock(_mutex); - _collection.push_back(r); + _collection.push_back(std::make_pair(request, callback)); } - ucxx_trace_req("Registered submit request: %p", - callback.target)>()); + ucxx_trace_req("Registered submit request: %p", request.get()); } } // namespace ucxx diff --git a/cpp/src/request_am.cpp b/cpp/src/request_am.cpp index aa435be8..c09bfcab 100644 --- a/cpp/src/request_am.cpp +++ b/cpp/src/request_am.cpp @@ -25,8 +25,16 @@ std::shared_ptr createRequestAmSend( RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - return std::shared_ptr(new RequestAm( + auto req = std::shared_ptr(new RequestAm( endpoint, buffer, length, memoryType, enablePythonFuture, callbackFunction, callbackData)); + + // A delayed notification request is not populated immediately, instead it is + // delayed to allow the worker progress thread to set its status, and more + // importantly the Python future later on, so that we don't need the GIL here. + req->_worker->registerDelayedSubmission( + req, std::bind(std::mem_fn(&Request::populateDelayedSubmission), req.get())); + + return req; } std::shared_ptr createRequestAmRecv( @@ -58,12 +66,6 @@ RequestAm::RequestAm(std::shared_ptr endpoint, { _callback = callbackFunction; _callbackData = callbackData; - - // A delayed notification request is not populated immediately, instead it is - // delayed to allow the worker progress thread to set its status, and more - // importantly the Python future later on, so that we don't need the GIL here. - _worker->registerDelayedSubmission( - std::bind(std::mem_fn(&Request::populateDelayedSubmission), this)); } RequestAm::RequestAm(std::shared_ptr endpointOrWorker, diff --git a/cpp/src/request_stream.cpp b/cpp/src/request_stream.cpp index 6cfb5f61..c93b447d 100644 --- a/cpp/src/request_stream.cpp +++ b/cpp/src/request_stream.cpp @@ -23,13 +23,6 @@ RequestStream::RequestStream(std::shared_ptr endpoint, enablePythonFuture), _length(length) { - auto worker = endpoint->getWorker(); - - // A delayed notification request is not populated immediately, instead it is - // delayed to allow the worker progress thread to set its status, and more - // importantly the Python future later on, so that we don't need the GIL here. - worker->registerDelayedSubmission( - std::bind(std::mem_fn(&Request::populateDelayedSubmission), this)); } std::shared_ptr createRequestStream(std::shared_ptr endpoint, @@ -38,8 +31,16 @@ std::shared_ptr createRequestStream(std::shared_ptr end size_t length, const bool enablePythonFuture = false) { - return std::shared_ptr( + auto req = std::shared_ptr( new RequestStream(endpoint, send, buffer, length, enablePythonFuture)); + + // A delayed notification request is not populated immediately, instead it is + // delayed to allow the worker progress thread to set its status, and more + // importantly the Python future later on, so that we don't need the GIL here. + req->_worker->registerDelayedSubmission( + req, std::bind(std::mem_fn(&Request::populateDelayedSubmission), req.get())); + + return req; } void RequestStream::request() diff --git a/cpp/src/request_tag.cpp b/cpp/src/request_tag.cpp index 12130c9f..b6a4cf5e 100644 --- a/cpp/src/request_tag.cpp +++ b/cpp/src/request_tag.cpp @@ -22,14 +22,22 @@ std::shared_ptr createRequestTag(std::shared_ptr endpoint RequestCallbackUserFunction callbackFunction = nullptr, RequestCallbackUserData callbackData = nullptr) { - return std::shared_ptr(new RequestTag(endpointOrWorker, - send, - buffer, - length, - tag, - enablePythonFuture, - callbackFunction, - callbackData)); + auto req = std::shared_ptr(new RequestTag(endpointOrWorker, + send, + buffer, + length, + tag, + enablePythonFuture, + callbackFunction, + callbackData)); + + // A delayed notification request is not populated immediately, instead it is + // delayed to allow the worker progress thread to set its status, and more + // importantly the Python future later on, so that we don't need the GIL here. + req->_worker->registerDelayedSubmission( + req, std::bind(std::mem_fn(&Request::populateDelayedSubmission), req.get())); + + return req; } RequestTag::RequestTag(std::shared_ptr endpointOrWorker, @@ -50,12 +58,6 @@ RequestTag::RequestTag(std::shared_ptr endpointOrWorker, throw ucxx::Error("An endpoint is required to send tag messages"); _callback = callbackFunction; _callbackData = callbackData; - - // A delayed notification request is not populated immediately, instead it is - // delayed to allow the worker progress thread to set its status, and more - // importantly the Python future later on, so that we don't need the GIL here. - _worker->registerDelayedSubmission( - std::bind(std::mem_fn(&Request::populateDelayedSubmission), this)); } void RequestTag::callback(void* request, ucs_status_t status, const ucp_tag_recv_info_t* info) diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index c94329fa..946ec8cd 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -261,12 +261,13 @@ bool Worker::progress() return ret; } -void Worker::registerDelayedSubmission(DelayedSubmissionCallbackType callback) +void Worker::registerDelayedSubmission(std::shared_ptr request, + DelayedSubmissionCallbackType callback) { if (_delayedSubmissionCollection == nullptr) { callback(); } else { - _delayedSubmissionCollection->registerRequest(callback); + _delayedSubmissionCollection->registerRequest(request, callback); /* Waking the progress event is needed here because the UCX request is * not dispatched immediately. Thus we must signal the progress task so From 11769eac6900757be28ca898a64a3bcf503b03b5 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 31 May 2023 12:05:07 -0700 Subject: [PATCH 30/34] Prevent enabling delayed submission in non-thread progress mode --- python/ucxx/_lib_async/application_context.py | 16 +++++-- python/ucxx/_lib_async/tests/test_worker.py | 43 ++++++++++++++----- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/python/ucxx/_lib_async/application_context.py b/python/ucxx/_lib_async/application_context.py index d0c2ab5a..653ff6cd 100644 --- a/python/ucxx/_lib_async/application_context.py +++ b/python/ucxx/_lib_async/application_context.py @@ -41,7 +41,8 @@ def __init__( self.progress_mode = ApplicationContext._check_progress_mode(progress_mode) enable_delayed_submission = ApplicationContext._check_enable_delayed_submission( - enable_delayed_submission + enable_delayed_submission, + self.progress_mode, ) enable_python_future = ApplicationContext._check_enable_python_future( enable_python_future, self.progress_mode @@ -84,7 +85,7 @@ def _check_progress_mode(progress_mode): return progress_mode @staticmethod - def _check_enable_delayed_submission(enable_delayed_submission): + def _check_enable_delayed_submission(enable_delayed_submission, progress_mode): if enable_delayed_submission is None: if "UCXPY_ENABLE_DELAYED_SUBMISSION" in os.environ: explicit_enable_delayed_submission = ( @@ -93,10 +94,19 @@ def _check_enable_delayed_submission(enable_delayed_submission): else True ) else: - explicit_enable_delayed_submission = True + explicit_enable_delayed_submission = progress_mode.startswith("thread") else: explicit_enable_delayed_submission = enable_delayed_submission + if ( + not progress_mode.startswith("thread") + and explicit_enable_delayed_submission + ): + raise ValueError( + f"Delayed submission requested, but {progress_mode} does not " + "support it, 'thread' or 'thread-polling' progress mode required." + ) + return explicit_enable_delayed_submission @staticmethod diff --git a/python/ucxx/_lib_async/tests/test_worker.py b/python/ucxx/_lib_async/tests/test_worker.py index 71c33074..2dd73422 100644 --- a/python/ucxx/_lib_async/tests/test_worker.py +++ b/python/ucxx/_lib_async/tests/test_worker.py @@ -15,15 +15,27 @@ async def test_worker_capabilities_args( enable_delayed_submission, enable_python_future ): - ucxx.init( - enable_delayed_submission=enable_delayed_submission, - enable_python_future=enable_python_future, - ) + progress_mode = os.getenv("UCXPY_PROGRESS_MODE", "thread") - worker = ucxx.core._get_ctx().worker + if enable_delayed_submission and not progress_mode.startswith("thread"): + with pytest.raises(ValueError, match="Delayed submission requested, but"): + ucxx.init( + enable_delayed_submission=enable_delayed_submission, + enable_python_future=enable_python_future, + ) + else: + ucxx.init( + enable_delayed_submission=enable_delayed_submission, + enable_python_future=enable_python_future, + ) - assert worker.is_delayed_submission_enabled() is enable_delayed_submission - assert worker.is_python_future_enabled() is enable_python_future + worker = ucxx.core._get_ctx().worker + + assert worker.is_delayed_submission_enabled() is enable_delayed_submission + if progress_mode.startswith("thread"): + assert worker.is_python_future_enabled() is enable_python_future + else: + assert worker.is_python_future_enabled() is False @pytest.mark.asyncio @@ -39,9 +51,18 @@ async def test_worker_capabilities_env(enable_delayed_submission, enable_python_ "UCXPY_ENABLE_PYTHON_FUTURE": "1" if enable_python_future else "0", }, ): - ucxx.init() + progress_mode = os.getenv("UCXPY_PROGRESS_MODE", "thread") - worker = ucxx.core._get_ctx().worker + if enable_delayed_submission and not progress_mode.startswith("thread"): + with pytest.raises(ValueError, match="Delayed submission requested, but"): + ucxx.init() + else: + ucxx.init() - assert worker.is_delayed_submission_enabled() is enable_delayed_submission - assert worker.is_python_future_enabled() is enable_python_future + worker = ucxx.core._get_ctx().worker + + assert worker.is_delayed_submission_enabled() is enable_delayed_submission + if progress_mode.startswith("thread"): + assert worker.is_python_future_enabled() is enable_python_future + else: + assert worker.is_python_future_enabled() is False From 09ebeda50c78450f2477768b9e47eece50618f69 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 9 Jun 2023 09:18:03 -0700 Subject: [PATCH 31/34] Remove now irrelevant TODO comment --- cpp/src/request_tag_multi.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/request_tag_multi.cpp b/cpp/src/request_tag_multi.cpp index eca92695..1883edbb 100644 --- a/cpp/src/request_tag_multi.cpp +++ b/cpp/src/request_tag_multi.cpp @@ -202,7 +202,6 @@ void RequestTagMulti::callback(ucs_status_t status) { if (_send) throw std::runtime_error("Send requests cannot call callback()"); - // TODO: Remove arg ucxx_trace_req("RequestTagMulti::callback request: %p, tag: %lx", this, _tag); if (_bufferRequests.empty()) { From bd5ac6623d661cacebb4a84823f052c51f27d0ea Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 9 Jun 2023 09:58:35 -0700 Subject: [PATCH 32/34] Remove `const` qualifier from `ucxx::Worker::tagProbe()` Required until progressing the worker is not necessary within the call anymore. --- cpp/include/ucxx/worker.h | 2 +- cpp/src/worker.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/ucxx/worker.h b/cpp/include/ucxx/worker.h index b5fee215..520be6c7 100644 --- a/cpp/include/ucxx/worker.h +++ b/cpp/include/ucxx/worker.h @@ -569,7 +569,7 @@ class Worker : public Component { * * @returns `true` if any uncaught messages were received, `false` otherwise. */ - bool tagProbe(const ucp_tag_t tag) const; + bool tagProbe(const ucp_tag_t tag); /** * @brief Enqueue a tag receive operation. diff --git a/cpp/src/worker.cpp b/cpp/src/worker.cpp index dff72429..bc7baf82 100644 --- a/cpp/src/worker.cpp +++ b/cpp/src/worker.cpp @@ -375,7 +375,7 @@ void Worker::removeInflightRequest(const Request* const request) } } -bool Worker::tagProbe(const ucp_tag_t tag) const +bool Worker::tagProbe(const ucp_tag_t tag) { // TODO: Fix temporary workaround, if progress thread is active we must wait for it // to progress the worker instead. From a24ca65d7c5ce7d953aca77f99f87227a1195531 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 9 Jun 2023 20:29:03 +0200 Subject: [PATCH 33/34] Replace `std::make_pair` by initializer list Co-authored-by: Lawrence Mitchell --- cpp/src/delayed_submission.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/delayed_submission.cpp b/cpp/src/delayed_submission.cpp index aa962464..ee768333 100644 --- a/cpp/src/delayed_submission.cpp +++ b/cpp/src/delayed_submission.cpp @@ -51,7 +51,7 @@ void DelayedSubmissionCollection::registerRequest(std::shared_ptr reque { { std::lock_guard lock(_mutex); - _collection.push_back(std::make_pair(request, callback)); + _collection.push_back({request, callback}); } ucxx_trace_req("Registered submit request: %p", request.get()); } From 3a3aa038e9f359df897dd0027bc078329e40b9f0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 9 Jun 2023 11:29:43 -0700 Subject: [PATCH 34/34] Limit progress mode for `test_send_recv_am` Follow the default of other tests for now, relying on the default progress mode or that specified via environment variables. --- python/ucxx/_lib_async/tests/test_send_recv_am.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/ucxx/_lib_async/tests/test_send_recv_am.py b/python/ucxx/_lib_async/tests/test_send_recv_am.py index 25ac4efc..3a250448 100644 --- a/python/ucxx/_lib_async/tests/test_send_recv_am.py +++ b/python/ucxx/_lib_async/tests/test_send_recv_am.py @@ -61,12 +61,11 @@ async def server(ep): @pytest.mark.asyncio @pytest.mark.parametrize("size", msg_sizes) -@pytest.mark.parametrize("progress_mode", ["polling", "thread", "thread-polling"]) @pytest.mark.parametrize("recv_wait", [True, False]) @pytest.mark.parametrize("data", get_data()) -async def test_send_recv_am(size, progress_mode, recv_wait, data): +async def test_send_recv_am(size, recv_wait, data): rndv_thresh = 8192 - ucxx.init(options={"RNDV_THRESH": str(rndv_thresh)}, progress_mode=progress_mode) + ucxx.init(options={"RNDV_THRESH": str(rndv_thresh)}) msg = data["generator"](size)