[SYCL][CUDA] Remove pi Event Callback implementation
Since the introduction of host tasks in #1471,
`piEventCallback` and related functionality are no longer
required by the SYCL-RT.
Removing the implementation of this behaviour from the CUDA
backend simplifies the submission of operations to streams and
improves overall performance.

Signed-off-by: Ruyman Reyes <[email protected]>
Ruyk committed May 20, 2020
1 parent ae3fd5c commit 1ef4cd6
Showing 2 changed files with 36 additions and 219 deletions.
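
The commit message points to host tasks (#1471) as the reason `piEventCallback` is no longer needed: "run this host code once an event completes" can be expressed at the SYCL level as a host task with a dependency. The sketch below illustrates that pattern; it assumes the SYCL 2020 `host_task` interface (the host-task work referenced in #1471 still used an experimental spelling at the time) and is an illustration only, not code from this change.

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;

  // Device work whose completion we want to observe from the host.
  sycl::event e = q.submit([&](sycl::handler &cgh) {
    cgh.single_task([] { /* device work */ });
  });

  // Instead of registering an event callback with the plugin, enqueue a
  // host task that depends on the event; the runtime runs it on the host
  // once `e` has completed.
  q.submit([&](sycl::handler &cgh) {
    cgh.depends_on(e);
    cgh.host_task([] { std::cout << "device work finished\n"; });
  });

  q.wait();
  return 0;
}
```

Because the runtime can schedule such host work itself, the CUDA plugin no longer has to attach host functions to streams on the runtime's behalf, which is what the diff below removes.
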
154 changes: 29 additions & 125 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -276,13 +276,15 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
queue_{queue}, context_{context} {

if (is_native_event()) {
if (type != PI_COMMAND_TYPE_USER) {
PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));

if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
}
} else {
cl::sycl::detail::pi::die("User-defined events not implemented");
}

if (queue_ != nullptr) {
@@ -303,7 +305,7 @@ pi_result _pi_event::start() {
pi_result result;

try {
if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
// NOTE: This relies on the default stream to be unused.
result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
@@ -313,8 +315,6 @@ pi_result _pi_event::start() {
}

isStarted_ = true;
// let observers know that the event is "submitted"
trigger_callback(get_execution_status());
return result;
}

@@ -351,37 +351,16 @@ pi_result _pi_event::record() {

pi_result result = PI_INVALID_OPERATION;

if (is_native_event()) {

if (!queue_) {
return PI_INVALID_QUEUE;
}
if (!queue_) {
return PI_INVALID_QUEUE;
}

CUstream cuStream = queue_->get();
CUstream cuStream = queue_->get();

try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));

result = cuda_piEventRetain(this);
try {
result = PI_CHECK_ERROR(cuLaunchHostFunc(
cuStream,
[](void *userData) {
pi_event event = reinterpret_cast<pi_event>(userData);
event->set_event_complete();
cuda_piEventRelease(event);
},
this));
} catch (...) {
// If host function fails to enqueue we must release the event here
result = cuda_piEventRelease(this);
throw;
}
} catch (pi_result error) {
result = error;
}
} else {
result = PI_SUCCESS;
try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
} catch (pi_result error) {
result = error;
}

if (result == PI_SUCCESS) {
@@ -392,65 +371,23 @@ pi_result _pi_event::record() {
}

pi_result _pi_event::wait() {

pi_result retErr;
if (is_native_event()) {
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}
} else {

while (!is_completed()) {
// wait for user event to complete
}
retErr = PI_SUCCESS;
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}

auto is_success = retErr == PI_SUCCESS;
auto status = is_success ? get_execution_status() : pi_int32(retErr);

trigger_callback(status);

return retErr;
}

// makes all future work submitted to queue wait for all work captured in event.
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
if (event->is_native_event()) {

// for native events, the cuStreamWaitEvent call is used.
// This makes all future work submitted to stream wait for all
// work captured in event.

return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));

} else {

// for user events, we enqueue a callback. When invoked, the
// callback will block until the user event is marked as
// completed.

static auto user_wait_func = [](void *user_data) {
// The host function must not make any CUDA API calls.
auto event = static_cast<pi_event>(user_data);

// busy wait for user event to complete
event->wait();

// this function does not need the event to be kept alive
// anymore
cuda_piEventRelease(event);
};

// retain event to ensure it is still alive when the
// user_wait_func callback is invoked
cuda_piEventRetain(event);

return PI_CHECK_ERROR(cuLaunchHostFunc(queue->get(), user_wait_func, event));
}
// for native events, the cuStreamWaitEvent call is used.
// This makes all future work submitted to stream wait for all
// work captured in event.
return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
}

_pi_program::_pi_program(pi_context ctxt)
@@ -2763,37 +2700,13 @@ pi_result cuda_piEventSetCallback(pi_event event,
pi_int32 command_exec_callback_type,
pfn_notify notify, void *user_data) {

assert(event);
assert(notify);
assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
command_exec_callback_type == PI_EVENT_RUNNING ||
command_exec_callback_type == PI_EVENT_COMPLETE);
event_callback callback(pi_event_status(command_exec_callback_type), notify,
user_data);

event->set_event_callback(callback);

cl::sycl::detail::pi::die("Event Callback not implemented");
return PI_SUCCESS;
}

pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {

assert(execution_status >= PI_EVENT_COMPLETE &&
execution_status <= PI_EVENT_QUEUED);

if (!event || event->is_native_event()) {
return PI_INVALID_EVENT;
}

if (execution_status == PI_EVENT_COMPLETE) {
return event->set_event_complete();
} else if (execution_status < 0) {
// TODO: A negative integer value causes all enqueued commands that wait
// on this user event to be terminated.
cl::sycl::detail::pi::die("cuda_piEventSetStatus support for negative execution_status not "
"implemented.");
}

cl::sycl::detail::pi::die("Event Set Status not implemented");
return PI_INVALID_VALUE;
}

@@ -2821,19 +2734,13 @@ pi_result cuda_piEventRelease(pi_event event) {
if (event->decrement_reference_count() == 0) {
std::unique_ptr<_pi_event> event_ptr{event};
pi_result result = PI_INVALID_EVENT;

if (event->is_native_event()) {
try {
ScopedContext active(event->get_context());
auto cuEvent = event->get();
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
} catch (...) {
result = PI_OUT_OF_RESOURCES;
}
} else {
result = PI_SUCCESS;
try {
ScopedContext active(event->get_context());
auto cuEvent = event->get();
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
} catch (...) {
result = PI_OUT_OF_RESOURCES;
}

return result;
}

@@ -2888,9 +2795,6 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
/// \return PI_SUCCESS on success. PI_INVALID_EVENT if given a user event.
pi_result cuda_piextEventGetNativeHandle(pi_event event,
pi_native_handle *nativeHandle) {
if (event->is_user_event()) {
return PI_INVALID_EVENT;
}
*nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
return PI_SUCCESS;
}
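
With the user-event and callback paths gone, every `_pi_event` wraps a native `CUevent` and the file above reduces to plain CUDA Driver API calls: `cuEventRecord` in `record()`, `cuEventSynchronize` in `wait()`, and `cuStreamWaitEvent` in `enqueueEventWait()`. The standalone sketch below shows that pattern in isolation, with the profiling pair created under `PI_QUEUE_PROFILING_ENABLE` approximated by an `evStart`/`evEnd` pair and `cuEventElapsedTime`. It is a minimal illustration under those assumptions, not plugin code, and error handling is reduced to a check macro.

```cpp
// Minimal CUDA Driver API sketch: record an event on one stream, make a second
// stream wait on it, then block the host until the event has completed.
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

#define CU_CHECK(call)                                          \
  do {                                                          \
    CUresult err_ = (call);                                     \
    if (err_ != CUDA_SUCCESS) {                                 \
      std::fprintf(stderr, "%s failed: %d\n", #call, (int)err_);\
      std::exit(1);                                             \
    }                                                           \
  } while (0)

int main() {
  CU_CHECK(cuInit(0));
  CUdevice dev;
  CU_CHECK(cuDeviceGet(&dev, 0));
  CUcontext ctx;
  CU_CHECK(cuCtxCreate(&ctx, 0, dev));

  CUstream producer, consumer;
  CU_CHECK(cuStreamCreate(&producer, CU_STREAM_NON_BLOCKING));
  CU_CHECK(cuStreamCreate(&consumer, CU_STREAM_NON_BLOCKING));

  CUdeviceptr buf;
  const size_t bytes = 1 << 20;
  CU_CHECK(cuMemAlloc(&buf, bytes));

  CUevent evStart, evEnd;
  CU_CHECK(cuEventCreate(&evStart, CU_EVENT_DEFAULT));
  CU_CHECK(cuEventCreate(&evEnd, CU_EVENT_DEFAULT));

  // "record()": capture the work submitted to the producer stream.
  CU_CHECK(cuEventRecord(evStart, producer));
  CU_CHECK(cuMemsetD8Async(buf, 0, bytes, producer));
  CU_CHECK(cuEventRecord(evEnd, producer));

  // "enqueueEventWait()": future work on the consumer stream waits for evEnd.
  CU_CHECK(cuStreamWaitEvent(consumer, evEnd, 0));
  CU_CHECK(cuMemsetD8Async(buf, 1, bytes, consumer));

  // "wait()": block the host until the recorded event has completed.
  CU_CHECK(cuEventSynchronize(evEnd));

  // Profiling-style query between the two recorded events.
  float ms = 0.0f;
  CU_CHECK(cuEventElapsedTime(&ms, evStart, evEnd));
  std::printf("producer memset took %.3f ms\n", ms);

  CU_CHECK(cuStreamSynchronize(consumer));
  CU_CHECK(cuEventDestroy(evStart));
  CU_CHECK(cuEventDestroy(evEnd));
  CU_CHECK(cuMemFree(buf));
  CU_CHECK(cuStreamDestroy(producer));
  CU_CHECK(cuStreamDestroy(consumer));
  CU_CHECK(cuCtxDestroy(ctx));
  return 0;
}
```
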
101 changes: 7 additions & 94 deletions sycl/plugins/cuda/pi_cuda.hpp
@@ -302,37 +302,6 @@ struct _pi_queue {

typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
void *userData);

class event_callback {
public:
void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {

auto validParameters = callback_ && event;

// As a pi_event_status value approaches 0, it gets closer to completion.
// If the calling pi_event's status is less than or equal to the event
// status the user is interested in, invoke the callback anyway. The event
// will have passed through that state anyway.
auto validStatus = currentEventStatus <= observedEventStatus_;

if (validParameters && validStatus) {

callback_(event, currentEventStatus, userData_);
}
}

event_callback(pi_event_status status, pfn_notify callback, void *userData)
: observedEventStatus_{status}, callback_{callback}, userData_{userData} {
}

pi_event_status get_status() const noexcept { return observedEventStatus_; }

private:
pi_event_status observedEventStatus_;
pfn_notify callback_;
void *userData_;
};

/// PI Event mapping to CUevent
///
class _pi_event {
@@ -347,41 +316,6 @@ class _pi_event {

native_type get() const noexcept { return evEnd_; };

pi_result set_event_complete() noexcept {

if (isCompleted_) {
return PI_INVALID_OPERATION;
}

isRecorded_ = true;
isCompleted_ = true;

trigger_callback(get_execution_status());

return PI_SUCCESS;
}

void trigger_callback(pi_int32 status) {

std::vector<event_callback> callbacks;

// Here we move all callbacks into local variable before we call them.
// This is a defensive maneuver; if any of the callbacks attempt to
// add additional callbacks, we will end up in a bad spot. Our mutex
// will be locked twice and the vector will be modified as it is being
// iterated over! By moving everything locally, we can call all of these
// callbacks and let them modify the original vector without much worry.

{
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.swap(callbacks);
}

for (auto &event_callback : callbacks) {
event_callback.trigger_callback(this, status);
}
}

pi_queue get_queue() const noexcept { return queue_; }

pi_command_type get_command_type() const noexcept { return commandType_; }
@@ -390,10 +324,10 @@ class _pi_event {

bool is_recorded() const noexcept { return isRecorded_; }

bool is_completed() const noexcept { return isCompleted_; }

bool is_started() const noexcept { return isStarted_; }

bool is_completed() const noexcept { return isCompleted_; };

pi_int32 get_execution_status() const noexcept {

if (!is_recorded()) {
@@ -406,24 +340,8 @@ class _pi_event {
return PI_EVENT_COMPLETE;
}

void set_event_callback(const event_callback &callback) {
auto current_status = get_execution_status();
if (current_status <= callback.get_status()) {
callback.trigger_callback(this, current_status);
} else {
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.emplace_back(callback);
}
}

pi_context get_context() const noexcept { return context_; };

bool is_user_event() const noexcept {
return get_command_type() == PI_COMMAND_TYPE_USER;
}

bool is_native_event() const noexcept { return !is_user_event(); }

pi_uint32 increment_reference_count() { return ++refCount_; }

pi_uint32 decrement_reference_count() { return --refCount_; }
@@ -462,13 +380,14 @@ class _pi_event {

std::atomic_uint32_t refCount_; // Event reference count.

std::atomic_bool isCompleted_; // Atomic bool used by user events. Can be
// used to wait for a user event's completion.
bool isCompleted_; // Signifies whether the operations have completed
//

bool isRecorded_; // Signifies whether a native CUDA event has been recorded
// yet.
bool isStarted_; // Signifies whether the operation associated with the
// PI event has started or not
bool isStarted_; // Signifies whether the operation associated with the
// PI event has started or not
//

native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
// event, this will be nullptr.
@@ -484,12 +403,6 @@ class _pi_event {
pi_context context_; // pi_context associated with the event. If this is a
// native event, this will be the same context associated
// with the queue_ member.

std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
// a lock-free data structure we can use here.
std::vector<event_callback>
event_callbacks_; // Callbacks that can be triggered when an event's state
// changes.
};

/// Implementation of PI Program on CUDA Module object
