[SYCL][CUDA] Remove pi Event Callback implementation
Since the introduction of host tasks in #1471,
`piEventCallback` and related functionality are no longer
required by the SYCL-RT.
Removing the implementation of this behaviour from the CUDA
backend simplifies the submission of operations to streams and
improves overall performance.

Signed-off-by: Ruyman Reyes <[email protected]>
Ruyk committed May 20, 2020
1 parent ae3fd5c commit 1ef4cd6
Showing 2 changed files with 36 additions and 219 deletions.
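
The commit message points to host tasks (#1471) as the reason `piEventCallback` is no longer needed: "run this host code once an event completes" can be expressed at the SYCL level as a host task with a dependency. The sketch below illustrates that pattern; it assumes the SYCL 2020 `host_task` interface (the host-task work referenced in #1471 still used an experimental spelling at the time) and is an illustration only, not code from this change.

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;

  // Device work whose completion we want to observe from the host.
  sycl::event e = q.submit([&](sycl::handler &cgh) {
    cgh.single_task([] { /* device work */ });
  });

  // Instead of registering an event callback with the plugin, enqueue a
  // host task that depends on the event; the runtime runs it on the host
  // once `e` has completed.
  q.submit([&](sycl::handler &cgh) {
    cgh.depends_on(e);
    cgh.host_task([] { std::cout << "device work finished\n"; });
  });

  q.wait();
  return 0;
}
```

Because the runtime can schedule such host work itself, the CUDA plugin no longer has to attach host functions to streams on the runtime's behalf, which is what the diff below removes.
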
154 changes: 29 additions & 125 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -276,13 +276,15 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
queue_{queue}, context_{context} {

if (is_native_event()) {
if (type != PI_COMMAND_TYPE_USER) {
PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));

if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
}
} else {
cl::sycl::detail::pi::die("User-defined events not implemented");
}

if (queue_ != nullptr) {
@@ -303,7 +305,7 @@ pi_result _pi_event::start() {
pi_result result;

try {
if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
// NOTE: This relies on the default stream to be unused.
result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
@@ -313,8 +315,6 @@ pi_result _pi_event::start() {
}

isStarted_ = true;
// let observers know that the event is "submitted"
trigger_callback(get_execution_status());
return result;
}

@@ -351,37 +351,16 @@ pi_result _pi_event::record() {

pi_result result = PI_INVALID_OPERATION;

if (is_native_event()) {

if (!queue_) {
return PI_INVALID_QUEUE;
}
if (!queue_) {
return PI_INVALID_QUEUE;
}

CUstream cuStream = queue_->get();
CUstream cuStream = queue_->get();

try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));

result = cuda_piEventRetain(this);
try {
result = PI_CHECK_ERROR(cuLaunchHostFunc(
cuStream,
[](void *userData) {
pi_event event = reinterpret_cast<pi_event>(userData);
event->set_event_complete();
cuda_piEventRelease(event);
},
this));
} catch (...) {
// If host function fails to enqueue we must release the event here
result = cuda_piEventRelease(this);
throw;
}
} catch (pi_result error) {
result = error;
}
} else {
result = PI_SUCCESS;
try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
} catch (pi_result error) {
result = error;
}

if (result == PI_SUCCESS) {
@@ -392,65 +371,23 @@ pi_result _pi_event::record() {
}

pi_result _pi_event::wait() {

pi_result retErr;
if (is_native_event()) {
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}
} else {

while (!is_completed()) {
// wait for user event to complete
}
retErr = PI_SUCCESS;
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}

auto is_success = retErr == PI_SUCCESS;
auto status = is_success ? get_execution_status() : pi_int32(retErr);

trigger_callback(status);

return retErr;
}

// makes all future work submitted to queue wait for all work captured in event.
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
if (event->is_native_event()) {

// for native events, the cuStreamWaitEvent call is used.
// This makes all future work submitted to stream wait for all
// work captured in event.

return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));

} else {

// for user events, we enqueue a callback. When invoked, the
// callback will block until the user event is marked as
// completed.

static auto user_wait_func = [](void *user_data) {
// The host function must not make any CUDA API calls.
auto event = static_cast<pi_event>(user_data);

// busy wait for user event to complete
event->wait();

// this function does not need the event to be kept alive
// anymore
cuda_piEventRelease(event);
};

// retain event to ensure it is still alive when the
// user_wait_func callback is invoked
cuda_piEventRetain(event);

return PI_CHECK_ERROR(cuLaunchHostFunc(queue->get(), user_wait_func, event));
}
// for native events, the cuStreamWaitEvent call is used.
// This makes all future work submitted to stream wait for all
// work captured in event.
return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
}

_pi_program::_pi_program(pi_context ctxt)
@@ -2763,37 +2700,13 @@ pi_result cuda_piEventSetCallback(pi_event event,
pi_int32 command_exec_callback_type,
pfn_notify notify, void *user_data) {

assert(event);
assert(notify);
assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
command_exec_callback_type == PI_EVENT_RUNNING ||
command_exec_callback_type == PI_EVENT_COMPLETE);
event_callback callback(pi_event_status(command_exec_callback_type), notify,
user_data);

event->set_event_callback(callback);

cl::sycl::detail::pi::die("Event Callback not implemented");
return PI_SUCCESS;
}

pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {

assert(execution_status >= PI_EVENT_COMPLETE &&
execution_status <= PI_EVENT_QUEUED);

if (!event || event->is_native_event()) {
return PI_INVALID_EVENT;
}

if (execution_status == PI_EVENT_COMPLETE) {
return event->set_event_complete();
} else if (execution_status < 0) {
// TODO: A negative integer value causes all enqueued commands that wait
// on this user event to be terminated.
cl::sycl::detail::pi::die("cuda_piEventSetStatus support for negative execution_status not "
"implemented.");
}

cl::sycl::detail::pi::die("Event Set Status not implemented");
return PI_INVALID_VALUE;
}

@@ -2821,19 +2734,13 @@ pi_result cuda_piEventRelease(pi_event event) {
if (event->decrement_reference_count() == 0) {
std::unique_ptr<_pi_event> event_ptr{event};
pi_result result = PI_INVALID_EVENT;

if (event->is_native_event()) {
try {
ScopedContext active(event->get_context());
auto cuEvent = event->get();
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
} catch (...) {
result = PI_OUT_OF_RESOURCES;
}
} else {
result = PI_SUCCESS;
try {
ScopedContext active(event->get_context());
auto cuEvent = event->get();
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
} catch (...) {
result = PI_OUT_OF_RESOURCES;
}

return result;
}

@@ -2888,9 +2795,6 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
/// \return PI_SUCCESS on success. PI_INVALID_EVENT if given a user event.
pi_result cuda_piextEventGetNativeHandle(pi_event event,
pi_native_handle *nativeHandle) {
if (event->is_user_event()) {
return PI_INVALID_EVENT;
}
*nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
return PI_SUCCESS;
}
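
With the user-event and callback paths gone, every `_pi_event` wraps a native `CUevent` and the file above reduces to plain CUDA Driver API calls: `cuEventRecord` in `record()`, `cuEventSynchronize` in `wait()`, and `cuStreamWaitEvent` in `enqueueEventWait()`. The standalone sketch below shows that pattern in isolation, with the profiling pair created under `PI_QUEUE_PROFILING_ENABLE` approximated by an `evStart`/`evEnd` pair and `cuEventElapsedTime`. It is a minimal illustration under those assumptions, not plugin code, and error handling is reduced to a check macro.

```cpp
// Minimal CUDA Driver API sketch: record an event on one stream, make a second
// stream wait on it, then block the host until the event has completed.
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

#define CU_CHECK(call)                                          \
  do {                                                          \
    CUresult err_ = (call);                                     \
    if (err_ != CUDA_SUCCESS) {                                 \
      std::fprintf(stderr, "%s failed: %d\n", #call, (int)err_);\
      std::exit(1);                                             \
    }                                                           \
  } while (0)

int main() {
  CU_CHECK(cuInit(0));
  CUdevice dev;
  CU_CHECK(cuDeviceGet(&dev, 0));
  CUcontext ctx;
  CU_CHECK(cuCtxCreate(&ctx, 0, dev));

  CUstream producer, consumer;
  CU_CHECK(cuStreamCreate(&producer, CU_STREAM_NON_BLOCKING));
  CU_CHECK(cuStreamCreate(&consumer, CU_STREAM_NON_BLOCKING));

  CUdeviceptr buf;
  const size_t bytes = 1 << 20;
  CU_CHECK(cuMemAlloc(&buf, bytes));

  CUevent evStart, evEnd;
  CU_CHECK(cuEventCreate(&evStart, CU_EVENT_DEFAULT));
  CU_CHECK(cuEventCreate(&evEnd, CU_EVENT_DEFAULT));

  // "record()": capture the work submitted to the producer stream.
  CU_CHECK(cuEventRecord(evStart, producer));
  CU_CHECK(cuMemsetD8Async(buf, 0, bytes, producer));
  CU_CHECK(cuEventRecord(evEnd, producer));

  // "enqueueEventWait()": future work on the consumer stream waits for evEnd.
  CU_CHECK(cuStreamWaitEvent(consumer, evEnd, 0));
  CU_CHECK(cuMemsetD8Async(buf, 1, bytes, consumer));

  // "wait()": block the host until the recorded event has completed.
  CU_CHECK(cuEventSynchronize(evEnd));

  // Profiling-style query between the two recorded events.
  float ms = 0.0f;
  CU_CHECK(cuEventElapsedTime(&ms, evStart, evEnd));
  std::printf("producer memset took %.3f ms\n", ms);

  CU_CHECK(cuStreamSynchronize(consumer));
  CU_CHECK(cuEventDestroy(evStart));
  CU_CHECK(cuEventDestroy(evEnd));
  CU_CHECK(cuMemFree(buf));
  CU_CHECK(cuStreamDestroy(producer));
  CU_CHECK(cuStreamDestroy(consumer));
  CU_CHECK(cuCtxDestroy(ctx));
  return 0;
}
```
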
101 changes: 7 additions & 94 deletions sycl/plugins/cuda/pi_cuda.hpp
@@ -302,37 +302,6 @@ struct _pi_queue {

typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
void *userData);

class event_callback {
public:
void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {

auto validParameters = callback_ && event;

// As a pi_event_status value approaches 0, it gets closer to completion.
// If the calling pi_event's status is less than or equal to the event
// status the user is interested in, invoke the callback anyway. The event
// will have passed through that state anyway.
auto validStatus = currentEventStatus <= observedEventStatus_;

if (validParameters && validStatus) {

callback_(event, currentEventStatus, userData_);
}
}

event_callback(pi_event_status status, pfn_notify callback, void *userData)
: observedEventStatus_{status}, callback_{callback}, userData_{userData} {
}

pi_event_status get_status() const noexcept { return observedEventStatus_; }

private:
pi_event_status observedEventStatus_;
pfn_notify callback_;
void *userData_;
};

/// PI Event mapping to CUevent
///
class _pi_event {
@@ -347,41 +316,6 @@ class _pi_event {

native_type get() const noexcept { return evEnd_; };

pi_result set_event_complete() noexcept {

if (isCompleted_) {
return PI_INVALID_OPERATION;
}

isRecorded_ = true;
isCompleted_ = true;

trigger_callback(get_execution_status());

return PI_SUCCESS;
}

void trigger_callback(pi_int32 status) {

std::vector<event_callback> callbacks;

// Here we move all callbacks into local variable before we call them.
// This is a defensive maneuver; if any of the callbacks attempt to
// add additional callbacks, we will end up in a bad spot. Our mutex
// will be locked twice and the vector will be modified as it is being
// iterated over! By moving everything locally, we can call all of these
// callbacks and let them modify the original vector without much worry.

{
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.swap(callbacks);
}

for (auto &event_callback : callbacks) {
event_callback.trigger_callback(this, status);
}
}

pi_queue get_queue() const noexcept { return queue_; }

pi_command_type get_command_type() const noexcept { return commandType_; }
@@ -390,10 +324,10 @@ class _pi_event {

bool is_recorded() const noexcept { return isRecorded_; }

bool is_completed() const noexcept { return isCompleted_; }

bool is_started() const noexcept { return isStarted_; }

bool is_completed() const noexcept { return isCompleted_; };

pi_int32 get_execution_status() const noexcept {

if (!is_recorded()) {
@@ -406,24 +340,8 @@ class _pi_event {
return PI_EVENT_COMPLETE;
}

void set_event_callback(const event_callback &callback) {
auto current_status = get_execution_status();
if (current_status <= callback.get_status()) {
callback.trigger_callback(this, current_status);
} else {
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.emplace_back(callback);
}
}

pi_context get_context() const noexcept { return context_; };

bool is_user_event() const noexcept {
return get_command_type() == PI_COMMAND_TYPE_USER;
}

bool is_native_event() const noexcept { return !is_user_event(); }

pi_uint32 increment_reference_count() { return ++refCount_; }

pi_uint32 decrement_reference_count() { return --refCount_; }
@@ -462,13 +380,14 @@ class _pi_event {

std::atomic_uint32_t refCount_; // Event reference count.

std::atomic_bool isCompleted_; // Atomic bool used by user events. Can be
// used to wait for a user event's completion.
bool isCompleted_; // Signifies whether the operations have completed
//

bool isRecorded_; // Signifies whether a native CUDA event has been recorded
// yet.
bool isStarted_; // Signifies whether the operation associated with the
// PI event has started or not
bool isStarted_; // Signifies whether the operation associated with the
// PI event has started or not
//

native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
// event, this will be nullptr.
@@ -484,12 +403,6 @@ class _pi_event {
pi_context context_; // pi_context associated with the event. If this is a
// native event, this will be the same context associated
// with the queue_ member.

std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
// a lock-free data structure we can use here.
std::vector<event_callback>
event_callbacks_; // Callbacks that can be triggered when an event's state
// changes.
};

/// Implementation of PI Program on CUDA Module object
