Skip to content

Commit

Permalink
Leave only a single thread EventMonitor
Browse files Browse the repository at this point in the history
as opposed to using multiple threads for calblacks and event monitoring
  • Loading branch information
pvelesko committed Feb 20, 2024
1 parent fc4b654 commit 6e1382d
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 142 deletions.
6 changes: 2 additions & 4 deletions src/CHIPBackend.hh
Original file line number Diff line number Diff line change
Expand Up @@ -1773,8 +1773,7 @@ public:
*/
class Backend {
protected:
chipstar::EventMonitor *CallbackEventMonitor_ = nullptr;
chipstar::EventMonitor *StaleEventMonitor_ = nullptr;
chipstar::EventMonitor *EventMonitor_ = nullptr;

int MinQueuePriority_;
int MaxQueuePriority_ = 0;
Expand Down Expand Up @@ -2024,8 +2023,7 @@ public:
createCallbackData(hipStreamCallback_t Callback, void *UserData,
chipstar::Queue *ChipQ) = 0;

virtual chipstar::EventMonitor *createCallbackEventMonitor_() = 0;
virtual chipstar::EventMonitor *createStaleEventMonitor_() = 0;
virtual chipstar::EventMonitor *createEventMonitor_() = 0;

/* event interop */
virtual hipEvent_t getHipEvent(void *NativeEvent) = 0;
Expand Down
131 changes: 23 additions & 108 deletions src/backend/Level0/CHIPBackendLevel0.cc
Original file line number Diff line number Diff line change
Expand Up @@ -597,65 +597,13 @@ CHIPCallbackDataLevel0::CHIPCallbackDataLevel0(hipStreamCallback_t CallbackF,
// EventMonitorLevel0
// ***********************************************************************

void CHIPCallbackEventMonitorLevel0::monitor() {
// CHIPCallbackDataLevel0 *CbData;
// while (true) {
// usleep(200);
// LOCK(EventMonitorMtx); // chipstar::EventMonitor::Stop
// {

// if (Stop) {
// logTrace("CHIPCallbackEventMonitorLevel0 out of callbacks. Exiting "
// "thread");
// if (Backend->CallbackQueue.size())
// logError("Callback thread exiting while there are still active "
// "callbacks in the queue");
// pthread_exit(0);
// }

// LOCK(Backend->CallbackQueueMtx); // Backend::CallbackQueue

// if ((Backend->CallbackQueue.size() == 0))
// continue;

// // get the callback item
// CbData = (CHIPCallbackDataLevel0 *)Backend->CallbackQueue.front();

// // Lock the item and members
// assert(CbData);
// LOCK( // Backend::CallbackQueue
// CbData->CallbackDataMtx);
// Backend->CallbackQueue.pop();

// // Update Status
// logTrace("CHIPCallbackEventMonitorLevel0::monitor() checking event "
// "status for {}",
// static_cast<void *>(CbData->GpuReady.get()));
// CbData->GpuReady->updateFinishStatus(false);
// if (CbData->GpuReady->getEventStatus() != EVENT_STATUS_RECORDED) {
// // if not ready, push to the back
// Backend->CallbackQueue.push(CbData);
// continue;
// }
// }

// CbData->execute(hipSuccess);
// CbData->CpuCallbackComplete->hostSignal();
// CbData->GpuAck->wait();

// delete CbData;
// pthread_yield();
// }
}

void CHIPStaleEventMonitorLevel0::checkCallbacks() {
void CHIPEventMonitorLevel0::checkCallbacks() {
CHIPCallbackDataLevel0 *CbData;
// usleep(200);
LOCK(EventMonitorMtx); // chipstar::EventMonitor::Stop
{

if (Stop) {
logTrace("CHIPCallbackEventMonitorLevel0 out of callbacks. Exiting "
logTrace("checkCallbacks: out of callbacks. Exiting "
"thread");
if (Backend->CallbackQueue.size())
logError("Callback thread exiting while there are still active "
Expand All @@ -678,7 +626,7 @@ void CHIPStaleEventMonitorLevel0::checkCallbacks() {
Backend->CallbackQueue.pop();

// Update Status
logTrace("CHIPCallbackEventMonitorLevel0::monitor() checking event "
logTrace("checkCallbacks: checking event "
"status for {}",
static_cast<void *>(CbData->GpuReady.get()));
CbData->GpuReady->updateFinishStatus(false);
Expand All @@ -697,7 +645,7 @@ void CHIPStaleEventMonitorLevel0::checkCallbacks() {
pthread_yield();
}

void CHIPStaleEventMonitorLevel0::checkEvents() {
void CHIPEventMonitorLevel0::checkEvents() {
CHIPBackendLevel0 *BackendZe = static_cast<CHIPBackendLevel0 *>(Backend);
LOCK(Backend->EventsMtx); // Backend::Events
LOCK(BackendZe->CommandListsMtx); // CHIPBackendLevel0::EventCommandListMapk
Expand Down Expand Up @@ -732,7 +680,7 @@ void CHIPStaleEventMonitorLevel0::checkEvents() {
} // done collecting events to delete
}

void CHIPStaleEventMonitorLevel0::exitChecks() {
void CHIPEventMonitorLevel0::exitChecks() {
LOCK(EventMonitorMtx); // chipstar::EventMonitor::Stop
/**
* In the case that a user doesn't destroy all the
Expand All @@ -756,7 +704,7 @@ void CHIPStaleEventMonitorLevel0::exitChecks() {
pthread_exit(0);

if (EpasedTime > ChipEnvVars.getL0CollectEventsTimeout()) {
logError("CHIPStaleEventMonitorLevel0 stop was called but not all events "
logError("CHIPEventMonitorLevel0 stop was called but not all events "
"have been cleared. Timeout of {} seconds has been reached.",
ChipEnvVars.getL0CollectEventsTimeout());
size_t MaxPrintEntries = std::min(Backend->Events.size(), size_t(10));
Expand All @@ -773,15 +721,15 @@ void CHIPStaleEventMonitorLevel0::exitChecks() {
// print only once a second to avoid saturating stdout with logs
if (CurrTime - LastPrint_ >= 1) {
LastPrint_ = CurrTime;
logDebug("CHIPStaleEventMonitorLevel0 stop was called but not all "
logDebug("CHIPEventMonitorLevel0 stop was called but not all "
"events have been cleared. Timeout of {} seconds has not "
"been reached yet. Elapsed time: {} seconds",
ChipEnvVars.getL0CollectEventsTimeout(), EpasedTime);
}
}
}

void CHIPStaleEventMonitorLevel0::monitor() {
void CHIPEventMonitorLevel0::monitor() {

// Stop is false and I have more events
while (true) {
Expand Down Expand Up @@ -865,14 +813,13 @@ CHIPQueueLevel0::~CHIPQueueLevel0() {
finish(); // must finish the queue because it's possible that that there are
// outstanding operations which have an associated
// chipstar::Event. If we do not finish we risk the chance of
// StaleEventMonitor of deadlocking while waiting for queue
// EventMonitor of deadlocking while waiting for queue
// completion and subsequent event status change
}
updateLastEvent(
nullptr); // Just in case that unique_ptr destructor calls this, the
// generic ~Queue() (which calls updateLastEvent(nullptr))
// hasn't been called yet, and the stale event monitor ends up
// waiting forever.
updateLastEvent(nullptr); // Just in case that unique_ptr destructor calls
// this, the generic ~Queue() (which calls
// updateLastEvent(nullptr)) hasn't been called yet,
// and the event monitor ends up waiting forever.

// The application must not call this function from
// simultaneous threads with the same command queue handle.
Expand Down Expand Up @@ -1746,7 +1693,8 @@ CHIPBackendLevel0::createEventShared(chipstar::Context *ChipCtx,
assert(Event && "LZEventPool returned a null event");

std::static_pointer_cast<CHIPEventLevel0>(Event)->reset();
assert(!std::static_pointer_cast<CHIPEventLevel0>(Event)->getAssignedCmdList());
assert(
!std::static_pointer_cast<CHIPEventLevel0>(Event)->getAssignedCmdList());
logDebug("CHIPBackendLevel0::createEventShared: Context {} Event {}",
(void *)ChipCtx, (void *)Event.get());
Event->isDeletedSanityCheck();
Expand All @@ -1764,7 +1712,7 @@ chipstar::Event *CHIPBackendLevel0::createEvent(chipstar::Context *ChipCtx,

void CHIPBackendLevel0::uninitialize() {
/**
* Stale chipstar::Event Monitor expects to collect all events. To do this,
* chipstar::Event Monitor expects to collect all events. To do this,
* all events must reach the refcount of 0. At this point, all queues should
* have their LastEvent as nullptr but in case a user didn't sync and destroy
* a user-created stream, such stream might not have its LastEvent as nullptr.
Expand All @@ -1775,38 +1723,12 @@ void CHIPBackendLevel0::uninitialize() {
logTrace("Backend::uninitialize(): Setting the LastEvent to null for all "
"user-created queues");

if (CallbackEventMonitor_) {
logTrace("Backend::uninitialize(): Killing CallbackEventMonitor");
LOCK(
CallbackEventMonitor_->EventMonitorMtx); // chipstar::EventMonitor::Stop
CallbackEventMonitor_->Stop = true;
}
CallbackEventMonitor_->join();

{
logTrace("Backend::uninitialize(): Killing StaleEventMonitor");
LOCK(StaleEventMonitor_->EventMonitorMtx); // chipstar::EventMonitor::Stop
StaleEventMonitor_->Stop = true;
logTrace("Backend::uninitialize(): Killing EventMonitor");
LOCK(EventMonitor_->EventMonitorMtx); // chipstar::EventMonitor::Stop
EventMonitor_->Stop = true;
}
StaleEventMonitor_->join();

// if (Backend->Events.size()) {
// logTrace("Remaining {} events that haven't been collected:",
// Backend->Events.size());
// for (auto *E : Backend->Events) {
// logTrace("{} status= {} refc={}", E->Msg, E->getEventStatusStr(),
// E->getCHIPRefc());
// if (!E->isUserEvent()) {
// // A strong indicator that we are missing decreaseRefCount() call
// // for events which are solely managed by the chipStar.
// assert(!(E->isFinished() && E->getCHIPRefc() > 0) &&
// "Missed decreaseRefCount()?");
// assert(E->isFinished() && "Uncollected non-user events!");
// }
// }
// logTrace("Remaining {} command lists that haven't been collected:",
// static_cast<CHIPBackendLevel0*>(Backend)->EventCommandListMap.size());
// }
EventMonitor_->join();
return;
}

Expand Down Expand Up @@ -1924,11 +1846,7 @@ void CHIPBackendLevel0::initializeImpl() {
ChipL0Ctx->setDevice(ChipL0Dev);
}

StaleEventMonitor_ =
(CHIPStaleEventMonitorLevel0 *)::Backend->createStaleEventMonitor_();
CallbackEventMonitor_ = (CHIPCallbackEventMonitorLevel0 *)::Backend
->createCallbackEventMonitor_();

EventMonitor_ = (CHIPEventMonitorLevel0 *)::Backend->createEventMonitor_();
// Run these lasts, as they may depend on the device properties being
// populated
setUseImmCmdLists(DeviceName);
Expand All @@ -1952,13 +1870,10 @@ void CHIPBackendLevel0::initializeFromNative(const uintptr_t *NativeHandles,
CHIPDeviceLevel0 *ChipDev = CHIPDeviceLevel0::create(Dev, ChipCtx, 0);
ChipCtx->setDevice(ChipDev);

LOCK(::Backend->BackendMtx); // CHIPBackendLevel0::StaleEventMonitor
LOCK(::Backend->BackendMtx); // CHIPBackendLevel0::EventMonitor
ChipDev->LegacyDefaultQueue = ChipDev->createQueue(NativeHandles, NumHandles);

StaleEventMonitor_ =
(CHIPStaleEventMonitorLevel0 *)::Backend->createStaleEventMonitor_();
CallbackEventMonitor_ = (CHIPCallbackEventMonitorLevel0 *)::Backend
->createCallbackEventMonitor_();
EventMonitor_ = (CHIPEventMonitorLevel0 *)::Backend->createEventMonitor_();
setActiveDevice(ChipDev);
}

Expand Down
27 changes: 6 additions & 21 deletions src/backend/Level0/CHIPBackendLevel0.hh
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public:
* @param CmdList command list to Assign with this event
*/
void assignCmdList(CHIPContextLevel0 *ChipContext,
ze_command_list_handle_t CmdList);
ze_command_list_handle_t CmdList);

/**
* @brief Reset and then return the command list handle back to the context
Expand Down Expand Up @@ -167,16 +167,7 @@ public:
virtual ~CHIPCallbackDataLevel0() override {}
};

class CHIPCallbackEventMonitorLevel0 : public chipstar::EventMonitor {
public:
~CHIPCallbackEventMonitorLevel0() {
logTrace("CHIPCallbackEventMonitorLevel0 DEST");
join();
};
virtual void monitor() override;
};

class CHIPStaleEventMonitorLevel0 : public chipstar::EventMonitor {
class CHIPEventMonitorLevel0 : public chipstar::EventMonitor {
// variable for storing the how much time has passed since trying to exit
// the monitor loop
int TimeSinceStopRequested_ = 0;
Expand All @@ -196,8 +187,8 @@ class CHIPStaleEventMonitorLevel0 : public chipstar::EventMonitor {
void checkCallbacks();

public:
~CHIPStaleEventMonitorLevel0() {
logTrace("CHIPStaleEventMonitorLevel0 DEST");
~CHIPEventMonitorLevel0() {
logTrace("CHIPEventMonitorLevel0 DEST");
join();
};
virtual void monitor() override;
Expand Down Expand Up @@ -688,14 +679,8 @@ public:
return new CHIPCallbackDataLevel0(Callback, UserData, ChipQueue);
}

virtual chipstar::EventMonitor *createCallbackEventMonitor_() override {
auto Evm = new CHIPCallbackEventMonitorLevel0();
Evm->start();
return Evm;
}

virtual chipstar::EventMonitor *createStaleEventMonitor_() override {
auto Evm = new CHIPStaleEventMonitorLevel0();
virtual chipstar::EventMonitor *createEventMonitor_() override {
auto Evm = new CHIPEventMonitorLevel0();
Evm->start();
return Evm;
}
Expand Down
8 changes: 1 addition & 7 deletions src/backend/OpenCL/CHIPBackendOpenCL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1628,13 +1628,7 @@ chipstar::CallbackData *CHIPBackendOpenCL::createCallbackData(
UNIMPLEMENTED(nullptr);
}

chipstar::EventMonitor *CHIPBackendOpenCL::createCallbackEventMonitor_() {
auto Evm = new EventMonitorOpenCL();
Evm->start();
return Evm;
}

chipstar::EventMonitor *CHIPBackendOpenCL::createStaleEventMonitor_() {
chipstar::EventMonitor *CHIPBackendOpenCL::createEventMonitor_() {
UNIMPLEMENTED(nullptr);
}

Expand Down
3 changes: 1 addition & 2 deletions src/backend/OpenCL/CHIPBackendOpenCL.hh
Original file line number Diff line number Diff line change
Expand Up @@ -405,8 +405,7 @@ public:
virtual chipstar::CallbackData *
createCallbackData(hipStreamCallback_t Callback, void *UserData,
chipstar::Queue *ChipQueue) override;
virtual chipstar::EventMonitor *createCallbackEventMonitor_() override;
virtual chipstar::EventMonitor *createStaleEventMonitor_() override;
virtual chipstar::EventMonitor *createEventMonitor_() override;

virtual hipEvent_t getHipEvent(void *NativeEvent) override;
virtual void *getNativeEvent(hipEvent_t HipEvent) override;
Expand Down

0 comments on commit 6e1382d

Please sign in to comment.