Skip to content

Commit

Permalink
Merge pull request #774 from CHIP-SPV/various-improvements
Browse files Browse the repository at this point in the history
Various improvements
  • Loading branch information
pvelesko authored Feb 20, 2024
2 parents a8e0554 + 4b3cb0b commit c6c22d2
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 40 deletions.
2 changes: 1 addition & 1 deletion HIP
1 change: 1 addition & 0 deletions samples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ set(SAMPLES
11_device
hipStreamSemantics
hipKernelLaunchIsNonBlocking
hipAddCallback
hipMultiThreadAddCallback
hipInfo
hipSymbol
Expand Down
2 changes: 2 additions & 0 deletions samples/hipAddCallback/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

add_chip_test(hipAddCallback hipAddCallback PASSED hipAddCallback.cc)
140 changes: 140 additions & 0 deletions samples/hipAddCallback/hipAddCallback.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022 Paulius Velesko <[email protected]>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/**
Testcase Scenario :
Validate behaviour of HIP when multiple hipStreaAddCallback() are called over
multiple Threads.
*/

//#include <hip_test_common.hh>
#include <atomic>
#include <iostream>
#include "hip/hip_runtime.h"

#ifndef NDEBUG
#define HIPCHECK(x) assert(x == hipSuccess)
#define HIP_CHECK(x) assert(x == hipSuccess)
#else
#define HIPCHECK(x) x
#define HIP_CHECK(x) x
#endif

static constexpr size_t N = 4096;
static constexpr int numThreads = 1;
static std::atomic<int> Cb_count{0}, Data_mismatch{0};
static hipStream_t mystream;
static float *A1_h, *C1_h;

#if HT_AMD
#define HIPRT_CB
#endif
// TODO
#define HIPRT_CB

static __global__ void device_function(float *C_d, float *A_d, size_t Num) {
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;

for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}

// TODO Currently makes OpenCL Fail ?
// Delay thread 1 only in the GPU
// if (gputhread == 1) {
// uint64_t wait_t = 3200000000, start = clock64(), cur;
// do {
// cur = clock64() - start;
// } while (cur < wait_t);
//}
}

static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
void *userData) {
assert(stream == mystream);
assert(userData == nullptr);
HIPCHECK(status);

for (size_t i = 0; i < N; i++) {
// Validate the data and update Data_mismatch
if (C1_h[i] != A1_h[i] * A1_h[i]) {
Data_mismatch++;
}
}

// Increment the Cb_count to indicate that the callback is processed.
++Cb_count;
}

/**
Test multiple hipStreamAddCallback() called over
multiple Threads.
*/
int main() {
std::cout << "START\n";
float *A_d, *C_d;
size_t Nbytes = (N) * sizeof(float);
constexpr float Phi = 1.618f;

A1_h = reinterpret_cast<float *>(malloc(Nbytes));
assert(A1_h != nullptr);
C1_h = reinterpret_cast<float *>(malloc(Nbytes));
assert(C1_h != nullptr);

// Fill with Phi + i
for (size_t i = 0; i < N; i++) {
A1_h[i] = Phi + i;
}

HIP_CHECK(hipMalloc(&A_d, Nbytes));
HIP_CHECK(hipMalloc(&C_d, Nbytes));

HIP_CHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));

HIP_CHECK(hipMemcpyAsync(A_d, A1_h, Nbytes, hipMemcpyHostToDevice, mystream));

constexpr unsigned threadsPerBlock = 256;
constexpr unsigned blocks = (N + 255) / threadsPerBlock;

hipLaunchKernelGGL((device_function), dim3(blocks), dim3(threadsPerBlock), 0,
mystream, C_d, A_d, N);
HIP_CHECK(hipMemcpyAsync(C1_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));

HIPCHECK(hipStreamAddCallback(mystream, Thread1_Callback, nullptr, 0));

HIP_CHECK(hipStreamSynchronize(mystream));
HIP_CHECK(hipStreamDestroy(mystream));

HIP_CHECK(hipFree(A_d));
HIP_CHECK(hipFree(C_d));

free(A1_h);
free(C1_h);

// Cb_count should match total number of callbacks added from both threads
// Data_mismatch will be updated if there is problem in data validation
assert(Cb_count.load() == numThreads);
assert(Data_mismatch.load() == 0);
std::cout << "PASSED\n";
}
11 changes: 11 additions & 0 deletions src/CHIPBackend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1465,6 +1465,17 @@ chipstar::Queue::~Queue() {
this->ChipDevice_->NumQueuesAlive.fetch_sub(1, std::memory_order_relaxed);
};

void chipstar::Queue::updateLastEvent(
const std::shared_ptr<chipstar::Event> &NewEvent) {
LOCK(LastEventMtx); // CHIPQueue::LastEvent_
logDebug("Setting LastEvent for {} {} -> {}", (void *)this,
(void *)LastEvent_.get(), (void *)NewEvent.get());
if (NewEvent == LastEvent_) // TODO: should I compare NewEvent.get()
return;

LastEvent_ = NewEvent;
}

std::vector<std::shared_ptr<chipstar::Event>>
chipstar::Queue::getSyncQueuesLastEvents() {
auto Dev = ::Backend->getActiveDevice();
Expand Down
10 changes: 1 addition & 9 deletions src/CHIPBackend.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2153,15 +2153,7 @@ public:

chipstar::QueueFlags getQueueFlags() { return QueueFlags_; }
virtual void
updateLastEvent(const std::shared_ptr<chipstar::Event> &NewEvent) {
LOCK(LastEventMtx); // CHIPQueue::LastEvent_
logDebug("Setting LastEvent for {} {} -> {}", (void *)this,
(void *)LastEvent_.get(), (void *)NewEvent.get());
if (NewEvent == LastEvent_) // TODO: should I compare NewEvent.get()
return;

LastEvent_ = NewEvent;
}
updateLastEvent(const std::shared_ptr<chipstar::Event> &NewEvent);

/**
* @brief Blocking memory copy
Expand Down
11 changes: 11 additions & 0 deletions src/CHIPDriver.hh
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ private:
bool SkipUninit_ = false;
std::string JitFlags_ = CHIP_DEFAULT_JIT_FLAGS;
bool L0ImmCmdLists_ = true;
unsigned long L0EventTimeout_ = 0;
int L0CollectEventsTimeout_ = 0;

public:
Expand All @@ -247,6 +248,12 @@ public:
const std::string &getJitFlags() const { return JitFlags_; }
bool getL0ImmCmdLists() const { return L0ImmCmdLists_; }
int getL0CollectEventsTimeout() const { return L0CollectEventsTimeout_; }
unsigned long getL0EventTimeout() const {
if (L0EventTimeout_ == 0)
return UINT64_MAX;

return L0EventTimeout_ * 1e9;
}

private:
void parseEnvironmentVariables() {
Expand Down Expand Up @@ -274,6 +281,9 @@ private:

if (!readEnvVar("CHIP_L0_COLLECT_EVENTS_TIMEOUT").empty())
L0CollectEventsTimeout_ = parseInt("CHIP_L0_COLLECT_EVENTS_TIMEOUT");

if (!readEnvVar("CHIP_L0_EVENT_TIMEOUT").empty())
L0EventTimeout_ = parseInt("CHIP_L0_EVENT_TIMEOUT");
}

std::string_view parseJitFlags(const std::string &StrIn) {
Expand Down Expand Up @@ -313,6 +323,7 @@ private:
logDebug("CHIP_JIT_FLAGS_OVERRIDE={}", JitFlags_);
logDebug("CHIP_L0_IMM_CMD_LISTS={}", L0ImmCmdLists_ ? "on" : "off");
logDebug("CHIP_L0_COLLECT_EVENTS_TIMEOUT={}", L0CollectEventsTimeout_);
logDebug("CHIP_L0_EVENT_TIMEOUT={}", L0EventTimeout_);
logDebug("CHIP_SKIP_UNINIT={}", SkipUninit_ ? "on" : "off");
}
};
Expand Down
11 changes: 11 additions & 0 deletions src/CHIPException.hh
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@ public:
} \
} while (0)

#define CHIPERR_CHECK_LOG_AND_ABORT(status, success, errtype, ...) \
do { \
if (status != success) { \
std::string error_msg = std::string(resultToString(status)); \
std::string custom_msg = std::string(__VA_ARGS__); \
std::string msg_ = error_msg + " " + custom_msg; \
std::cout << msg_ << std::endl; \
std::abort(); \
} \
} while (0)

#define CHIP_TRY try {
#define CHIP_CATCH \
} \
Expand Down
67 changes: 41 additions & 26 deletions src/backend/Level0/CHIPBackendLevel0.cc
Original file line number Diff line number Diff line change
Expand Up @@ -216,20 +216,20 @@ createSampler(CHIPDeviceLevel0 *ChipDev, const hipResourceDesc *PResDesc,
// CHIPEventLevel0
// ***********************************************************************

void CHIPEventLevel0::associateCmdList(CHIPContextLevel0 *ChipContext,
ze_command_list_handle_t CmdList) {
logTrace("CHIPEventLevel0({})::associateCmdList({})", (void *)this,
void CHIPEventLevel0::assignCmdList(CHIPContextLevel0 *ChipContext,
ze_command_list_handle_t CmdList) {
logTrace("CHIPEventLevel0({})::assignCmdList({})", (void *)this,
(void *)CmdList);
assert(AssocCmdList_ == nullptr && "command list already associated!");
assert(AssocContext_ == nullptr && "queue already associated!");
assert(AssocCmdList_ == nullptr && "command list already assigned!");
assert(AssocContext_ == nullptr && "queue already assigned!");
AssocCmdList_ = CmdList;
AssocContext_ = ChipContext;
}

void CHIPEventLevel0::disassociateCmdList() {
assert(AssocCmdList_ != nullptr && "command list not associated!");
assert(AssocContext_ != nullptr && "queue not associated!");
logTrace("CHIPEventLevel0({})::disassociateCmdList({})", (void *)this,
void CHIPEventLevel0::unassignCmdList() {
assert(AssocCmdList_ != nullptr && "command list not assigned!");
assert(AssocContext_ != nullptr && "queue not assigned!");
logTrace("CHIPEventLevel0({})::unassignCmdList({})", (void *)this,
(void *)AssocCmdList_);
auto Status = zeCommandListReset(AssocCmdList_);
CHIPERR_CHECK_LOG_AND_THROW(Status, ZE_RESULT_SUCCESS, hipErrorTbd);
Expand Down Expand Up @@ -417,10 +417,20 @@ void CHIPQueueLevel0::recordEvent(chipstar::Event *ChipEvent) {

bool CHIPEventLevel0::wait() {
assert(!Deleted_ && "chipstar::Event use after delete!");
logTrace("CHIPEventLevel0::wait() {} msg={}", (void *)this, Msg);
logTrace("CHIPEventLevel0::wait(timeout: {}) {} Msg: {} Handle: {}",
ChipEnvVars.getL0EventTimeout(), (void *)this, Msg, (void *)Event_);

ze_result_t Status = zeEventHostSynchronize(Event_, UINT64_MAX);
CHIPERR_CHECK_LOG_AND_THROW(Status, ZE_RESULT_SUCCESS, hipErrorTbd);
ze_result_t Status =
zeEventHostSynchronize(Event_, ChipEnvVars.getL0EventTimeout());
if (Status == ZE_RESULT_NOT_READY) {
logError("CHIPEventLevel0::wait() {} Msg {} handle {} timed out after {} "
"seconds.\n"
"Aborting now... segfaults, illegal instructions and other "
"undefined behavior may follow.",
(void *)this, Msg, (void *)Event_,
ChipEnvVars.getL0EventTimeout() / 1e9);
std::abort();
}

LOCK(EventMtx); // chipstar::Event::EventStatus_
EventStatus_ = EVENT_STATUS_RECORDED;
Expand Down Expand Up @@ -649,7 +659,7 @@ void CHIPStaleEventMonitorLevel0::checkEvents() {
ChipEventLz->releaseDependencies();
Backend->Events.erase(Backend->Events.begin() + EventIdx);
if (ChipEventLz->getAssocCmdList())
ChipEventLz->disassociateCmdList();
ChipEventLz->unassignCmdList();
ChipEventLz->doActions();
}

Expand Down Expand Up @@ -1501,10 +1511,8 @@ void CHIPQueueLevel0::finish() {
if (ChipEnvVars.getL0ImmCmdLists()) {
auto Event = getLastEvent();
auto EventLZ = std::static_pointer_cast<CHIPEventLevel0>(Event);
if (EventLZ) {
auto EventHandle = EventLZ->peek();
zeEventHostSynchronize(EventHandle, UINT64_MAX);
}
if (EventLZ)
EventLZ->wait();
} else {
zeCommandQueueSynchronize(ZeCmdQ_, UINT64_MAX);
}
Expand Down Expand Up @@ -1563,7 +1571,7 @@ void CHIPQueueLevel0::executeCommandListReg(
CHIPERR_CHECK_LOG_AND_THROW(Status, ZE_RESULT_SUCCESS, hipErrorTbd);

auto EventLz = std::static_pointer_cast<CHIPEventLevel0>(LastCmdListEvent);
EventLz->associateCmdList(this->ChipCtxLz_, CommandList);
EventLz->assignCmdList(this->ChipCtxLz_, CommandList);

updateLastEvent(LastCmdListEvent);
Backend->trackEvent(LastCmdListEvent);
Expand Down Expand Up @@ -1917,14 +1925,22 @@ void CHIPContextLevel0::freeImpl(void *Ptr) {

CHIPContextLevel0::~CHIPContextLevel0() {
logTrace("~CHIPContextLevel0() {}", (void *)this);
// print cmd lists statistics
if (!ChipEnvVars.getL0ImmCmdLists() && CmdListsRequested_ > 0)
logDebug("Command lists requested: {}, reused {}%", CmdListsRequested_,
100 * (CmdListsReused_ / CmdListsRequested_));

// print out reuse statistics
if (CmdListsRequested_ != 0)
logInfo("Command list reuse: {}%",
100 * (CmdListsReused_ / CmdListsRequested_));
else
logInfo("Command list reuse: N/A (No command lists requested)");

if (EventsRequested_ != 0)
logInfo("Events reuse: {}%", 100 * (EventsReused_ / EventsRequested_));
else
logInfo("Events reuse: N/A (No events requested)");

// delete all event pools
for (LZEventPool *Pool : EventPools_) {
for (LZEventPool *Pool : EventPools_)
delete Pool;
}
EventPools_.clear();

// delete all devicesA
Expand All @@ -1933,9 +1949,8 @@ CHIPContextLevel0::~CHIPContextLevel0() {
// The application must not call this function from
// simultaneous threads with the same context handle.
// Done via destructor should not be called from multiple threads
if (ownsZeContext) {
if (ownsZeContext)
zeContextDestroy(this->ZeCtx);
}
}

void *CHIPContextLevel0::allocateImpl(size_t Size, size_t Alignment,
Expand Down
Loading

0 comments on commit c6c22d2

Please sign in to comment.