From 6fffa50442544cbf13367a7877698ded511aad80 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 11 Aug 2020 04:44:08 -0500 Subject: [PATCH 01/38] optimized tool stats; Change-Id: I1baab986d36207b87f6f9ad5e0a45a9cffbea0c8 --- src/core/roctracer.cpp | 12 +-- src/core/trace_buffer.h | 21 ++++- src/proxy/intercept_queue.h | 4 +- test/CMakeLists.txt | 2 +- test/tool/tracer_tool.cpp | 163 ++++++++++++++++++++++++++++++------ 5 files changed, 166 insertions(+), 36 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 2d15bbba..dd47bb0f 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -218,11 +218,12 @@ template<> bool act_en_functor_t::fun(const act_en_functor_t::record_t& record) void hsa_async_copy_handler(::proxy::Tracker::entry_t* entry); void hsa_kernel_handler(::proxy::Tracker::entry_t* entry); -TraceBuffer::flush_prm_t trace_buffer_prm[] = { +constexpr TraceBuffer::flush_prm_t trace_buffer_prm[] = { {COPY_ENTRY_TYPE, hsa_async_copy_handler}, {KERNEL_ENTRY_TYPE, hsa_kernel_handler} }; -TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); +TraceBuffer* trace_buffer = NULL; +//TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); namespace hsa_support { // callbacks table @@ -567,7 +568,7 @@ hsa_status_t hsa_amd_memory_async_copy_interceptor( { hsa_status_t status = HSA_STATUS_SUCCESS; if (hsa_support::async_copy_callback_enabled) { - trace_entry_t* entry = trace_buffer.GetEntry(); + trace_entry_t* entry = trace_buffer->GetEntry(); ::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, src_agent, size, num_dep_signals, @@ -591,7 +592,7 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( { hsa_status_t status = HSA_STATUS_SUCCESS; if (hsa_support::async_copy_callback_enabled) { - trace_entry_t* entry = trace_buffer.GetEntry(); + trace_entry_t* entry = trace_buffer->GetEntry(); 
::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range, copy_agent, @@ -1289,13 +1290,14 @@ PUBLIC_API void roctracer_unload() { PUBLIC_API void roctracer_flush_buf() { ONLOAD_TRACE_BEG(); - roctracer::trace_buffer.Flush(); + roctracer::trace_buffer->Flush(); ONLOAD_TRACE_END(); } CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); roctracer::util::Logger::Create(); + roctracer::trace_buffer = new roctracer::TraceBuffer("HSA GPU", 0x200000, roctracer::trace_buffer_prm, 2); roctracer_load(); ONLOAD_TRACE_END(); } diff --git a/src/core/trace_buffer.h b/src/core/trace_buffer.h index cd62dda7..8d994046 100644 --- a/src/core/trace_buffer.h +++ b/src/core/trace_buffer.h @@ -124,7 +124,7 @@ class TraceBuffer : protected TraceBufferBase { callback_t fun; }; - TraceBuffer(const char* name, uint32_t size, flush_prm_t* flush_prm_arr, uint32_t flush_prm_count) : + TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr, uint32_t flush_prm_count, uint32_t prior = 0) : is_flushed_(false), work_thread_started_(false) { @@ -139,12 +139,14 @@ class TraceBuffer : protected TraceBufferBase { flush_prm_arr_ = flush_prm_arr; flush_prm_count_ = flush_prm_count; + priority_ = prior; + TraceBufferBase::Push(this); } ~TraceBuffer() { StopWorkerThread(); - Flush(); + FlushAll(); } void StartWorkerThread() { @@ -176,14 +178,24 @@ class TraceBuffer : protected TraceBufferBase { } void Flush() { flush_buf(); } + void Flush(const bool& b) { + DisableFlushing(!b); + flush_buf(); + } + void DisableFlushing(const bool& b) { is_flushed_.exchange(b, std::memory_order_acquire); } private: void flush_buf() { std::lock_guard lck(mutex_); const bool is_flushed = is_flushed_.exchange(true, std::memory_order_acquire); + if (priority_ != 0) { + priority_ -= 1; + return; + } + if (is_flushed == false) { - for (flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + 
flush_prm_count_; prm++) { + for (const flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + flush_prm_count_; prm++) { // Flushed entries type uint32_t type = prm->type; // Flushing function @@ -253,8 +265,9 @@ class TraceBuffer : protected TraceBufferBase { volatile std::atomic end_pointer_; std::list buf_list_; - flush_prm_t* flush_prm_arr_; + const flush_prm_t* flush_prm_arr_; uint32_t flush_prm_count_; + uint32_t priority_; volatile std::atomic is_flushed_; pthread_t work_thread_; diff --git a/src/proxy/intercept_queue.h b/src/proxy/intercept_queue.h index f92f1ce6..000c7e88 100644 --- a/src/proxy/intercept_queue.h +++ b/src/proxy/intercept_queue.h @@ -39,7 +39,7 @@ THE SOFTWARE. #include "util/hsa_rsrc_factory.h" #include "util/exception.h" -namespace roctracer { extern TraceBuffer trace_buffer; } +namespace roctracer { extern TraceBuffer* trace_buffer; } namespace rocprofiler { extern decltype(hsa_queue_create)* hsa_queue_create_fn; @@ -160,7 +160,7 @@ class InterceptQueue { const char* kernel_name = GetKernelName(kernel_symbol); // Adding kernel timing tracker - ::proxy::Tracker::entry_t* entry = roctracer::trace_buffer.GetEntry(); + ::proxy::Tracker::entry_t* entry = roctracer::trace_buffer->GetEntry(); entry->kernel.tid = syscall(__NR_gettid); entry->kernel.name = kernel_name; ::proxy::Tracker::Enable(roctracer::KERNEL_ENTRY_TYPE, obj->agent_info_->dev_id, completion_signal, entry); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3e3b9654..6a6d7d17 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,7 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "5b47aae" ) +set ( HSA_REV "a657002" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index df2530a5..db87d04d 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -42,6 +42,7 @@ THE SOFTWARE. 
#include "src/core/loader.h" #include "src/core/trace_buffer.h" +#include "util/evt_stats.h" #include "util/hsa_rsrc_factory.h" #include "util/xml.h" @@ -96,6 +97,12 @@ std::vector kfd_api_vec; LOADER_INSTANTIATE(); TRACE_BUFFER_INSTANTIATE(); +typedef EvtStatsT EvtStatsA; +// HIP stats +EvtStats* hip_api_stats = NULL; +EvtStatsA* hip_kernel_stats = NULL; +EvtStatsA* hip_memcpy_stats = NULL; + // Global output file handle FILE* begin_ts_file_handle = NULL; FILE* roctx_file_handle = NULL; @@ -137,7 +144,7 @@ static inline const char* cxx_demangle(const char* symbol) { size_t funcnamesize; int status; const char* ret = (symbol != NULL) ? abi::__cxa_demangle(symbol, NULL, &funcnamesize, &status) : symbol; - return (ret != NULL) ? ret : symbol; + return (ret != NULL) ? ret : strdup(symbol); } // Tracing control thread @@ -208,8 +215,9 @@ struct roctx_trace_entry_t { }; void roctx_flush_cb(roctx_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; -roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; +roctracer::TraceBuffer* roctx_trace_buffer = NULL; +//roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); // rocTX callback function static inline void roctx_callback_fun( @@ -224,7 +232,7 @@ static inline void roctx_callback_fun( #else const timestamp_t time = timer->timestamp_fn_ns(); #endif - roctx_trace_entry_t* entry = roctx_trace_buffer.GetEntry(); + roctx_trace_entry_t* entry = roctx_trace_buffer->GetEntry(); entry->valid = roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = cid; @@ -286,8 +294,9 @@ struct hsa_api_trace_entry_t { }; void hsa_api_flush_cb(hsa_api_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; -roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); 
+constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; +roctracer::TraceBuffer* hsa_api_trace_buffer = NULL; +//roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); // HSA API callback function void hsa_api_callback( @@ -302,7 +311,7 @@ void hsa_api_callback( hsa_begin_timestamp = timer->timestamp_fn_ns(); } else { const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer->timestamp_fn_ns(); - hsa_api_trace_entry_t* entry = hsa_api_trace_buffer.GetEntry(); + hsa_api_trace_entry_t* entry = hsa_api_trace_buffer->GetEntry(); entry->valid = roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = cid; @@ -348,8 +357,9 @@ struct hip_api_trace_entry_t { }; void hip_api_flush_cb(hip_api_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t hip_flush_prm[1] = {{0, hip_api_flush_cb}}; -roctracer::TraceBuffer hip_api_trace_buffer("HIP", 0x200000, hip_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm[1] = {{0, hip_api_flush_cb}}; +roctracer::TraceBuffer* hip_api_trace_buffer = NULL; +//roctracer::TraceBuffer hip_api_trace_buffer("HIP API", 0x200000, hip_api_flush_prm, 1); static inline bool is_hip_kernel_launch_api(const uint32_t& cid) { bool ret = @@ -379,7 +389,7 @@ void hip_api_callback( hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); const timestamp_t end_timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer.GetEntry(); + hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); entry->valid = roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = cid; @@ -440,7 +450,7 @@ void mark_api_callback( const char* name = reinterpret_cast(callback_data); const timestamp_t timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer.GetEntry(); + hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); entry->valid = 
roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = 0; @@ -454,7 +464,14 @@ void mark_api_callback( entry->ptr = NULL; } +typedef std::map hip_kernel_map_t; +hip_kernel_map_t* hip_kernel_map = NULL; +std::mutex hip_kernel_mutex; + void hip_api_flush_cb(hip_api_trace_entry_t* entry) { + static uint64_t correlation_id = 0; + correlation_id += 1; + const uint32_t domain = entry->domain; const uint32_t cid = entry->cid; const hip_api_data_t* data = &(entry->data); @@ -469,12 +486,22 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING - const char* str = hipApiString((hip_api_id_t)cid, data); - rec_ss << " " << str; - if (is_hip_kernel_launch_api(cid)) { - if (entry->name) rec_ss << " kernel=" << cxx_demangle(entry->name); + if (hip_api_stats != NULL) { + hip_api_stats->add_event(cid, end_timestamp - begin_timestamp); + if (is_hip_kernel_launch_api(cid)) { + hip_kernel_mutex.lock(); + (*hip_kernel_map)[correlation_id] = entry->name; + hip_kernel_mutex.unlock(); + } + } else { + const char* str = hipApiString((hip_api_id_t)cid, data); + rec_ss << " " << str; + if (is_hip_kernel_launch_api(cid) && entry->name) { + const char* kernel_name = cxx_demangle(entry->name); + rec_ss << " kernel=" << kernel_name; + } + fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); } - fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); #else // !HIP_PROF_HIP_API_STRING switch (cid) { case HIP_API_ID_hipMemcpy: @@ -536,6 +563,47 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { fflush(hip_api_file_handle); } +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// HSA API tracing + +struct hip_act_trace_entry_t { + uint32_t valid; + uint32_t type; + uint32_t kind; + timestamp_t dur; + uint64_t correlation_id; +}; + +void hip_act_flush_cb(hip_act_trace_entry_t* entry); +constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm[1] = 
{{0, hip_act_flush_cb}}; +roctracer::TraceBuffer* hip_act_trace_buffer = NULL; +//roctracer::TraceBuffer hip_act_trace_buffer("HIP ACT", 0x200000, hip_act_flush_prm, 1); + +// HIP ACT trace buffer flush callback +void hip_act_flush_cb(hip_act_trace_entry_t* entry) { + const uint32_t domain = ACTIVITY_DOMAIN_HCC_OPS; + const uint32_t op = 0; + const char * name = roctracer_op_string(domain, op, entry->kind); + if (name == NULL) { + printf("hip_act_flush_cb name is NULL\n"); fflush(stdout); + abort(); + } + + if (strncmp("Kernel", name, 6) == 0) { + hip_kernel_mutex.lock(); + if (hip_kernel_stats == NULL) { + printf("hip_act_flush_cb hip_kernel_stats is NULL\n"); fflush(stdout); + abort(); + } + name = (*hip_kernel_map)[entry->correlation_id]; + hip_kernel_mutex.unlock(); + const char* kernel_name = cxx_demangle(name); + hip_kernel_stats->add_event(kernel_name, entry->dur); + } else { + hip_memcpy_stats->add_event(name, entry->dur); + } +} + // Activity tracing callback // hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) void pool_activity_callback(const char* begin, const char* end, void* arg) { @@ -546,11 +614,20 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: - fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", - record->begin_ns, record->end_ns, - record->device_id, record->queue_id, - name, record->correlation_id, my_pid); - fflush(hcc_activity_file_handle); + if (hip_memcpy_stats != NULL) { + hip_act_trace_entry_t* entry = hip_act_trace_buffer->GetEntry(); + entry->valid = roctracer::TRACE_ENTRY_COMPL; + entry->type = 0; + entry->kind = record->kind; + entry->dur = record->end_ns - record->begin_ns; + entry->correlation_id = record->correlation_id; + } else { + fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", + 
record->begin_ns, record->end_ns, + record->device_id, record->queue_id, + name, record->correlation_id, my_pid); + fflush(hcc_activity_file_handle); + } break; case ACTIVITY_DOMAIN_HSA_OPS: if (record->op == HSA_OP_ID_RESERVED1) { @@ -639,8 +716,10 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const } // Open output file -FILE* open_output_file(const char* prefix, const char* name) { +FILE* open_output_file(const char* prefix, const char* name, const char** path = NULL) { FILE* file_handle = NULL; + if (path != NULL) *path = NULL; + if (prefix != NULL) { std::ostringstream oss; oss << prefix << "/" << GetPid() << "_" << name; @@ -651,6 +730,8 @@ FILE* open_output_file(const char* prefix, const char* name) { perror(errmsg.str().c_str()); abort(); } + + if (path != NULL) *path = strdup(oss.str().c_str()); } else file_handle = stdout; return file_handle; } @@ -720,6 +801,7 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); + hip_act_trace_buffer->Flush(true); close_file_handles(); ONLOAD_TRACE_END(); @@ -979,15 +1061,40 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, (void*)mark_api_callback); // Allocating tracing pool open_tracing_pool(); + + // Check for optimized stats + const bool is_stats_opt = (getenv("ROCP_STATS_OPT") != NULL); + + // HIP kernel ma pinstantiation + if (is_stats_opt) hip_kernel_map = new hip_kernel_map_t; + // Enable tracing if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); - ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + + if (is_stats_opt) { + const char* path = NULL; + FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); + hip_api_stats = new EvtStats(f, path); + for (uint32_t id 
= 0; id < HIP_API_ID_NUMBER; id += 1) { + const char* label = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, id, 0); + hip_api_stats->set_label(id, label); + } + } } if (trace_hip_activity) { hcc_activity_file_handle = open_output_file(output_prefix, "hcc_ops_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); + + if (is_stats_opt) { + FILE* f = NULL; + const char* path = NULL; + f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); + hip_kernel_stats = new EvtStatsA(f, path); + f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); + hip_memcpy_stats = new EvtStatsA(f, path); + } } } @@ -1010,6 +1117,10 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); + roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, roctx_flush_prm, 1); + hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, hip_api_flush_prm, 1); + hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, hip_act_flush_prm, 1, 1); + hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, hsa_flush_prm, 1); roctracer_load(); tool_load(); ONLOAD_TRACE_END(); @@ -1018,7 +1129,11 @@ extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); roctracer_flush_buf(); tool_unload(); + + if (hip_api_stats) hip_api_stats->dump(); + if (hip_kernel_stats) hip_kernel_stats->dump(); + if (hip_memcpy_stats) hip_memcpy_stats->dump(); + roctracer_unload(); ONLOAD_TRACE_END(); } - From db1ccb0619d20489ed7455539ae3c04e5608950e Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sat, 15 Aug 2020 02:23:43 -0500 Subject: [PATCH 02/38] flush-rate option fixed; Change-Id: I50473f8008672772dd4aaf37cbc64472cb50b4a3 --- src/core/roctracer.cpp | 3 +- src/core/trace_buffer.h | 142 ++++++++++++++++++++++---------------- src/proxy/tracker.h | 13 ++-- test/run.sh | 5 +- test/tool/tracer_tool.cpp | 55 +++++++-------- 5 files changed, 119 insertions(+), 99 
deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index dd47bb0f..52f1e28b 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -223,7 +223,6 @@ constexpr TraceBuffer::flush_prm_t trace_buffer_prm[] = { {KERNEL_ENTRY_TYPE, hsa_kernel_handler} }; TraceBuffer* trace_buffer = NULL; -//TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); namespace hsa_support { // callbacks table @@ -1127,7 +1126,7 @@ PUBLIC_API roctracer_status_t roctracer_flush_activity_expl(roctracer_pool_t* po API_METHOD_PREFIX if (pool == NULL) pool = roctracer_default_pool(); roctracer::MemoryPool* memory_pool = reinterpret_cast(pool); - memory_pool->Flush(); + if (memory_pool != NULL) memory_pool->Flush(); roctracer::TraceBufferBase::FlushAll(); API_METHOD_SUFFIX } diff --git a/src/core/trace_buffer.h b/src/core/trace_buffer.h index 8d994046..cb6767f2 100644 --- a/src/core/trace_buffer.h +++ b/src/core/trace_buffer.h @@ -36,15 +36,17 @@ enum { TRACE_ENTRY_COMPL = 2 }; -enum { - API_ENTRY_TYPE, - COPY_ENTRY_TYPE, - KERNEL_ENTRY_TYPE +enum entry_type_t { + DFLT_ENTRY_TYPE = 0, + API_ENTRY_TYPE = 1, + COPY_ENTRY_TYPE = 2, + KERNEL_ENTRY_TYPE = 3, + NUM_ENTRY_TYPE = 4 }; struct trace_entry_t { std::atomic valid; - uint32_t type; + entry_type_t type; uint64_t dispatch; uint64_t begin; // kernel begin timestamp, ns uint64_t end; // kernel end timestamp, ns @@ -67,14 +69,26 @@ struct trace_entry_t { template struct push_element_fun { T* const elem_; - void fun(T* node) { if (node->next_elem_ == NULL) node->next_elem_ = elem_; } - push_element_fun(T* elem) : elem_(elem) {} + T** prev_; + bool fun(T* node) { + if (node->priority_ > elem_->priority_) { + *prev_ = elem_; + elem_->next_elem_ = node; + } else if (node->next_elem_ == NULL) { + node->next_elem_ = elem_; + } else { + prev_ = &(node->next_elem_); + return false; + } + return true; + } + push_element_fun(T* elem, T** prev) : elem_(elem), prev_(prev) {} }; template struct call_element_fun { 
void (T::*fptr_)(); - void fun(T* node) { (node->*fptr_)(); } + bool fun(T* node) const { (node->*fptr_)(); return false; } call_element_fun(void (T::*f)()) : fptr_(f) {} }; @@ -89,10 +103,10 @@ struct TraceBufferBase { static void Push(TraceBufferBase* elem) { if (head_elem_ == NULL) head_elem_ = elem; - else foreach(push_element_fun(elem)); + else foreach(push_element_fun(elem, &head_elem_)); } - TraceBufferBase() : next_elem_(NULL) {} + TraceBufferBase(const uint32_t& prior) : priority_(prior), next_elem_(NULL) {} template static void foreach(const F& f_in) { @@ -101,11 +115,12 @@ struct TraceBufferBase { TraceBufferBase* p = head_elem_; while (p != NULL) { TraceBufferBase* next = p->next_elem_; - f.fun(p); + if (f.fun(p) == true) break; p = next; } } + const uint32_t priority_; TraceBufferBase* next_elem_; static TraceBufferBase* head_elem_; static mutex_t mutex_; @@ -118,35 +133,41 @@ class TraceBuffer : protected TraceBufferBase { typedef TraceBuffer Obj; typedef uint64_t pointer_t; typedef std::recursive_mutex mutex_t; + typedef typename std::list buf_list_t; + typedef typename buf_list_t::iterator buf_list_it_t; struct flush_prm_t { - uint32_t type; + entry_type_t type; callback_t fun; }; TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr, uint32_t flush_prm_count, uint32_t prior = 0) : - is_flushed_(false), + TraceBufferBase(prior), + size_(size), work_thread_started_(false) { name_ = strdup(name); - size_ = size; data_ = allocate_fun(); next_ = allocate_fun(); read_pointer_ = 0; + write_pointer_ = 0; end_pointer_ = size; buf_list_.push_back(data_); - flush_prm_arr_ = flush_prm_arr; - flush_prm_count_ = flush_prm_count; - - priority_ = prior; + memset(f_array_, 0, sizeof(f_array_)); + for (const flush_prm_t* prm = flush_prm_arr; prm < flush_prm_arr + flush_prm_count; prm++) { + const entry_type_t type = prm->type; + if (type >= NUM_ENTRY_TYPE) FATAL("out of f_array bounds (" << type << ")"); + if (f_array_[type] != NULL) 
FATAL("handler function ptr redefinition (" << type << ")"); + f_array_[type] = prm->fun; + } TraceBufferBase::Push(this); } ~TraceBuffer() { StopWorkerThread(); - FlushAll(); + Flush(); } void StartWorkerThread() { @@ -171,52 +192,52 @@ class TraceBuffer : protected TraceBufferBase { } Entry* GetEntry() { - const pointer_t pointer = read_pointer_.fetch_add(1); + const pointer_t pointer = write_pointer_.fetch_add(1); if (pointer >= end_pointer_) wrap_buffer(pointer); if (pointer >= end_pointer_) FATAL("pointer >= end_pointer_ after buffer wrap"); - return data_ + (pointer + size_ - end_pointer_); + Entry* entry = data_ + (size_ + pointer - end_pointer_); + entry->valid = TRACE_ENTRY_INV; + entry->type = DFLT_ENTRY_TYPE; + return entry; } void Flush() { flush_buf(); } - void Flush(const bool& b) { - DisableFlushing(!b); - flush_buf(); - } - void DisableFlushing(const bool& b) { is_flushed_.exchange(b, std::memory_order_acquire); } private: void flush_buf() { std::lock_guard lck(mutex_); - const bool is_flushed = is_flushed_.exchange(true, std::memory_order_acquire); - if (priority_ != 0) { - priority_ -= 1; - return; - } + pointer_t pointer = read_pointer_; + pointer_t curr_pointer = write_pointer_.load(std::memory_order_relaxed); + buf_list_it_t it = buf_list_.begin(); + buf_list_it_t end_it = buf_list_.end(); + while(it != end_it) { + Entry* buf = *it; + Entry* ptr = buf + (pointer % size_); + Entry* end_ptr = buf + size_; + while ((ptr < end_ptr) && (pointer < curr_pointer)) { + if (ptr->valid != TRACE_ENTRY_COMPL) break; + + entry_type_t type = ptr->type; + if (type >= NUM_ENTRY_TYPE) FATAL("out of f_array bounds (" << type << ")"); + callback_t f_ptr = f_array_[type]; + if (f_ptr == NULL) FATAL("f_ptr == NULL"); + (*f_ptr)(ptr); + + ptr++; + pointer++; + } - if (is_flushed == false) { - for (const flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + flush_prm_count_; prm++) { - // Flushed entries type - uint32_t type = prm->type; - // Flushing function - 
callback_t fun = prm->fun; - if (fun == NULL) FATAL("flush function is not set"); - - pointer_t pointer = 0; - for (Entry* ptr : buf_list_) { - Entry* end = ptr + size_; - while ((ptr < end) && (pointer < read_pointer_)) { - if (ptr->type == type) { - if (ptr->valid == TRACE_ENTRY_COMPL) { - fun(ptr); - } - } - ptr++; - pointer++; - } - } + buf_list_it_t prev = it; + it++; + if (ptr == end_ptr) { + free_fun(*prev); + buf_list_.erase(prev); } + if (pointer == curr_pointer) break; } + + read_pointer_ = pointer; } inline Entry* allocate_fun() { @@ -226,6 +247,10 @@ class TraceBuffer : protected TraceBufferBase { return ptr; } + inline void free_fun(void* ptr) { + free(ptr); + } + static void* allocate_worker(void* arg) { Obj* obj = (Obj*)arg; @@ -258,17 +283,14 @@ class TraceBuffer : protected TraceBufferBase { } const char* name_; - uint32_t size_; + const uint32_t size_; Entry* data_; Entry* next_; - volatile std::atomic read_pointer_; + pointer_t read_pointer_; + volatile std::atomic write_pointer_; volatile std::atomic end_pointer_; - std::list buf_list_; - - const flush_prm_t* flush_prm_arr_; - uint32_t flush_prm_count_; - uint32_t priority_; - volatile std::atomic is_flushed_; + buf_list_t buf_list_; + callback_t f_array_[NUM_ENTRY_TYPE]; pthread_t work_thread_; pthread_mutex_t work_mutex_; diff --git a/src/proxy/tracker.h b/src/proxy/tracker.h index edb223b0..dc0322bd 100644 --- a/src/proxy/tracker.h +++ b/src/proxy/tracker.h @@ -40,9 +40,10 @@ class Tracker { public: typedef util::HsaRsrcFactory::timestamp_t timestamp_t; typedef roctracer::trace_entry_t entry_t; + typedef roctracer::entry_type_t entry_type_t; // Add tracker entry - inline static void Enable(uint32_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) { + inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; util::HsaRsrcFactory* hsa_rsrc = 
&(util::HsaRsrcFactory::Instance()); @@ -88,13 +89,16 @@ class Tracker { } entry->complete = hsa_rsrc->TimestampNs(); + hsa_signal_t orig = entry->orig; + hsa_signal_t signal = entry->signal; + + // Releasing completed entry entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); // Original intercepted signal completion - hsa_signal_t orig = entry->orig; if (orig.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); - amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(signal.handle); orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; @@ -102,7 +106,7 @@ class Tracker { if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); hsa_signal_store_screlease(orig, signal_value); } - hsa_signal_destroy(entry->signal); + hsa_signal_destroy(signal); } // Handler for packet completion @@ -113,7 +117,6 @@ class Tracker { // Complete entry Tracker::Complete(signal_value, entry); - return false; } }; diff --git a/test/run.sh b/test/run.sh index 962033f6..c5c8aa45 100755 --- a/test/run.sh +++ b/test/run.sh @@ -79,7 +79,9 @@ eval_test() { test_runnum=$((test_runnum + 1)) eval "$cmdline" >$test_trace 2>&1 is_failed=$? - cat $test_trace + if [ $is_failed != 0 ] ; then + cat $test_trace + fi if [ $IS_CI = 1 ] ; then is_failed=0; else @@ -87,6 +89,7 @@ eval_test() { python ./test/check_trace.py -in $test_name -ck $check_trace_flag is_failed=$? 
if [ $is_failed != 0 ] ; then + echo "Trace checker error:" python ./test/check_trace.py -v -in $test_name -ck $check_trace_flag fi fi diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index db87d04d..a075bc29 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -176,6 +176,8 @@ void* control_thr_fun(void*) { usleep(dist_us); } } + + return NULL; } // Flushing control thread @@ -204,8 +206,8 @@ void* flush_thr_fun(void*) { // rocTX annotation tracing struct roctx_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t cid; timestamp_t time; uint32_t pid; @@ -215,9 +217,8 @@ struct roctx_trace_entry_t { }; void roctx_flush_cb(roctx_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm = {roctracer::DFLT_ENTRY_TYPE, roctx_flush_cb}; roctracer::TraceBuffer* roctx_trace_buffer = NULL; -//roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); // rocTX callback function static inline void roctx_callback_fun( @@ -233,14 +234,13 @@ static inline void roctx_callback_fun( const timestamp_t time = timer->timestamp_fn_ns(); #endif roctx_trace_entry_t* entry = roctx_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = cid; entry->time = time; entry->pid = GetPid(); entry->tid = tid; entry->rid = rid; entry->message = (message != NULL) ? 
strdup(message) : NULL; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } void roctx_api_callback( @@ -283,8 +283,8 @@ void roctx_flush_cb(roctx_trace_entry_t* entry) { // HSA API tracing struct hsa_api_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t cid; timestamp_t begin; timestamp_t end; @@ -294,9 +294,8 @@ struct hsa_api_trace_entry_t { }; void hsa_api_flush_cb(hsa_api_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hsa_api_flush_cb}; roctracer::TraceBuffer* hsa_api_trace_buffer = NULL; -//roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); // HSA API callback function void hsa_api_callback( @@ -312,14 +311,13 @@ void hsa_api_callback( } else { const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? 
hsa_begin_timestamp : timer->timestamp_fn_ns(); hsa_api_trace_entry_t* entry = hsa_api_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = cid; entry->begin = hsa_begin_timestamp; entry->end = end_timestamp; entry->pid = GetPid(); entry->tid = GetTid(); entry->data = *data; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } } @@ -343,8 +341,8 @@ void hsa_activity_callback( // HIP API tracing struct hip_api_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t domain; uint32_t cid; timestamp_t begin; @@ -357,9 +355,8 @@ struct hip_api_trace_entry_t { }; void hip_api_flush_cb(hip_api_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm[1] = {{0, hip_api_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_api_flush_cb}; roctracer::TraceBuffer* hip_api_trace_buffer = NULL; -//roctracer::TraceBuffer hip_api_trace_buffer("HIP API", 0x200000, hip_api_flush_prm, 1); static inline bool is_hip_kernel_launch_api(const uint32_t& cid) { bool ret = @@ -390,8 +387,6 @@ void hip_api_callback( const timestamp_t end_timestamp = timer->timestamp_fn_ns(); hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = cid; entry->domain = domain; entry->begin = hip_begin_timestamp; @@ -437,6 +432,8 @@ void hip_api_callback( } } } + + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } } @@ -451,8 +448,6 @@ void mark_api_callback( const timestamp_t timestamp = timer->timestamp_fn_ns(); hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = 0; entry->domain = domain; entry->begin = timestamp; @@ -462,6 +457,7 @@ void mark_api_callback( entry->data = {}; 
entry->name = strdup(name); entry->ptr = NULL; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } typedef std::map hip_kernel_map_t; @@ -567,17 +563,16 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { // HSA API tracing struct hip_act_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t kind; timestamp_t dur; uint64_t correlation_id; }; void hip_act_flush_cb(hip_act_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm[1] = {{0, hip_act_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_act_flush_cb}; roctracer::TraceBuffer* hip_act_trace_buffer = NULL; -//roctracer::TraceBuffer hip_act_trace_buffer("HIP ACT", 0x200000, hip_act_flush_prm, 1); // HIP ACT trace buffer flush callback void hip_act_flush_cb(hip_act_trace_entry_t* entry) { @@ -616,11 +611,10 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { case ACTIVITY_DOMAIN_HCC_OPS: if (hip_memcpy_stats != NULL) { hip_act_trace_entry_t* entry = hip_act_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->kind = record->kind; entry->dur = record->end_ns - record->begin_ns; entry->correlation_id = record->correlation_id; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } else { fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", record->begin_ns, record->end_ns, @@ -801,7 +795,6 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); - hip_act_trace_buffer->Flush(true); close_file_handles(); ONLOAD_TRACE_END(); @@ -1117,10 +1110,10 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); - roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, roctx_flush_prm, 1); - hip_api_trace_buffer = new 
roctracer::TraceBuffer("HIP API", 0x200000, hip_api_flush_prm, 1); - hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, hip_act_flush_prm, 1, 1); - hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, hsa_flush_prm, 1); + roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, &roctx_flush_prm, 1); + hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, &hip_api_flush_prm, 1); + hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1); + hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, &hsa_flush_prm, 1); roctracer_load(); tool_load(); ONLOAD_TRACE_END(); From dae98a346cebf58a935d0bfa79c3c51f6db7f58b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 28 Aug 2020 06:31:24 -0500 Subject: [PATCH 03/38] Format ostream ops; Change-Id: I33d01cde41d9a762a8a955a1faccfdef02d8c0ac --- inc/roctracer_hip.h | 14 ++++++++ script/gen_ostream_ops.py | 72 +++++++++++++++++++++++++++++++-------- test/tool/tracer_tool.cpp | 1 + 3 files changed, 72 insertions(+), 15 deletions(-) diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 86ffc1ae..091f3279 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -23,6 +23,20 @@ THE SOFTWARE. 
#ifndef INC_ROCTRACER_HIP_H_ #define INC_ROCTRACER_HIP_H_ +#ifdef __cplusplus +#include + +inline static std::ostream& operator<<(std::ostream& out, const unsigned char& v) { + out << (unsigned int)v; + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const char& v) { + out << (unsigned char)v; + return out; +} +#endif // __cplusplus + #include #include #include diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 142ec98e..900c3677 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -3,6 +3,7 @@ import os, sys, re import CppHeaderParser import argparse +import string LICENSE = \ '/*\n' + \ @@ -33,7 +34,7 @@ ' inline static std::ostream& put(std::ostream& out, const T& v) { return out; }\n' + \ '};\n\n' -header_hip = \ +header_basic = \ 'template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \ ' using std::operator<<;\n' + \ @@ -43,6 +44,7 @@ structs_analyzed = {} global_ops_hip = '' +global_str = '' # process_struct traverses recursively all structs to extract all fields def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname): @@ -51,6 +53,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a # cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...) # parent_hier_name: parent hierarchical name used for nested structs/enums # apiname: for example hip, kfd. 
+ global global_str if cppHeader_struct == 'max_align_t': #function pointers not working in cppheaderparser return @@ -59,7 +62,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a if cppHeader_struct in structs_analyzed: return - structs_analyzed[cppHeader_struct] = 1; + structs_analyzed[cppHeader_struct] = 1 for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))): key = 'name' name = "" @@ -85,16 +88,21 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: prop = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key4] + str = '' if "union" not in mtype: if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - str = " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + " = \");\n" + str += " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" else: + str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \"" + name + " = \");\n" if array_size == "": - str = " roctracer::" + apiname.lower() + "_support::output_streamer<"+mtype+">::put(out,v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + ">::put(out, v." + name + ");\n" else: - str = " roctracer::" + apiname.lower() + "_support::output_streamer<"+mtype+"["+array_size+"]>::put(out,v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + "[" + array_size + "]>::put(out, v." 
+ name + ");\n" + str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \", \");\n" if "void" not in mtype: - file_handle.write(str) + global_str += str else: if prop != '': next_cppHeader_struct = prop + "::" @@ -105,11 +113,12 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) # Parses API header file and generates ostream ops files ostream_ops.h -def gen_cppheader(infilepath, outfilepath): +def gen_cppheader(infilepath, outfilepath, structs_depth): # infilepath: API Header file to be parsed # outfilepath: Output file where ostream operators are written global_ops_hip = '' global_ops_hsa = '' + global global_str try: cppHeader = CppHeaderParser.CppHeader(infilepath) except CppHeaderParser.CppParseError as e: @@ -140,10 +149,12 @@ def gen_cppheader(infilepath, outfilepath): f.write('\n') f.write('namespace roctracer {\n') f.write('namespace ' + apiname.lower() + '_support {\n') + if structs_depth != -1: + f.write('static int ' + apiname.upper() + '_depth_max = ' + str(structs_depth) + ';\n') f.write('// begin ostream ops for '+ apiname + ' \n') if apiname.lower() == "hip" or apiname.lower() == "hsa": f.write("// basic ostream ops\n") - f.write(header_hip) + f.write(header_basic) f.write("// End of basic ostream ops\n\n") else: f.write(header) @@ -154,28 +165,56 @@ def gen_cppheader(infilepath, outfilepath): if apiname.lower() == 'hsa': if c == 'max_align_t' or c == '__fsid_t': #already defined for hip continue - #if apiname.lower() == 'hip' and c == 'hipIpcEventHandle_t': #feature is TBD - # continue if len(cppHeader.classes[c]["properties"]["public"])!=0: if apiname.lower() == 'hip' or apiname.lower() == 'hsa': f.write("std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") f.write("{\n") + f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") + if structs_depth != -1: + f.write(" " + 
apiname.upper() + "_depth_max++;\n") + f.write(" if (" + apiname.upper() + "_depth_max <= " + str(structs_depth) + ") {\n" ) process_struct(f, c, cppHeader, "", apiname) - f.write(" return out;\n") + global_str = "\n".join(global_str.split("\n")[0:-2]) + if structs_depth != -1: #reindent + global_str = string.split(global_str, '\n') + global_str = [' ' + string.lstrip(line) for line in global_str] + global_str = string.join(global_str, '\n') + f.write(global_str+"\n") + if structs_depth != -1: + f.write(" };\n") + f.write(" " + apiname.upper() + "_depth_max--;\n") + f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") + f.write(" return out;\n") f.write("}\n") + global_str = '' else: f.write("\ntemplate<>\n") f.write("struct output_streamer<" + c + "&> {\n") f.write(" inline static std::ostream& put(std::ostream& out, "+c+"& v)\n") f.write("{\n") + f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '{');\n") + if structs_depth != -1: + f.write(apiname.upper() + "_depth_max++;\n") + f.write(" if (" + apiname.upper() + "_depth_max <= " + str(structs_depth) + ") {\n" ) process_struct(f, c, cppHeader, "", apiname) - f.write(" return out;\n") + global_str = "\n".join(global_str.split("\n")[0:-2]) + if structs_depth != -1: #reindent + global_str = string.split(global_str, '\n') + global_str = [' ' + string.lstrip(line) for line in global_str] + global_str = string.join(global_str, '\n') + f.write(global_str+"\n") + if structs_depth != -1: + f.write(" };\n") + f.write(" " + apiname.upper() + "_depth_max--;\n") + f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '}');\n") + f.write(" return out;\n") f.write("}\n") f.write("};\n") + global_str = '' if apiname.lower() == 'hip': - global_ops_hip += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + global_ops_hip += 
"inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" if apiname.lower() == 'hsa': - global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" footer = \ '// end ostream ops for '+ apiname + ' \n' @@ -196,8 +235,11 @@ def gen_cppheader(infilepath, outfilepath): requiredNamed = parser.add_argument_group('Required arguments') requiredNamed.add_argument('-in', metavar='file', help='Header file to be parsed', required=True) requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) +requiredNamed.add_argument('-depth', metavar='N', type=int, help='Depth for nested structs', required=False) +structs_depth = 0 args = vars(parser.parse_args()) if __name__ == '__main__': - gen_cppheader(args['in'], args['out']) + if args['depth'] != None: structs_depth = args['depth'] + gen_cppheader(args['in'], args['out'], structs_depth) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index a075bc29..ad866012 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1110,6 +1110,7 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); + roctracer::hip_support::HIP_depth_max = 0; roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, &roctx_flush_prm, 1); hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, &hip_api_flush_prm, 1); hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1); From b730da09041d52419bfa7a27fbed9c4b6afaa8d0 Mon Sep 17 
00:00:00 2001 From: Rachida Kebichi Date: Fri, 11 Sep 2020 14:25:21 -0400 Subject: [PATCH 04/38] Fix for trace checker Change-Id: Ib8a0df7b7bb0da2e68b5b4d99ce8025de169f317 (cherry picked from commit 29da9a744d7da29fdb691d28ada3212647bb8379) --- script/check_trace.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/script/check_trace.py b/script/check_trace.py index 29baff8a..a4506a12 100644 --- a/script/check_trace.py +++ b/script/check_trace.py @@ -120,14 +120,14 @@ def diff_strings(cnt_r, cnt, metric): for evt2 in events_order[tid]: if diff_cnt == diff_cnt_r: if evt != evt2: - print (">I< Difference starts at index: " + str(diff_cnt_r) + ", tid_r " + str(tid_r) + ", tid " + str(tid) + ", with evts " + evt + " and " + evt2 + "\n") + print (">I< Difference starts at tid rank: " + str(cnt_tid) + " event index: " + str(diff_cnt_r) + ", tid_r " + str(tid_r) + ", tid " + str(tid) + ", with evts " + evt + " and " + evt2 + "\n") found_diff_evt = 1 break diff_cnt += 1 diff_cnt_r += 1 if found_diff_evt: break if len(events_order_r[tid_r]) != len(events_order[tid]) and found_diff_evt == 0: - print (">I< Difference starts at index: " + str(min(len(events_order_r[tid_r]), len(events_order[tid]))) + ", with missing evts\n") + print (">I< Difference starts at tid rank: " + str(cnt_tid) + " event index: " + str(min(len(events_order_r[tid_r]), len(events_order[tid]))) + ", with missing evts\n") break cnt_tid += 1 cnt_tid_r += 1 @@ -292,6 +292,10 @@ def gen_events_info(tracefile, trace_level, no_events_cnt, events2ignore, events if metric == 'or': for tid in sorted (events_order.keys()) : res = res + str(events_order[tid]) + if metric == 'cnt': + newres = res.split('\n') + newres = sorted(newres) + res = str(newres) return res parser = argparse.ArgumentParser(description='check_trace.py: check a trace aainst golden ref. 
Returns 0 for success, 1 for failure') From 4c2b6cbba493f34e33623c13b0418ec174c8e5d6 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sat, 19 Sep 2020 16:49:46 -0500 Subject: [PATCH 05/38] codeobj tracing prof protocol Change-Id: Ib49c8ee034fb7481b21f950490e10b350f2a1b79 (cherry picked from commit 6567c48e98b0cf3a10ff46bf411057642c307990) --- inc/ext/prof_protocol.h | 1 + 1 file changed, 1 insertion(+) diff --git a/inc/ext/prof_protocol.h b/inc/ext/prof_protocol.h index c29ff0e6..1c00e972 100644 --- a/inc/ext/prof_protocol.h +++ b/inc/ext/prof_protocol.h @@ -36,6 +36,7 @@ typedef enum { ACTIVITY_DOMAIN_KFD_API = 4, // KFD API domain ACTIVITY_DOMAIN_EXT_API = 5, // External ID domain ACTIVITY_DOMAIN_ROCTX = 6, // ROCTX domain + ACTIVITY_DOMAIN_HSA_EVT = 7, // HSA events ACTIVITY_DOMAIN_NUMBER } activity_domain_t; From 35bc1e93f84b9ed74de1f1fd39a2665d85d2cebb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 9 Sep 2020 01:44:51 -0500 Subject: [PATCH 06/38] gen_ostream_ops.py fix - ostream operators as inline static Change-Id: I9688236b06dd167960662b8eecf1a07c93b43fff (cherry picked from commit c9ed0f067d779e89f94bb2d0a2d25618a69f4623) --- script/gen_ostream_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 900c3677..73585ce8 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -167,7 +167,7 @@ def gen_cppheader(infilepath, outfilepath, structs_depth): continue if len(cppHeader.classes[c]["properties"]["public"])!=0: if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - f.write("std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") + f.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") f.write("{\n") f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") if structs_depth != -1: From 5bf3efa8aa72817200410483bdfd554ac5f46f58 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 7 Sep 2020 13:50:10 
-0500 Subject: [PATCH 07/38] build normalizing - generating under build directory Change-Id: Id9203aec7800024bd749059a415fb29b8051005a --- CMakeLists.txt | 11 +++++- build.sh | 2 +- script/hsaap.py | 6 +-- script/kfdap.py | 60 +++++++++++++++--------------- src/CMakeLists.txt | 41 ++++++++++---------- src/kfd/.gitignore | 1 - test/CMakeLists.txt | 3 +- test/MatrixTranspose/Makefile | 2 +- test/MatrixTranspose_test/Makefile | 2 +- test/run.sh | 2 +- 10 files changed, 69 insertions(+), 61 deletions(-) delete mode 100644 src/kfd/.gitignore diff --git a/CMakeLists.txt b/CMakeLists.txt index f1ad1982..97c06cf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,12 +100,14 @@ set ( PUBLIC_HEADERS roctracer_kfd.h roctracer_roctx.h roctracer_cb_table.h + ext/prof_protocol.h + ext/hsa_rt_utils.hpp +) +set ( GEN_HEADERS hip_ostream_ops.h hsa_prof_str.h kfd_ostream_ops.h kfd_prof_str.h - ext/prof_protocol.h - ext/hsa_rt_utils.hpp ) if ( ${LIBRARY_TYPE} STREQUAL SHARED ) @@ -137,6 +139,11 @@ foreach ( header ${PUBLIC_HEADERS} ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/${header} DESTINATION ${DEST_NAME}/include/${header_subdir} ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/${header} DESTINATION include/${DEST_NAME}/${header_subdir} ) endforeach () +foreach ( header ${GEN_HEADERS} ) + get_filename_component ( header_subdir ${header} DIRECTORY ) + install ( FILES ${PROJECT_BINARY_DIR}/inc/${header} DESTINATION ${DEST_NAME}/include/${header_subdir} ) + install ( FILES ${PROJECT_BINARY_DIR}/inc/${header} DESTINATION include/${DEST_NAME}/${header_subdir} ) +endforeach () #install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION include RENAME ${DEST_NAME} ) install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION lib RENAME ${ROCTRACER_LIBRARY}.so ) install ( FILES ${PROJECT_BINARY_DIR}/so-major-link DESTINATION lib RENAME ${ROCTRACER_LIBRARY}.so.${LIB_VERSION_MAJOR} ) diff --git a/build.sh b/build.sh index a8515b69..cae5a59a 100755 --- a/build.sh +++ b/build.sh 
@@ -1,4 +1,4 @@ -#!/bin/bash -x +#!/bin/bash -e SRC_DIR=`dirname $0` COMPONENT="roctracer" ROCM_PATH="${ROCM_PATH:=/opt/rocm}" diff --git a/script/hsaap.py b/script/hsaap.py index a50b3d4d..84ee9bbf 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -490,15 +490,15 @@ def gen_out_stream(self, n, name, call, struct): # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file=sys.stderr) + print ("Usage:", sys.argv[0], " ", file=sys.stderr) sys.exit(1) else: - ROOT = sys.argv[1] + '/' + PREFIX = sys.argv[1] + '/' HSA_DIR = sys.argv[2] + '/' descr = API_DescrParser(OUT, HSA_DIR, API_TABLES_H, API_HEADERS_H, LICENSE) -out_file = ROOT + OUT +out_file = PREFIX + OUT print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content[:-1]) diff --git a/script/kfdap.py b/script/kfdap.py index e920bbf6..9f560a35 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -2,10 +2,10 @@ from __future__ import print_function import os, sys, re -OUT_H = 'inc/kfd_prof_str.h' -OUT_CPP = 'src/kfd/kfd_wrapper.cpp' -API_HEADERS_H = ( - ('HSAKMTAPI', 'hsakmt.h'), +OUT_H = 'inc/kfd_prof_str.h' +OUT_CPP = 'src/kfd_wrapper.cpp' +API_HEADERS_H = ( + ('HSAKMTAPI', 'hsakmt.h'), ) LICENSE = \ @@ -38,7 +38,7 @@ def fatal(module, msg): sys.exit(1) # Get next text block -def NextBlock(pos, record): +def NextBlock(pos, record): if len(record) == 0: return pos space_pattern = re.compile(r'(\s+)') @@ -82,8 +82,8 @@ def __init__(self, header, name, full_fct): self.inp = open(header, 'r') - self.beg_pattern = re.compile(name) - self.end_pattern = re.compile('.*\)\s*;\s*$'); + self.beg_pattern = re.compile(name) + self.end_pattern = re.compile('.*\)\s*;\s*$'); self.array = [] self.parse() @@ -92,10 +92,10 @@ def norm_line(self, line): return re.sub(r'^\s+', r' ', line) def fix_comment_line(self, line): - return re.sub(r'\/\/.*', r'', line) + return re.sub(r'\/\/.*', r'', line) def remove_ret_line(self, line): - return re.sub(r'\n', r'', line) + return 
re.sub(r'\n', r'', line) # check for start record def is_start(self, record): @@ -107,7 +107,7 @@ def is_end(self, record): # check for declaration entry record def is_entry(self, record): - return re.match(r'^\s*HSAKMTAPI\s*(.*)\s*\((.*)\)', record) + return re.match(r'^\s*HSAKMTAPI\s*(.*)\s*\((.*)\)', record) # parse method def parse(self): @@ -121,7 +121,7 @@ def parse(self): line = self.norm_line(line) line = self.fix_comment_line(line) - if cumulate == 1: record += " " + line; + if cumulate == 1: record += " " + line; else: record = line; if self.is_start(line): rettype = prev_line.strip(); cumulate = 1; prev_line = line; continue; if self.is_end(line): record = self.remove_ret_line(record); cumulate = 0; active = 1; @@ -132,7 +132,7 @@ def parse(self): mycall_full = rettype + " " + m.group(1) + ' (' + m.group(2) + ')' mycall = m.group(1) self.full_fct[mycall] = mycall_full - self.array.append(mycall) + self.array.append(mycall) rettype = ""; prev_line = line @@ -173,7 +173,7 @@ def get_args(self, record): struct = {'ret': '', 'args': '', 'astr': {}, 'alst': [], 'tlst': []} record = re.sub(r'^\s+', r'', record) record = re.sub(r'\s*(\*+)\s*', r'\1 ', record) - rind = NextBlock(0, record) + rind = NextBlock(0, record) struct['ret'] = record[0:rind] pos = record.find('(') end = NextBlock(pos, record); @@ -184,7 +184,7 @@ def get_args(self, record): struct['args'] = re.sub(r',', r', ', args) if args == "void": return struct - + if len(args) == 0: return struct pos = 0 @@ -217,7 +217,7 @@ def get_args(self, record): # parse given api def parse(self, call, full_fct): - if call in full_fct: + if call in full_fct: self.data[call] = self.get_args(full_fct[call]) else: self.data[call] = self.get_args(call) @@ -238,7 +238,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.api_calls = {} self.api_rettypes = set() self.api_id = {} - + api_data = {} full_fct = {} api_list = [] @@ -271,7 +271,7 @@ def __init__(self, out_file, kfd_dir, api_headers, 
license): self.ns_calls = ns_calls self.content_h += "// automatically generated\n\n" + license + '\n' - + self.content_h += "/////////////////////////////////////////////////////////////////////////////\n" for call in self.ns_calls: self.content_h += '// ' + call + ' was not parsed\n' @@ -298,7 +298,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.content_h += 'namespace kfd_support {\n' self.add_section('API get_name function', ' ', self.gen_get_name) - self.add_section('API get_code function', ' ', self.gen_get_code) + self.add_section('API get_code function', ' ', self.gen_get_code) self.add_section('API intercepting code', '', self.gen_intercept_decl) self.add_section('API intercepting code', '', self.gen_intercept) @@ -369,7 +369,7 @@ def gen_id_enum(self, n, name, call, data): self.content_h += ' KFD_API_ID_NUMBER = ' + str(n) + ',\n' self.content_h += ' KFD_API_ID_ANY = ' + str(n + 1) + ',\n' self.content_h += '};\n' - + # generate API args structure def gen_arg_struct(self, n, name, call, struct): if n == -1: @@ -396,7 +396,7 @@ def gen_arg_struct(self, n, name, call, struct): else: self.content_h += ' } args;\n' self.content_h += '} kfd_api_data_t;\n' - + # generate API callbacks def gen_callbacks(self, n, name, call, struct): if n == -1: @@ -406,7 +406,7 @@ def gen_callbacks(self, n, name, call, struct): if call != '-': call_id = self.api_id[call]; ret_type = struct['ret'] - self.content_h += ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' # 'static ' + + self.content_h += ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' # 'static ' + self.content_h += ' if (' + name + '_table == NULL) intercept_KFDApiTable();\n' self.content_h += ' kfd_api_data_t api_data{};\n' for var in struct['alst']: @@ -448,7 +448,7 @@ def gen_intercept(self, n, name, call, struct): if call != '-': self.content_h += ' typedef decltype(' + name + '_table_t::' + call + '_fn) ' + call + '_t;\n' - self.content_h += ' ' + name + 
'_table->' + call + '_fn = (' + call + '_t)' + 'dlsym(RTLD_NEXT,\"' + call + '\");\n' + self.content_h += ' ' + name + '_table->' + call + '_fn = (' + call + '_t)' + 'dlsym(RTLD_NEXT,\"' + call + '\");\n' # generate API name function def gen_get_name(self, n, name, call, struct): @@ -493,7 +493,7 @@ def gen_out_stream(self, n, name, call, struct): arg_var = arg_list[ind] arg_val = 'api_data.args.' + call + '.' + arg_var if re.search(r'MemFlags',arg_var): - continue + continue self.content_h += ' typedef decltype(' + arg_val.replace("[]","") + ') arg_val_type_t' + str(ind) + ';\n' self.content_h += ' roctracer::kfd_support::output_streamer::put(out, ' + arg_val.replace("[]","") + ')' if ind < len(arg_list)-1: self.content_h += ' << ", ";\n' @@ -510,11 +510,11 @@ def gen_out_stream(self, n, name, call, struct): self.content_h += ' abort();\n' self.content_h += ' }\n' self.content_h += ' return out;\n' - self.content_h += '}\n' + self.content_h += '}\n' self.content_h += '#endif\n' - self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' + self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' - # generate PUBLIC_API for all API fcts + # generate PUBLIC_API for all API fcts def gen_public_api(self, n, name, call, struct): if n == -1: self.content_cpp += 'extern "C" {\n' @@ -540,21 +540,21 @@ def gen_public_api(self, n, name, call, struct): # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file = sys.stderr) + print ("Usage:", sys.argv[0], " ", file = sys.stderr) sys.exit(1) else: - ROOT = sys.argv[1] + '/' + PREFIX = sys.argv[1] + '/' KFD_DIR = sys.argv[2] + '/' descr = API_DescrParser(OUT_H, KFD_DIR, API_HEADERS_H, LICENSE) -out_file = ROOT + OUT_H +out_file = PREFIX + OUT_H print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content_h[:-1]) f.close() 
-out_file = ROOT + OUT_CPP +out_file = PREFIX + OUT_CPP print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content_cpp[:-1]) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ceb33c74..c794c491 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,19 @@ -# +# Generating tracing primitives +set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) +set ( GEN_SRC_DIR ${PROJECT_BINARY_DIR}/src ) +execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) +execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) + # Build dynamic Library object -# -set ( TARGET_LIB "${TARGET_NAME}" ) +set ( TARGET_LIB ${TARGET_NAME} ) set ( LIB_SRC ${LIB_DIR}/core/roctracer.cpp ${LIB_DIR}/proxy/proxy_queue.cpp @@ -10,33 +22,22 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) 
-target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) -# Generating HSA tracing primitives -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH}" ) - -# Generating KFD/Thunk tracing primitives +# Build KFD/Thunk tracing library set ( KFD_LIB "kfdwrapper64" ) -set ( KFD_LIB_SRC - ${LIB_DIR}/kfd/kfd_wrapper.cpp -) -execute_process ( COMMAND sh -xc "${CMAKE_CXX_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${PROJECT_BINARY_DIR}/hsakmttypes_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hsakmttypes_pp.h -out ${ROOT_DIR}/inc/kfd_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include -E ${HIP_PATH}/include/hip/hip_runtime_api.h > ${PROJECT_BINARY_DIR}/hip_runtime_api_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hip_runtime_api_pp.h -out ${ROOT_DIR}/inc/hip_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${PROJECT_BINARY_DIR}/hsa_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hsa_pp.h -out ${ROOT_DIR}/inc/hsa_ostream_ops.h" ) +set ( KFD_LIB_SRC ${GEN_SRC_DIR}/kfd_wrapper.cpp) add_library ( ${KFD_LIB} SHARED ${KFD_LIB_SRC} ) -target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ) 
+target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${KFD_LIB} PRIVATE c stdc++ ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${ROOT_DIR} ${HSA_KMT_INC_PATH}" ) +# Build ROCTX tracing library set ( ROCTX_LIB "roctx64" ) set ( ROCTX_LIB_SRC ${LIB_DIR}/roctx/roctx.cpp ${LIB_DIR}/roctx/roctx_intercept.cpp ) add_library ( ${ROCTX_LIB} SHARED ${ROCTX_LIB_SRC} ) -target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ) +target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${ROCTX_LIB} PRIVATE c stdc++ ) diff --git a/src/kfd/.gitignore b/src/kfd/.gitignore deleted file mode 100644 index 0c2acea7..00000000 --- a/src/kfd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -kfd_wrapper.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6a6d7d17..c1b56c0c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -50,6 +50,7 @@ set ( HSA_REV "a657002" ) set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) ## build HIP tests +set ( INC_PATH "${INC_PATH} ${PROJECT_BINARY_DIR}/inc" ) set ( TEST_ENV HIP_VDI=${HIP_VDI} ROCM_PATH=${ROCM_ROOT_DIR} HSA_PATH=${ROCM_ROOT_DIR}/hsa INC_PATH=${INC_PATH} LIB_PATH=${LIB_PATH} HIPCC_VERBOSE=3 ) add_custom_target( mytest COMMAND ${TEST_ENV} make -C "${TEST_DIR}/MatrixTranspose" @@ -75,7 +76,7 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ) + 
target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${PROJECT_BINARY_DIR}/inc ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) endif () diff --git a/test/MatrixTranspose/Makefile b/test/MatrixTranspose/Makefile index 647067dd..9a805fb1 100644 --- a/test/MatrixTranspose/Makefile +++ b/test/MatrixTranspose/Makefile @@ -23,7 +23,7 @@ EXECUTABLE=./MatrixTranspose all: clean $(EXECUTABLE) -CXXFLAGS =-g -I$(INC_PATH) -DLOCAL_BUILD=1 --rocm-path=$(ROCM_PATH) +CXXFLAGS =-g $(INC_PATH:%=-I%) -DLOCAL_BUILD=1 --rocm-path=$(ROCM_PATH) CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) diff --git a/test/MatrixTranspose_test/Makefile b/test/MatrixTranspose_test/Makefile index 3e879ee8..758f8d94 100644 --- a/test/MatrixTranspose_test/Makefile +++ b/test/MatrixTranspose_test/Makefile @@ -17,7 +17,7 @@ TARGET=hcc EXECUTABLE=./MatrixTranspose OBJECTS = MatrixTranspose.o -FLAGS =-g -I$(INC_PATH) -I$(ROCM_PATH)/hsa/include/hsa -I$(ROCM_PATH)/hsa/include -I$(ROCM_PATH)/hip/include -I$(ROCM_PATH)/include -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -DAMD_INTERNAL_BUILD=1 +FLAGS =-g $(INC_PATH:%=-I%) -I$(ROCM_PATH)/hsa/include/hsa -I$(ROCM_PATH)/hsa/include -I$(ROCM_PATH)/hip/include -I$(ROCM_PATH)/include -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -DAMD_INTERNAL_BUILD=1 ifeq ($(C_TEST), 1) COMP=${CC} diff --git a/test/run.sh b/test/run.sh index c5c8aa45..c5931061 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,4 +1,4 @@ -#!/bin/sh -x +#!/bin/sh ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
From 401c48b8b79f93b882e0bc863e6847544962a45f Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 16 Sep 2020 20:32:18 -0400 Subject: [PATCH 08/38] testing using v3 object Change-Id: Ifca31d632726ab83f4c672b46cd9b97f817e757d --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c1b56c0c..ce003b7c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,7 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "a657002" ) +set ( HSA_REV "19b1191" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) From 96ea2d613b07a7e0824741e6a104986130bc77e7 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 11 Sep 2020 10:29:06 -0500 Subject: [PATCH 09/38] HCC_HOME env cleanup Change-Id: I2b00e5d310e6349fc52d5df60aae85f4c06adebe --- README.md | 3 +-- build.sh | 1 - build_static.sh | 1 - cmake_modules/env.cmake | 8 +------- run_test.sh | 1 - 5 files changed, 2 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 8f3e8481..e700ee40 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,8 @@ rocTX API: - Set environment: export CMAKE_PREFIX_PATH=/opt/rocm - - To use custom HIP/HCC versions: + - To use custom HIP version: export HIP_PATH=/opt/rocm/hip - export HCC_HOME=/opt/rocm/hcc - To build roctracer library: export CMAKE_BUILD_TYPE= # release by default diff --git a/build.sh b/build.sh index cae5a59a..a5201275 100755 --- a/build.sh +++ b/build.sh @@ -17,7 +17,6 @@ if [ -e "$DEFAULTS" ] ; then source "$DEFAULTS"; fi if [ -z "$ROCTRACER_ROOT" ]; then ROCTRACER_ROOT=$SRC_DIR; fi if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$PWD; fi if [ -z "$HIP_PATH" ] ; then export HIP_PATH="$ROCM_PATH/hip"; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi if [ -z "$PACKAGE_PREFIX" ] ; then 
PACKAGE_PREFIX="$ROCM_PATH/$COMPONENT"; fi diff --git a/build_static.sh b/build_static.sh index 938df3ce..bb6ecf29 100755 --- a/build_static.sh +++ b/build_static.sh @@ -17,7 +17,6 @@ if [ -e "$DEFAULTS" ] ; then source "$DEFAULTS"; fi if [ -z "$ROCTRACER_ROOT" ]; then ROCTRACER_ROOT=$SRC_DIR; fi if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$PWD; fi if [ -z "$HIP_PATH" ] ; then export HIP_PATH="$ROCM_PATH/hip"; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi if [ -z "$PACKAGE_PREFIX" ] ; then PACKAGE_PREFIX="$ROCM_PATH/$COMPONENT"; fi diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 8dbf2c9c..41f6253a 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -89,7 +89,7 @@ else() set ( HIP_DEFINES "-D__HIP_PLATFORM_HCC__=1") endif() -## Enable HIP/HCC local build +## Enable HIP local build if ( DEFINED LOCAL_BUILD ) add_definitions ( -DLOCAL_BUILD=${LOCAL_BUILD} ) else() @@ -114,15 +114,10 @@ if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) endif() set ( ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} ) -set ( HCC_HOME "/opt/rocm/hcc" ) set ( HIP_PATH "/opt/rocm/hip" ) -if ( DEFINED ENV{HCC_HOME} ) - set ( HCC_HOME $ENV{HCC_HOME} ) -endif() if ( DEFINED ENV{HIP_PATH} ) set ( HIP_PATH $ENV{HIP_PATH} ) endif() -set ( HCC_INC_DIR "${HCC_HOME}/include" ) set ( HIP_INC_DIR "${HIP_PATH}/include" ) ## Extend Compiler flags based on build type @@ -170,7 +165,6 @@ message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) message ( "-------------KFD-Inc: ${HSA_KMT_INC_PATH}" ) -message ( "-------------HCC-Inc: ${HCC_INC_DIR}" ) message ( "-------------HIP-Inc: ${HIP_INC_DIR}" ) message ( "-------------HIP-VDI: ${HIP_VDI}" ) message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) diff 
--git a/run_test.sh b/run_test.sh index c2ea74a6..61f1b301 100755 --- a/run_test.sh +++ b/run_test.sh @@ -7,7 +7,6 @@ fatal() { } if [ -z "$BUILD_DIR" ] ; then export BUILD_DIR=$PWD; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi cd $BUILD_DIR ./run.sh From 28e4b8e014517dde5c12a98c4a08501dda964e87 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 25 Sep 2020 09:00:53 -0500 Subject: [PATCH 10/38] SWDEV-253997 : packaging fix: installing hsa_ostream_ops.h Change-Id: Ib739cbb7538473afc9744e12d2bd568635e78616 (cherry picked from commit 1d975e5ba587fdcb24e3ee0ae4b3ae9202a756de) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c06cf9..fe3cecde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,8 +106,9 @@ set ( PUBLIC_HEADERS set ( GEN_HEADERS hip_ostream_ops.h hsa_prof_str.h - kfd_ostream_ops.h + hsa_ostream_ops.h kfd_prof_str.h + kfd_ostream_ops.h ) if ( ${LIBRARY_TYPE} STREQUAL SHARED ) From 367e2c496dc0fedd156b73af8d64a5e64d5af778 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 1 Oct 2020 01:28:43 -0400 Subject: [PATCH 11/38] SWDEV-251491 : gen_ostream_ops.py porting to python3 Change-Id: I7081b6ad21b038040267067bd73d8a44df46e4ff --- script/gen_ostream_ops.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 73585ce8..c8f23629 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import os, sys, re import CppHeaderParser @@ -176,9 +176,9 @@ def gen_cppheader(infilepath, outfilepath, structs_depth): process_struct(f, c, cppHeader, "", apiname) global_str = "\n".join(global_str.split("\n")[0:-2]) if structs_depth != -1: #reindent - global_str = string.split(global_str, '\n') - global_str = [' ' + string.lstrip(line) for line in global_str] - global_str = string.join(global_str, '\n') + global_str = 
global_str.split('\n') + global_str = [' ' + line.lstrip() for line in global_str] + global_str = "\n".join(global_str) f.write(global_str+"\n") if structs_depth != -1: f.write(" };\n") @@ -199,9 +199,9 @@ def gen_cppheader(infilepath, outfilepath, structs_depth): process_struct(f, c, cppHeader, "", apiname) global_str = "\n".join(global_str.split("\n")[0:-2]) if structs_depth != -1: #reindent - global_str = string.split(global_str, '\n') - global_str = [' ' + string.lstrip(line) for line in global_str] - global_str = string.join(global_str, '\n') + global_str = global_str.split('\n') + global_str = [' ' + line.lstrip() for line in global_str] + global_str = "\n".join(global_str) f.write(global_str+"\n") if structs_depth != -1: f.write(" };\n") From fb0c230b0bf4fd822d01aa9c2633d12f0e15029c Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 22 Sep 2020 06:20:08 -0400 Subject: [PATCH 12/38] SWDEV-249924 : hip correlation id explicite propogation; tracer debug instrumentation; Change-Id: Ibbc411541f5610ce739f3fc1efa1ab7f605220f5 initial commmit Change-Id: I34b360be62c2083819dc5c3acc8268bd69f2f58a --- cmake_modules/env.cmake | 5 ++++- src/core/roctracer.cpp | 37 ++++++++++++++++++++++++------ src/util/logger.h | 18 ++++++++++++++- test/tool/tracer_tool.cpp | 47 ++++++++++++++++++++++++++++++--------- 4 files changed, 88 insertions(+), 19 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 41f6253a..405f2665 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -61,8 +61,11 @@ if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) endif() ## Enable debug trace +if ( DEFINED CMAKE_DEBUG_TRACE ) + add_definitions ( -DDEBUG_TRACE_ON=1 ) +endif() if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) - add_definitions ( -DDEBUG_TRACE=1 ) + add_definitions ( -DDEBUG_TRACE_ON=1 ) endif() if ( NOT DEFINED LIBRARY_TYPE ) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 52f1e28b..21f4b667 100644 --- a/src/core/roctracer.cpp +++ 
b/src/core/roctracer.cpp @@ -95,6 +95,7 @@ THE SOFTWARE. static inline uint32_t GetPid() { return syscall(__NR_getpid); } +static inline uint32_t GetTid() { return syscall(__NR_gettid); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Mark callback @@ -294,13 +295,19 @@ static inline void CorrelationIdRegistr(const activity_correlation_id_t& correla if (correlation_id_map == NULL) correlation_id_map = new correlation_id_map_t; const auto ret = correlation_id_map->insert({correlation_id, correlation_id_tls}); if (ret.second == false) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id is not unique(" << correlation_id << ")"); + + DEBUG_TRACE("CorrelationIdRegistr id(%lu) id_tls(%lu)\n", correlation_id, correlation_id_tls); } static inline activity_correlation_id_t CorrelationIdLookup(const activity_correlation_id_t& correlation_id) { auto it = correlation_id_map->find(correlation_id); if (correlation_id_wait) while (it == correlation_id_map->end()) it = correlation_id_map->find(correlation_id); if (it == correlation_id_map->end()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")"); - return it->second; + const activity_correlation_id_t ret_val = it->second; + + DEBUG_TRACE("CorrelationIdLookup id(%lu) ret(%lu)\n", correlation_id, ret_val); + + return ret_val; } typedef std::mutex hip_activity_mutex_t; @@ -341,6 +348,7 @@ void* HIP_SyncApiDataCallback( const void* callback_data, void* arg) { + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); MemoryPool* pool = reinterpret_cast(arg); @@ -375,16 +383,20 @@ void* HIP_SyncApiDataCallback( // Passing correlatin ID correlation_id_tls = correlation_id; - return data_ptr; + ret = data_ptr; } else { // popping the record entry if (!record_pair_stack.empty()) record_pair_stack.pop(); // Clearing correlatin ID correlation_id_tls = 0; - - return NULL; } + + 
const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); + DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0); + + return ret; } void* HIP_SyncActivityCallback( @@ -395,6 +407,7 @@ void* HIP_SyncActivityCallback( { static hsa_rt_utils::Timer timer; + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); MemoryPool* pool = reinterpret_cast(arg); @@ -436,7 +449,7 @@ void* HIP_SyncActivityCallback( // Passing correlatin ID correlation_id_tls = correlation_id; - return data_ptr; + ret = data_ptr; } else { if (pool == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: pool is NULL"); @@ -469,9 +482,13 @@ void* HIP_SyncActivityCallback( // Clearing correlatin ID correlation_id_tls = 0; - - return NULL; } + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); + DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0); + + return ret; } void HCC_ActivityIdCallback(activity_correlation_id_t correlation_id) { @@ -484,6 +501,10 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { record_ptr->domain = ACTIVITY_DOMAIN_HCC_OPS; record_ptr->correlation_id = CorrelationIdLookup(record_ptr->correlation_id); pool->Write(*record_ptr); + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind); + DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d)\n", + name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id); } // Open output file @@ -673,6 +694,8 @@ PUBLIC_API const char* roctracer_op_string( return roctracer::HipLoader::Instance().ApiName(op); case ACTIVITY_DOMAIN_KFD_API: return roctracer::kfd_support::GetApiName(op); + case ACTIVITY_DOMAIN_EXT_API: + return "EXT_API"; default: EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")"); } diff --git a/src/util/logger.h b/src/util/logger.h index cd8dd470..8e525f68 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -100,10 +100,10 @@ class Logger { return *obj; } - private: static uint32_t GetPid() { return syscall(__NR_getpid); } static uint32_t GetTid() { return syscall(__NR_gettid); } + private: Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { const char* path = getenv("ROCTRACER_LOG"); if (path != NULL) { @@ -198,4 +198,20 @@ class Logger { } while(0) #endif +#if DEBUG_TRACE_ON +inline static void DEBUG_TRACE(const char* fmt, ...) { + constexpr int size = 256; + char buf[size]; + + va_list valist; + va_start(valist, fmt); + vsnprintf(buf, size, fmt, valist); + printf("%u:%u %s", + roctracer::util::Logger::GetPid(), roctracer::util::Logger::GetTid(), buf); fflush(stdout); + va_end(valist); +} +#else +inline static void DEBUG_TRACE(const char* fmt, ...) 
{} +#endif + #endif // SRC_UTIL_LOGGER_H_ diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index ad866012..05855fd0 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include /* names denangle */ #include #include +#include #include #include #include /* SYS_xxx definitions */ @@ -78,6 +79,24 @@ THE SOFTWARE. #define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") +static inline uint32_t GetPid() { return syscall(__NR_getpid); } +static inline uint32_t GetTid() { return syscall(__NR_gettid); } + +#if DEBUG_TRACE_ON +inline static void DEBUG_TRACE(const char* fmt, ...) { + constexpr int size = 256; + char buf[size]; + + va_list valist; + va_start(valist, fmt); + vsnprintf(buf, size, fmt, valist); + printf("%u:%u %s", GetPid(), GetTid(), buf); fflush(stdout); + va_end(valist); +} +#else +inline static void DEBUG_TRACE(const char* fmt, ...) {} +#endif + typedef hsa_rt_utils::Timer::timestamp_t timestamp_t; hsa_rt_utils::Timer* timer = NULL; thread_local timestamp_t hsa_begin_timestamp = 0; @@ -125,9 +144,6 @@ void close_file_handles() { if (pc_sample_file_handle) close_output_file(pc_sample_file_handle); } -static inline uint32_t GetPid() { return syscall(__NR_getpid); } -static inline uint32_t GetTid() { return syscall(__NR_gettid); } - static const uint32_t my_pid = GetPid(); // Error handler @@ -378,19 +394,20 @@ void hip_api_callback( { (void)arg; const hip_api_data_t* data = reinterpret_cast(callback_data); + const timestamp_t timestamp = timer->timestamp_fn_ns(); + hip_api_trace_entry_t* entry = NULL; if (data->phase == ACTIVITY_API_PHASE_ENTER) { - hip_begin_timestamp = timer->timestamp_fn_ns(); + hip_begin_timestamp = timestamp; } else { // Post onit of HIP APU args hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); - const timestamp_t end_timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); + entry 
= hip_api_trace_buffer->GetEntry(); entry->cid = cid; entry->domain = domain; entry->begin = hip_begin_timestamp; - entry->end = end_timestamp; + entry->end = timestamp; entry->pid = GetPid(); entry->tid = GetTid(); entry->data = *data; @@ -435,6 +452,10 @@ void hip_api_callback( entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } + + const char * name = roctracer_op_string(domain, cid, 0); + DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu)\n", + name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id); } void mark_api_callback( @@ -465,12 +486,10 @@ hip_kernel_map_t* hip_kernel_map = NULL; std::mutex hip_kernel_mutex; void hip_api_flush_cb(hip_api_trace_entry_t* entry) { - static uint64_t correlation_id = 0; - correlation_id += 1; - const uint32_t domain = entry->domain; const uint32_t cid = entry->cid; const hip_api_data_t* data = &(entry->data); + const uint64_t correlation_id = data->correlation_id; const timestamp_t begin_timestamp = entry->begin; const timestamp_t end_timestamp = entry->end; std::ostringstream rec_ss; @@ -480,6 +499,10 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { rec_ss << std::dec << begin_timestamp << ":" << end_timestamp << " " << entry->pid << ":" << entry->tid; oss << std::dec << rec_ss.str() << " " << str; + const char * name = roctracer_op_string(entry->domain, entry->cid, 0); + DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu))\n", + name, entry->domain, entry->cid, entry, entry->name, correlation_id); + if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING if (hip_api_stats != NULL) { @@ -496,6 +519,7 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { const char* kernel_name = cxx_demangle(entry->name); rec_ss << " kernel=" << kernel_name; } + rec_ss<< " :" << correlation_id; fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); 
} #else // !HIP_PROF_HIP_API_STRING @@ -607,6 +631,9 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { while (record < end_record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); + DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu)\n", + name, record->domain, record->op, record->kind, record, record->correlation_id); + switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: if (hip_memcpy_stats != NULL) { From 8bb2d3095ea86edb5b306685a38f5cc6654c1ddb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 30 Oct 2020 02:09:09 -0500 Subject: [PATCH 13/38] SWDEV-258731 : cleanup Change-Id: I0bc4ca977ce44f864178e78ec339888f86cbed8a --- src/core/roctracer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 21f4b667..a50f2d3d 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -95,7 +95,6 @@ THE SOFTWARE. static inline uint32_t GetPid() { return syscall(__NR_getpid); } -static inline uint32_t GetTid() { return syscall(__NR_gettid); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Mark callback From 9f02bb977f464e4edee44f2828b31a39073d1a01 Mon Sep 17 00:00:00 2001 From: Pruthvi Madugundu Date: Fri, 5 Jun 2020 00:56:19 -0700 Subject: [PATCH 14/38] Add RUNPATH to libtracer_tool.so - All libs will have RUNPATH - libtracer_tool.so is added with RUNPATH based on ROCM_RPATH when defined else not set. 
Signed-off-by: Pruthvi Madugundu Change-Id: I6515e603c82e1360e03eca2967f6a85e5faadc9a --- cmake_modules/env.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 405f2665..f7824148 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -53,6 +53,11 @@ set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) +## Set RUNPATH if ROCM_RPATH is defined and passed by the environment +if ( DEFINED ROCM_RPATH ) + set ( CMAKE_SHARED_LINKER_FLAGS " -Wl,--enable-new-dtags -Wl,--rpath,${ROCM_RPATH} ${CMAKE_SHARED_LINKER_FLAGS}" ) +endif () + set ( CMAKE_SKIP_BUILD_RPATH TRUE ) ## CLANG options From 73fb6ea9a6ce06f16966e55ea3bf5b03fc0f6981 Mon Sep 17 00:00:00 2001 From: Cole Nelson Date: Fri, 16 Oct 2020 12:20:40 -0700 Subject: [PATCH 15/38] CMakeList.txt: conformant package names Still needs valid email ID in the form of package_name.support@amd.com. SWDEV-257322 Names complete as built (internal) : roctracer-dev_1.0.0.40000-crdnnv.444_amd64.deb roctracer-dev-1.0.0.40000-crdnnv.444.el7.x86_64.rpm These changes are to satisfy: http://confluence.amd.com/display/GPUCPT/Package+File+Naming Change-Id: I5991326eb87d7dfa1304e3b2c5afb78f5a0c0361 Signed-off-by: Cole Nelson (cherry picked from commit 16ad4e9de702f8eeb2d7eb2cbb0db493d21af1bc) --- CMakeLists.txt | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe3cecde..f5c1d73c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,18 +177,49 @@ install ( TARGETS "kfdwrapper64" LIBRARY DESTINATION ${DEST_NAME}/lib ) ## Packaging directives set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) set ( CPACK_PACKAGE_NAME "${ROCTRACER_NAME}-dev" ) -set ( CPACK_PACKAGE_VENDOR "AMD" ) +set ( CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." 
) set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) -set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) +set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ) +if ( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) + set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}" ) +endif() +message ( "-- CPACK_PACKAGE_VERSION: ${CPACK_PACKAGE_VERSION}" ) +set ( CPACK_PACKAGE_CONTACT "TODO " ) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD ROCTRACER library" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables +if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) + set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) +else() + set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) +set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables +if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) + set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) +else() + set ( CPACK_RPM_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_RPM_PACKAGE_RELEASE ${CPACK_RPM_PACKAGE_RELEASE}" ) + +## 'dist' breaks manual builds on debian systems due to empty Provides +execute_process( COMMAND rpm --eval %{?dist} + RESULT_VARIABLE PROC_RESULT + OUTPUT_VARIABLE EVAL_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE ) +message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}") + +if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) + string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) +endif() +set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) 
+message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) From 0876c253d8b6380a1d6bd1f2b34c5b4c1e8c94db Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 1 Oct 2020 05:58:50 -0400 Subject: [PATCH 16/38] SWDEV-251491 : disabling hipModuleUnload tracing which is called on exit Change-Id: I99c22eec3fea6ac8820d574c44df099febdd27c4 (cherry picked from commit bb8f2f67858d68b13b00696b2798857676201210) --- test/tool/tracer_tool.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 05855fd0..d6be6f4f 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1092,6 +1092,7 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + ROCTRACER_CALL(roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipModuleUnload)); if (is_stats_opt) { const char* path = NULL; From 18c83ea763439f9c9ab9a39ce296920c4fb99a56 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 23 Sep 2020 02:52:06 -0400 Subject: [PATCH 17/38] hip library loader check Change-Id: I34957db88932e1ed725a0a0d8ca9a66fecc92e38 (cherry picked from commit 9061c4ea414b78b71fa1dc0d0869b7519b366e73) --- src/core/loader.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/loader.h b/src/core/loader.h index 5d6e0d6c..d1f2ef01 100644 --- a/src/core/loader.h +++ b/src/core/loader.h @@ -291,7 +291,6 @@ typedef HipLoaderShared HipLoader; template bool roctracer::BaseLoader::to_check_symb_ = true; \ template<> const char* roctracer::RocpLoader::lib_name_ = "librocprofiler64.so"; \ template<> const char* 
roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \ - template<> bool roctracer::HccLoader::to_check_open_ = false; \ template<> const char* roctracer::KfdLoader::lib_name_ = "libkfdwrapper64.so"; \ template<> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \ template<> bool roctracer::RocTxLoader::to_load_ = true; @@ -302,8 +301,7 @@ typedef HipLoaderShared HipLoader; roctracer::HipLoaderStatic::instance_t roctracer::HipLoaderStatic::instance_{}; #else #define LOADER_INSTANTIATE_HIP() \ - template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; \ - template<> bool roctracer::HipLoaderShared::to_check_open_ = false; + template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; #endif #if HIP_VDI From d8d7137e185bc5da67514b3c88e607418aa582bf Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 3 Sep 2020 04:01:28 -0500 Subject: [PATCH 18/38] SWDEV-213367 : codeobj event implementation Change-Id: Ibcaca6869ce96d8802c5fa8ba241f43834d6f2a7 update - codeobj event implementation Change-Id: I4c12f26a19f2b31d9ac2211c3426a0e587a332b3 update2 - codeobj event implementation Change-Id: Ic877549a83542ae00352503471d881e847ebac9c test - codeobj event implementation Change-Id: I0618d3a93de94c3d7467372ba4a3d4ea5520bfc7 URI reference test - codeobj event implementation Change-Id: I6cf7e8a648cf012cb0708058b118a75e58f992b9 adding test/app - codeobj event implementation Change-Id: Idf4c197c7b9116ccde5ec50ff47a26a858bfab32 uri test fix - codeobj event implementation Change-Id: I7c385f82f516d9d8f2cd726366f00be3664006e3 uri test cleanup - codeobj event implementation Change-Id: I542d5baf88c048c8b4717af843b803cd93e8f3bc URI buffer fix - codeobj event implementation Change-Id: Iac65e04c03a0939935c10f53c6b580a2e33878f5 HSA events tests trace-check disabled Change-Id: I0f4d13aeeceb1d1a6e2191673eacbf9c7ae2ae52 --- cmake_modules/env.cmake | 7 +- inc/roctracer_hsa.h | 3 +- src/CMakeLists.txt | 9 +- src/core/loader.h | 9 + 
src/core/roctracer.cpp | 20 +- test/CMakeLists.txt | 19 +- test/app/codeobj_test.cpp | 89 ++ test/app/hsaco_test.cpp | 134 ++ .../MatrixTranspose_hip_flush_trace.txt | 1315 +++++++++++++---- test/golden_traces/tests_trace_cmp_levels.txt | 2 + test/run.sh | 7 + 11 files changed, 1297 insertions(+), 317 deletions(-) create mode 100644 test/app/codeobj_test.cpp create mode 100644 test/app/hsaco_test.cpp diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index f7824148..3f5dec60 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -156,10 +156,10 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH "${HSA_RUNTIME_LIB}" DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH "${HSA_KMT_LIB}" DIRECTORY ) -get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) - set ( HSA_KMT_INC_PATH "${HSA_KMT_LIB_PATH}/../include" ) -set ( ROCM_INC_PATH "${HSA_KMT_INC_PATH}" ) + +get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) +set ( ROCM_INC_PATH "${ROCM_ROOT_DIR}/include" ) ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) @@ -172,6 +172,7 @@ message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) +message ( "-------ROCM_INC_PATH: ${ROCM_INC_PATH}" ) message ( "-------------KFD-Inc: ${HSA_KMT_INC_PATH}" ) message ( "-------------HIP-Inc: ${HIP_INC_DIR}" ) message ( "-------------HIP-VDI: ${HIP_VDI}" ) diff --git a/inc/roctracer_hsa.h b/inc/roctracer_hsa.h index b9b0cf98..d9daa5e5 100644 --- a/inc/roctracer_hsa.h +++ b/inc/roctracer_hsa.h @@ -27,6 +27,7 @@ THE SOFTWARE. 
#include #include +#include // HSA OP ID enumeration enum hsa_op_id_t { @@ -34,7 +35,7 @@ enum hsa_op_id_t { HSA_OP_ID_COPY = 1, HSA_OP_ID_BARRIER = 2, HSA_OP_ID_RESERVED1 = 3, - HSA_OP_ID_NUMBER = 4 + HSA_OP_ID_NUMBER }; #ifdef __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c794c491..e9c72f84 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,9 @@ execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) +execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) +execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) # Build dynamic Library object set ( TARGET_LIB ${TARGET_NAME} ) @@ -22,14 +25,14 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${ROCM_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) # Build KFD/Thunk tracing library set ( KFD_LIB "kfdwrapper64" ) set ( KFD_LIB_SRC ${GEN_SRC_DIR}/kfd_wrapper.cpp) 
add_library ( ${KFD_LIB} SHARED ${KFD_LIB_SRC} ) -target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${KFD_LIB} PRIVATE c stdc++ ) # Build ROCTX tracing library @@ -39,5 +42,5 @@ set ( ROCTX_LIB_SRC ${LIB_DIR}/roctx/roctx_intercept.cpp ) add_library ( ${ROCTX_LIB} SHARED ${ROCTX_LIB_SRC} ) -target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${ROCTX_LIB} PRIVATE c stdc++ ) diff --git a/src/core/loader.h b/src/core/loader.h index d1f2ef01..946521d1 100644 --- a/src/core/loader.h +++ b/src/core/loader.h @@ -94,6 +94,10 @@ class RocpApi { EnableCallback_t* EnableActivityCallback; NameCallback_t* GetOpName; + RegisterCallback_t* RegisterEvtCallback; + OperateCallback_t* RemoveEvtCallback; + NameCallback_t* GetEvtName; + protected: void init(Loader* loader) { RegisterApiCallback = loader->GetFun("RegisterApiCallback"); @@ -101,6 +105,10 @@ class RocpApi { InitActivityCallback = loader->GetFun("InitActivityCallback"); EnableActivityCallback = loader->GetFun("EnableActivityCallback"); GetOpName = loader->GetFun("GetOpName"); + + RegisterEvtCallback = loader->GetFun("RegisterEvtCallback"); + RemoveEvtCallback = loader->GetFun("RemoveEvtCallback"); + GetEvtName = loader->GetFun("GetEvtName"); } }; @@ -290,6 +298,7 @@ typedef HipLoaderShared HipLoader; template bool roctracer::BaseLoader::to_check_open_ = true; \ template bool roctracer::BaseLoader::to_check_symb_ = true; \ template<> const char* roctracer::RocpLoader::lib_name_ 
= "librocprofiler64.so"; \ + template<> bool roctracer::RocpLoader::to_load_ = true; \ template<> const char* roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \ template<> const char* roctracer::KfdLoader::lib_name_ = "libkfdwrapper64.so"; \ template<> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \ diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index a50f2d3d..21203f91 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -685,6 +685,8 @@ PUBLIC_API const char* roctracer_op_string( switch (domain) { case ACTIVITY_DOMAIN_HSA_API: return roctracer::hsa_support::GetApiName(op); + case ACTIVITY_DOMAIN_HSA_EVT: + return roctracer::RocpLoader::Instance().GetEvtName(op); case ACTIVITY_DOMAIN_HSA_OPS: return roctracer::RocpLoader::Instance().GetOpName(op); case ACTIVITY_DOMAIN_HCC_OPS: @@ -730,6 +732,7 @@ static inline uint32_t get_op_num(const uint32_t& domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: return HSA_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_API: return HSA_API_ID_NUMBER; + case ACTIVITY_DOMAIN_HSA_EVT: return HSA_EVT_ID_NUMBER; case ACTIVITY_DOMAIN_HCC_OPS: return HIP_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_NUMBER; case ACTIVITY_DOMAIN_KFD_API: return KFD_API_ID_NUMBER; @@ -759,13 +762,18 @@ static roctracer_status_t roctracer_enable_callback_fun( #if 0 if (op == HSA_API_ID_DISPATCH) { const bool succ = roctracer::RocpLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data); - if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::EnableActivityCallback error(" << op << ") failed"); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterApiCallback error(" << op << ") failed"); break; } #endif roctracer::hsa_support::cb_table.set(op, callback, user_data); break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + const bool succ = roctracer::RocpLoader::Instance().RegisterEvtCallback(op, (void*)callback, user_data); + if (succ == false) 
HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterEvtCallback error(" << op << ") failed"); + break; + } case ACTIVITY_DOMAIN_HCC_OPS: break; case ACTIVITY_DOMAIN_HIP_API: { if (roctracer::HipLoader::Instance().Enabled() == false) break; @@ -874,6 +882,11 @@ static roctracer_status_t roctracer_disable_callback_fun( } break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + const bool succ = roctracer::RocpLoader::Instance().RemoveEvtCallback(op); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RemoveEvtCallback error(" << op << ") failed"); + break; + } case ACTIVITY_DOMAIN_ROCTX: { if (roctracer::RocTxLoader::Instance().Enabled()) { const bool suc = roctracer::RocTxLoader::Instance().RemoveApiCallback(op); @@ -983,6 +996,7 @@ static roctracer_status_t roctracer_enable_activity_fun( break; } case ACTIVITY_DOMAIN_HSA_API: break; + case ACTIVITY_DOMAIN_HSA_EVT: break; case ACTIVITY_DOMAIN_KFD_API: break; case ACTIVITY_DOMAIN_HCC_OPS: { const bool init_phase = (roctracer::HccLoader::GetRef() == NULL); @@ -1079,6 +1093,7 @@ static roctracer_status_t roctracer_disable_activity_fun( break; } case ACTIVITY_DOMAIN_HSA_API: break; + case ACTIVITY_DOMAIN_HSA_EVT: break; case ACTIVITY_DOMAIN_KFD_API: break; case ACTIVITY_DOMAIN_HCC_OPS: { if (roctracer::HccLoader::Instance().Enabled() == false) break; @@ -1249,6 +1264,9 @@ PUBLIC_API roctracer_status_t roctracer_set_properties( roctracer::kfd_support::intercept_KFDApiTable(); break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + break; + } case ACTIVITY_DOMAIN_HSA_API: { // HSA API properties HsaApiTable* table = reinterpret_cast(properties); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ce003b7c..148c60b0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -50,7 +50,8 @@ set ( HSA_REV "19b1191" ) set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) ## build HIP tests -set ( INC_PATH "${INC_PATH} ${PROJECT_BINARY_DIR}/inc" ) +set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) +set ( INC_PATH "${INC_PATH} 
${GEN_INC_DIR}" ) set ( TEST_ENV HIP_VDI=${HIP_VDI} ROCM_PATH=${ROCM_ROOT_DIR} HSA_PATH=${ROCM_ROOT_DIR}/hsa INC_PATH=${INC_PATH} LIB_PATH=${LIB_PATH} HIPCC_VERBOSE=3 ) add_custom_target( mytest COMMAND ${TEST_ENV} make -C "${TEST_DIR}/MatrixTranspose" @@ -76,10 +77,24 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${PROJECT_BINARY_DIR}/inc ) + target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) endif () +## Build hsaco_test.cpp referenc test +set ( CO_LIB_NAME "hsaco_test" ) +set ( CO_LIB_SRC ${TEST_DIR}/app/hsaco_test.cpp ) +add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) +target_include_directories ( ${CO_LIB_NAME} PRIVATE ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${CO_LIB_NAME} ${HSA_RUNTIME_LIB} c stdc++ ) + +## Build codeobj event test +set ( CO_LIB_NAME "codeobj_test" ) +set ( CO_LIB_SRC ${TEST_DIR}/app/codeobj_test.cpp ) +add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) +target_include_directories ( ${CO_LIB_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${GEN_INC_DIR} ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ) +target_link_libraries ( ${CO_LIB_NAME} ${ROCTRACER_TARGET} c stdc++ ) + ## Build HSA test execute_process ( COMMAND sh -xc "if [ ! 
-e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) diff --git a/test/app/codeobj_test.cpp b/test/app/codeobj_test.cpp new file mode 100644 index 00000000..086bcfb6 --- /dev/null +++ b/test/app/codeobj_test.cpp @@ -0,0 +1,89 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include + +#include "inc/roctracer.h" +#include "inc/roctracer_hsa.h" +#include + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +// Check returned HSA API status +void check_status(roctracer_status_t status) { + if (status != ROCTRACER_STATUS_SUCCESS) { + const char* error_string = roctracer_error_string(); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// codeobj callback +void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) { + const hsa_evt_data_t* evt_data = reinterpret_cast(data); + const uint32_t uri_length = evt_data->codeobj.uri_length; + const char* uri = evt_data->codeobj.uri; + printf("codeobj_callback domain(%u) cid(%u): load_delta(0x%lx) load_size(0x%lx) uri_length(%u) uri(\"%s\")\n", + domain, + cid, + evt_data->codeobj.load_delta, + evt_data->codeobj.load_size, + uri_length, + uri); + fflush(stdout); +} + +void initialize() { + roctracer_status_t status = roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, codeobj_callback, NULL); + check_status(status); +} + +void cleanup() { + roctracer_status_t status = roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_EVT); + check_status(status); +} + +// Tool constructor +extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { + // Enable HSA events intercepting + settings->hsa_intercepting = 1; + // Initialize profiling + initialize(); +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + // Final resources cleanup + cleanup(); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + OnUnloadTool(); +} diff --git a/test/app/hsaco_test.cpp b/test/app/hsaco_test.cpp new file mode 
100644 index 00000000..0f2e42ad --- /dev/null +++ b/test/app/hsaco_test.cpp @@ -0,0 +1,134 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +#define HSA_RT(call) \ + do { \ + const hsa_status_t status = call; \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("error \"%s\"\n", #call); fflush(stdout); \ + abort(); \ + } \ + } while(0) + +// HSA API intercepting primitives +decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; +hsa_ven_amd_loader_1_01_pfn_t loader_api_table{}; + +hsa_status_t code_object_callback( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void* arg) +{ + printf("code_object_callback\n"); fflush(stdout); + + uint64_t load_size = 0; + uint64_t load_delta = 0; + uint32_t uri_len = 0; + char* uri_str = NULL; + + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &load_size)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &load_delta)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &uri_len)); + + uri_str = (char*)calloc(uri_len + 1, sizeof(char)); + if (!uri_str) { + perror("calloc"); + abort(); + } + + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, + uri_str)); + + printf("load_size(0x%lx)\n", load_size); fflush(stdout); + printf("load_delta(0x%lx)\n", load_delta); fflush(stdout); + printf("uri_len(%u)\n", uri_len); fflush(stdout); + printf("uri_str(\"%s\")\n", uri_str); fflush(stdout); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t 
hsa_executable_freeze_interceptor( + hsa_executable_t executable, + const char *options) +{ + HSA_RT(loader_api_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + executable, + code_object_callback, + NULL)); + HSA_RT(hsa_executable_freeze_fn( + executable, + options)); + return HSA_STATUS_SUCCESS; +} + +// HSA-runtime tool on-load method +extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, + uint64_t runtime_version, + uint64_t failed_tool_count, + const char* const* failed_tool_names) +{ + printf("OnLoad: begin\n"); fflush(stdout); + // intercepting hsa_executable_freeze API + hsa_executable_freeze_fn = table->core_->hsa_executable_freeze_fn; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + // Fetching AMD Loader HSA extension API + HSA_RT(hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(hsa_ven_amd_loader_1_01_pfn_t), + &loader_api_table)); + printf("OnLoad: end\n"); fflush(stdout); + return true; +} + +extern "C" PUBLIC_API void OnUnload() { + printf("OnUnload\n"); fflush(stdout); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + printf("destructor\n"); fflush(stdout); +} diff --git a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt index 27ef8e95..6f0c4d17 100644 --- a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt +++ b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt @@ -1,25 +1,58 @@ -+ ROCP_FLUSH_RATE=100000 ./test/MatrixTranspose -ROCTracer (pid=1991): +ROCTracer (pid=14696): ROCTracer: trace control flush rate(100000us) -3802701299772587 +129855595266140 HIP-trace() -Device name Device 687f +Device name Device 738c ## Iteration (99) ################# -3802701304199730:3802701304207180 1991:1991 hipGetDeviceProperties(props=, device=0) -3802701305255618:3802701305368889 1991:1991 
hipMalloc(ptr=0x7fce16e0dec3, size=4194304) -3802701305370969:3802701305429809 1991:1991 hipMalloc(ptr=0x7fffc1295178, size=4194304) +129855603476896:129855603483734 14696:14696 hipGetDeviceProperties(props={}, device=0) :1 +129855604686134:129855605152950 14696:14696 hipMalloc(ptr=0x7fd65ce00000, size=4194304) :2 +129855605160451:129855605528247 14696:14696 hipMalloc(ptr=0x7fd65c800000, size=4194304) :3 PASSED! ## Iteration (98) ################# -3802701580515709:3802701582582904 0:0 CopyHostToDevice:4:1991 -3802701583225872:3802701584425191 0:0 KernelExecution:8:1991 -3802701583217109:3802701586447303 0:0 CopyDeviceToHost:10:1991 -3802701594795564:3802701596533727 0:0 CopyHostToDevice:11:1991 -3802701596646592:3802701597848875 0:0 KernelExecution:15:1991 -3802701596604988:3802701599522360 0:0 CopyDeviceToHost:17:1991 PASSED! ## Iteration (97) ################# PASSED! ## Iteration (96) ################# +129855955913848:129855957428192 0:0 CopyHostToDevice:4:14696 +129855958763342:129855959991823 0:0 KernelExecution:8:14696 +129855958734601:129855961705377 0:0 CopyDeviceToHost:10:14696 +129855971471522:129855972254607 0:0 CopyHostToDevice:11:14696 +129855972381516:129855973633356 0:0 KernelExecution:15:14696 +129855972673800:129855974135421 0:0 CopyDeviceToHost:17:14696 +129855980290261:129855981019714 0:0 CopyHostToDevice:18:14696 +129855981112002:129855982336482 0:0 KernelExecution:22:14696 +129855981076333:129855982783351 0:0 CopyDeviceToHost:24:14696 +129855988849671:129855989612220 0:0 CopyHostToDevice:25:14696 +129855989696159:129855990920319 0:0 KernelExecution:29:14696 +129855989668256:129855991384209 0:0 CopyDeviceToHost:31:14696 +129855605540988:129855957443403 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :4 +129855957456260:129855957456261 14696:14696 MARK(name(before HIP LaunchKernel)) +129855957507034:129855957514510 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :6 
+129855957521000:129855957523014 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :7 +129855957529950:129855958671150 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :8 +129855958701410:129855958701411 14696:14696 MARK(name(after HIP LaunchKernel)) +129855958708321:129855961719221 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :10 +129855971408776:129855972257972 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :11 +129855972261515:129855972261516 14696:14696 MARK(name(before HIP LaunchKernel)) +129855972266736:129855972268234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :13 +129855972271629:129855972272780 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :14 +129855972276181:129855972282118 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :15 +129855972663504:129855972663505 14696:14696 MARK(name(after HIP LaunchKernel)) +129855972666015:129855974143463 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :17 +129855980222888:129855981023250 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :18 +129855981025473:129855981025474 14696:14696 MARK(name(before HIP LaunchKernel)) +129855981028834:129855981029831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :20 +129855981032043:129855981032913 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :21 +129855981035237:129855981038997 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :22 +129855981041265:129855981041266 14696:14696 MARK(name(after HIP LaunchKernel)) +129855981043695:129855982796928 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :24 +129855988764565:129855989615901 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :25 +129855989618073:129855989618074 14696:14696 MARK(name(before HIP LaunchKernel)) +129855989621096:129855989622129 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :27 +129855989624243:129855989625087 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :28 +129855989627271:129855989630934 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :29 +129855989632959:129855989632960 14696:14696 MARK(name(after HIP LaunchKernel)) +129855989635351:129855991396402 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :31 PASSED! ## Iteration (95) ################# PASSED! @@ -32,30 +65,6 @@ PASSED! ## Iteration (91) ################# PASSED! 
## Iteration (90) ################# -3802701606826614:3802701608688328 0:0 CopyHostToDevice:18:1991 -3802701608781496:3802701609988668 0:0 KernelExecution:22:1991 -3802701608758548:3802701611510159 0:0 CopyDeviceToHost:24:1991 -3802701618702082:3802701620571865 0:0 CopyHostToDevice:25:1991 -3802701620675087:3802701621878110 0:0 KernelExecution:29:1991 -3802701620650876:3802701623502597 0:0 CopyDeviceToHost:31:1991 -3802701630690881:3802701632557164 0:0 CopyHostToDevice:32:1991 -3802701632661061:3802701633864973 0:0 KernelExecution:36:1991 -3802701632637885:3802701635182424 0:0 CopyDeviceToHost:38:1991 -3802701642392578:3802701644307152 0:0 CopyHostToDevice:39:1991 -3802701644410516:3802701645608650 0:0 KernelExecution:43:1991 -3802701644387082:3802701647064112 0:0 CopyDeviceToHost:45:1991 -3802701654288485:3802701656163049 0:0 CopyHostToDevice:46:1991 -3802701656267334:3802701657467098 0:0 KernelExecution:50:1991 -3802701656244070:3802701658916870 0:0 CopyDeviceToHost:52:1991 -3802701666450396:3802701668378780 0:0 CopyHostToDevice:53:1991 -3802701668482438:3802701669683832 0:0 KernelExecution:57:1991 -3802701668458481:3802701671148361 0:0 CopyDeviceToHost:59:1991 -3802701678631556:3802701680505490 0:0 CopyHostToDevice:60:1991 -3802701680609945:3802701681806894 0:0 KernelExecution:64:1991 -3802701680586811:3802701683591443 0:0 CopyDeviceToHost:66:1991 -3802701691032768:3802701692918102 0:0 CopyHostToDevice:67:1991 -3802701693021896:3802701694223438 0:0 KernelExecution:71:1991 -3802701692999202:3802701695886464 0:0 CopyDeviceToHost:73:1991 PASSED! ## Iteration (89) ################# PASSED! @@ -68,36 +77,132 @@ PASSED! ## Iteration (85) ################# PASSED! 
## Iteration (84) ################# +129855997366746:129855998130772 0:0 CopyHostToDevice:32:14696 +129855998225065:129855999449385 0:0 KernelExecution:36:14696 +129855998197249:129855999925825 0:0 CopyDeviceToHost:38:14696 +129856005895171:129856006661973 0:0 CopyHostToDevice:39:14696 +129856006745770:129856007968491 0:0 KernelExecution:43:14696 +129856006717709:129856008455141 0:0 CopyDeviceToHost:45:14696 +129856014425283:129856015187951 0:0 CopyHostToDevice:46:14696 +129856015270363:129856016493884 0:0 KernelExecution:50:14696 +129856015242633:129856016989490 0:0 CopyDeviceToHost:52:14696 +129856022971470:129856023730704 0:0 CopyHostToDevice:53:14696 +129856023813883:129856025033244 0:0 KernelExecution:57:14696 +129856023785712:129856025544334 0:0 CopyDeviceToHost:59:14696 +129856031596064:129856032498907 0:0 CopyHostToDevice:60:14696 +129856032586758:129856033809639 0:0 KernelExecution:64:14696 +129856032558443:129856034354036 0:0 CopyDeviceToHost:66:14696 +129856040416553:129856041127473 0:0 CopyHostToDevice:67:14696 +129856041212287:129856042435488 0:0 KernelExecution:71:14696 +129856041184491:129856042941958 0:0 CopyDeviceToHost:73:14696 +129856049061163:129856049826011 0:0 CopyHostToDevice:74:14696 +129856049910719:129856051134400 0:0 KernelExecution:78:14696 +129856049882831:129856051651620 0:0 CopyDeviceToHost:80:14696 +129856057864499:129856058629610 0:0 CopyHostToDevice:81:14696 +129856058712855:129856059935896 0:0 KernelExecution:85:14696 +129856058684894:129856060452569 0:0 CopyDeviceToHost:87:14696 +129856066769721:129856067537899 0:0 CopyHostToDevice:88:14696 +129856067621801:129856068845321 0:0 KernelExecution:92:14696 +129856067594217:129856069423348 0:0 CopyDeviceToHost:94:14696 +129856075784739:129856076568384 0:0 CopyHostToDevice:95:14696 +129856076658166:129856077880567 0:0 KernelExecution:99:14696 +129856076630540:129856078394130 0:0 CopyDeviceToHost:101:14696 +129856084835135:129856085603333 0:0 CopyHostToDevice:102:14696 
+129856085689351:129856086911912 0:0 KernelExecution:106:14696 +129856085661614:129856087438495 0:0 CopyDeviceToHost:108:14696 +129856093911070:129856094682948 0:0 CopyHostToDevice:109:14696 +129856094767987:129856095991348 0:0 KernelExecution:113:14696 +129856094739044:129856096520182 0:0 CopyDeviceToHost:115:14696 +129855997303698:129855998134058 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :32 +129855998136242:129855998136243 14696:14696 MARK(name(before HIP LaunchKernel)) +129855998138933:129855998139817 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :34 +129855998141918:129855998142773 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :35 +129855998144935:129855998149221 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :36 +129855998151431:129855998151432 14696:14696 MARK(name(after HIP LaunchKernel)) +129855998153828:129855999937506 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :38 +129856005829520:129856006665192 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :39 +129856006667396:129856006667397 14696:14696 MARK(name(before HIP LaunchKernel)) +129856006670307:129856006671160 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :41 +129856006673376:129856006674209 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :42 +129856006676323:129856006679651 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :43 +129856006681635:129856006681636 14696:14696 MARK(name(after HIP LaunchKernel)) +129856006683967:129856008469471 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :45 +129856014360174:129856015191285 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :46 +129856015193489:129856015193490 14696:14696 MARK(name(before HIP LaunchKernel)) +129856015196342:129856015197217 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :48 +129856015199400:129856015200221 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :49 +129856015202314:129856015205930 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :50 +129856015208058:129856015208059 14696:14696 MARK(name(after HIP LaunchKernel)) +129856015210764:129856017001555 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :52 +129856022908053:129856023733985 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :53 +129856023736320:129856023736321 14696:14696 MARK(name(before HIP LaunchKernel)) +129856023739178:129856023740063 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :55 +129856023742240:129856023743090 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :56 +129856023745309:129856023748845 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :57 +129856023750891:129856023750892 14696:14696 MARK(name(after HIP LaunchKernel)) +129856023753396:129856025556257 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :59 +129856031530409:129856032503170 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :60 +129856032505392:129856032505393 14696:14696 
MARK(name(before HIP LaunchKernel)) +129856032508345:129856032509226 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :62 +129856032511486:129856032512316 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :63 +129856032514599:129856032518036 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :64 +129856032520150:129856032520151 14696:14696 MARK(name(after HIP LaunchKernel)) +129856032522410:129856034373111 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :66 +129856040397979:129856041130687 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :67 +129856041132973:129856041132974 14696:14696 MARK(name(before HIP LaunchKernel)) +129856041136399:129856041137389 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :69 +129856041139653:129856041140500 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :70 +129856041142893:129856041146663 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :71 +129856041148645:129856041148646 14696:14696 MARK(name(after HIP LaunchKernel)) +129856041151128:129856042953843 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :73 +129856048994841:129856049829566 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :74 +129856049831724:129856049831725 14696:14696 MARK(name(before HIP LaunchKernel)) +129856049834527:129856049835413 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :76 +129856049837759:129856049838585 14696:14696 __hipPopCallConfiguration(gridDim={}, 
blockDim={}, sharedMem=0, stream=0) :77 +129856049840796:129856049844487 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :78 +129856049846529:129856049846530 14696:14696 MARK(name(after HIP LaunchKernel)) +129856049848934:129856051663797 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :80 +129856057798518:129856058633464 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :81 +129856058635650:129856058635651 14696:14696 MARK(name(before HIP LaunchKernel)) +129856058638530:129856058639560 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :83 +129856058641994:129856058642826 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :84 +129856058645125:129856058648721 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :85 +129856058650749:129856058650750 14696:14696 MARK(name(after HIP LaunchKernel)) +129856058653478:129856060466863 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :87 +129856066704603:129856067541502 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :88 +129856067543802:129856067543803 14696:14696 MARK(name(before HIP LaunchKernel)) +129856067546791:129856067547681 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :90 +129856067550027:129856067550854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :91 +129856067553125:129856067556952 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :92 
+129856067559149:129856067559150 14696:14696 MARK(name(after HIP LaunchKernel)) +129856067561903:129856069442958 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :94 +129856075719215:129856076572398 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :95 +129856076574828:129856076574829 14696:14696 MARK(name(before HIP LaunchKernel)) +129856076578071:129856076578997 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :97 +129856076581286:129856076582119 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :98 +129856076584498:129856076588395 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :99 +129856076590554:129856076590555 14696:14696 MARK(name(after HIP LaunchKernel)) +129856076592857:129856078406672 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :101 +129856084768530:129856085607081 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :102 +129856085609437:129856085609438 14696:14696 MARK(name(before HIP LaunchKernel)) +129856085612528:129856085613498 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :104 +129856085615751:129856085616602 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :105 +129856085618831:129856085623039 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :106 +129856085625178:129856085625179 14696:14696 MARK(name(after HIP LaunchKernel)) +129856085627731:129856087451206 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :108 +129856093846767:129856094686797 
14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :109 +129856094689153:129856094689154 14696:14696 MARK(name(before HIP LaunchKernel)) +129856094692497:129856094693485 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :111 +129856094695727:129856094696598 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :112 +129856094698884:129856094702856 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :113 +129856094705178:129856094705179 14696:14696 MARK(name(after HIP LaunchKernel)) +129856094707931:129856096534639 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :115 PASSED! ## Iteration (83) ################# PASSED! ## Iteration (82) ################# PASSED! ## Iteration (81) ################# -3802701703288299:3802701705170783 0:0 CopyHostToDevice:74:1991 -3802701705274243:3802701706486156 0:0 KernelExecution:78:1991 -3802701705250604:3802701707936074 0:0 CopyDeviceToHost:80:1991 -3802701715184407:3802701716946440 0:0 CopyHostToDevice:81:1991 -3802701717062173:3802701718258234 0:0 KernelExecution:85:1991 -3802701717027281:3802701719895352 0:0 CopyDeviceToHost:87:1991 -3802701727144976:3802701729139460 0:0 CopyHostToDevice:88:1991 -3802701729244175:3802701730445125 0:0 KernelExecution:92:1991 -3802701729220511:3802701732165583 0:0 CopyDeviceToHost:94:1991 -3802701739387037:3802701741142680 0:0 CopyHostToDevice:95:1991 -3802701741249310:3802701742453815 0:0 KernelExecution:99:1991 -3802701741225710:3802701744149042 0:0 CopyDeviceToHost:101:1991 -3802701751388465:3802701753137668 0:0 CopyHostToDevice:102:1991 -3802701753243075:3802701754440321 0:0 KernelExecution:106:1991 -3802701753219589:3802701756153951 0:0 CopyDeviceToHost:108:1991 -3802701763443335:3802701765498080 0:0 
CopyHostToDevice:109:1991 -3802701765603802:3802701766820456 0:0 KernelExecution:113:1991 -3802701765580171:3802701768590463 0:0 CopyDeviceToHost:115:1991 -3802701775866137:3802701777758951 0:0 CopyHostToDevice:116:1991 -3802701777862528:3802701779073255 0:0 KernelExecution:120:1991 -3802701777839322:3802701780544442 0:0 CopyDeviceToHost:122:1991 -3802701787979987:3802701790138553 0:0 CopyHostToDevice:123:1991 -3802701790243940:3802701791446371 0:0 KernelExecution:127:1991 -3802701790220103:3802701792896973 0:0 CopyDeviceToHost:129:1991 PASSED! ## Iteration (80) ################# PASSED! @@ -114,35 +219,118 @@ PASSED! ## Iteration (74) ################# PASSED! ## Iteration (73) ################# -3802701800291738:3802701802179392 0:0 CopyHostToDevice:130:1991 -3802701802285163:3802701803481223 0:0 KernelExecution:134:1991 -3802701802261733:3802701804931343 0:0 CopyDeviceToHost:136:1991 -3802701812337128:3802701814252581 0:0 CopyHostToDevice:137:1991 -3802701814356366:3802701815565464 0:0 KernelExecution:141:1991 -3802701814332902:3802701817015292 0:0 CopyDeviceToHost:143:1991 -3802701824392847:3802701826310401 0:0 CopyHostToDevice:144:1991 -3802701826415256:3802701827613539 0:0 KernelExecution:148:1991 -3802701826391761:3802701829071431 0:0 CopyDeviceToHost:150:1991 -3802701836291435:3802701838179779 0:0 CopyHostToDevice:151:1991 -3802701838283081:3802701839480623 0:0 KernelExecution:155:1991 -3802701838259290:3802701840931690 0:0 CopyDeviceToHost:157:1991 -3802701848294054:3802701850186618 0:0 CopyHostToDevice:158:1991 -3802701850293201:3802701851487632 0:0 KernelExecution:162:1991 -3802701850269869:3802701852937908 0:0 CopyDeviceToHost:164:1991 -3802701860182332:3802701862143417 0:0 CopyHostToDevice:165:1991 -3802701862248805:3802701863444865 0:0 KernelExecution:169:1991 -3802701862224967:3802701865141909 0:0 CopyDeviceToHost:171:1991 -3802701872353003:3802701874265587 0:0 CopyHostToDevice:172:1991 -3802701874371291:3802701875572092 0:0 KernelExecution:176:1991 
-3802701874348307:3802701877019147 0:0 CopyDeviceToHost:178:1991 -3802701884267750:3802701886153054 0:0 CopyHostToDevice:179:1991 -3802701886259179:3802701887463536 0:0 KernelExecution:183:1991 -3802701886235615:3802701888914085 0:0 CopyDeviceToHost:185:1991 -3802701896155929:3802701898142244 0:0 CopyHostToDevice:186:1991 -3802701898246687:3802701899454155 0:0 KernelExecution:190:1991 -3802701898223504:3802701901145246 0:0 CopyDeviceToHost:192:1991 +129856103067958:129856103841032 0:0 CopyHostToDevice:116:14696 +129856103927769:129856105150970 0:0 KernelExecution:120:14696 +129856103899316:129856105721054 0:0 CopyDeviceToHost:122:14696 +129856112245852:129856113015798 0:0 CopyHostToDevice:123:14696 +129856113100485:129856114323526 0:0 KernelExecution:127:14696 +129856113072690:129856114900649 0:0 CopyDeviceToHost:129:14696 +129856121600998:129856122374148 0:0 CopyHostToDevice:130:14696 +129856122460856:129856123685017 0:0 KernelExecution:134:14696 +129856122432406:129856124221503 0:0 CopyDeviceToHost:136:14696 +129856130996154:129856131718339 0:0 CopyHostToDevice:137:14696 +129856131803770:129856133026171 0:0 KernelExecution:141:14696 +129856131775718:129856133613724 0:0 CopyDeviceToHost:143:14696 +129856140505813:129856141285491 0:0 CopyHostToDevice:144:14696 +129856141371337:129856142594218 0:0 KernelExecution:148:14696 +129856141343575:129856143188801 0:0 CopyDeviceToHost:150:14696 +129856150234971:129856151016053 0:0 CopyHostToDevice:151:14696 +129856151102892:129856152327053 0:0 KernelExecution:155:14696 +129856151074919:129856152872907 0:0 CopyDeviceToHost:157:14696 +129856159481376:129856160253347 0:0 CopyHostToDevice:158:14696 +129856160343525:129856161566086 0:0 KernelExecution:162:14696 +129856160315355:129856162137295 0:0 CopyDeviceToHost:164:14696 +129856168059715:129856168791250 0:0 CopyHostToDevice:165:14696 +129856168876828:129856170099709 0:0 KernelExecution:169:14696 +129856168849139:129856170629902 0:0 CopyDeviceToHost:171:14696 
+129856176005269:129856176724156 0:0 CopyHostToDevice:172:14696 +129856176811979:129856178033100 0:0 KernelExecution:176:14696 +129856176783784:129856178564862 0:0 CopyDeviceToHost:178:14696 +129856183804454:129856184516916 0:0 CopyHostToDevice:179:14696 +129856184609470:129856185832511 0:0 KernelExecution:183:14696 +129856184581802:129856186368858 0:0 CopyDeviceToHost:185:14696 +129856191541921:129856192254454 0:0 CopyHostToDevice:186:14696 +129856192345329:129856193569809 0:0 KernelExecution:190:14696 +129856192317767:129856194105080 0:0 CopyDeviceToHost:192:14696 +129856103003811:129856103844379 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :116 +129856103846787:129856103846788 14696:14696 MARK(name(before HIP LaunchKernel)) +129856103849922:129856103850838 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :118 +129856103853240:129856103854136 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :119 +129856103856444:129856103860149 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :120 +129856103862386:129856103862387 14696:14696 MARK(name(after HIP LaunchKernel)) +129856103864691:129856105741098 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :122 +129856112200226:129856113019342 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :123 +129856113021598:129856113021599 14696:14696 MARK(name(before HIP LaunchKernel)) +129856113024595:129856113025504 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :125 +129856113027902:129856113028756 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :126 +129856113031010:129856113034968 14696:14696 hipLaunchKernel(function_address=0x4010c0, 
numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :127 +129856113037098:129856113037099 14696:14696 MARK(name(after HIP LaunchKernel)) +129856113039452:129856114918382 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :129 +129856121536590:129856122377686 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :130 +129856122380177:129856122380178 14696:14696 MARK(name(before HIP LaunchKernel)) +129856122383242:129856122384157 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :132 +129856122386562:129856122387438 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :133 +129856122389743:129856122393887 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :134 +129856122395917:129856122395918 14696:14696 MARK(name(after HIP LaunchKernel)) +129856122398705:129856124236553 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :136 +129856130930250:129856131721919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :137 +129856131724534:129856131724535 14696:14696 MARK(name(before HIP LaunchKernel)) +129856131727544:129856131728453 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :139 +129856131730840:129856131731718 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :140 +129856131734248:129856131738338 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :141 +129856131740508:129856131740509 14696:14696 MARK(name(after HIP LaunchKernel)) +129856131742956:129856133633762 
14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :143 +129856140484642:129856141289559 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :144 +129856141292040:129856141292041 14696:14696 MARK(name(before HIP LaunchKernel)) +129856141295360:129856141296366 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :146 +129856141298705:129856141299584 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :147 +129856141301885:129856141305904 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :148 +129856141308287:129856141308288 14696:14696 MARK(name(after HIP LaunchKernel)) +129856141310745:129856143207185 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :150 +129856150167842:129856151019519 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :151 +129856151021903:129856151021904 14696:14696 MARK(name(before HIP LaunchKernel)) +129856151025430:129856151026339 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :153 +129856151028846:129856151029731 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :154 +129856151032070:129856151036399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :155 +129856151038525:129856151038526 14696:14696 MARK(name(after HIP LaunchKernel)) +129856151041204:129856152887054 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :157 +129856159416500:129856160257922 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :158 
+129856160260251:129856160260252 14696:14696 MARK(name(before HIP LaunchKernel)) +129856160263327:129856160264253 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :160 +129856160266588:129856160267551 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :161 +129856160269815:129856160273583 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :162 +129856160275639:129856160275640 14696:14696 MARK(name(after HIP LaunchKernel)) +129856160277873:129856162154856 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :164 +129856167989129:129856168794954 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :165 +129856168796817:129856168796818 14696:14696 MARK(name(before HIP LaunchKernel)) +129856168799680:129856168800356 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :167 +129856168802336:129856168803043 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :168 +129856168804923:129856168808196 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :169 +129856168810026:129856168810027 14696:14696 MARK(name(after HIP LaunchKernel)) +129856168811889:129856170642148 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :171 +129856175935119:129856176727698 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :172 +129856176729573:129856176729574 14696:14696 MARK(name(before HIP LaunchKernel)) +129856176732312:129856176733001 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :174 +129856176734764:129856176735517 
14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :175 +129856176737306:129856176740961 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :176 +129856176742551:129856176742552 14696:14696 MARK(name(after HIP LaunchKernel)) +129856176744384:129856178576608 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :178 +129856183733862:129856184521359 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :179 PASSED! ## Iteration (72) ################# +129856184523202:129856184523203 14696:14696 MARK(name(before HIP LaunchKernel)) +129856184526239:129856184526918 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :181 +129856184528695:129856184529339 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :182 +129856184531203:129856184534819 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :183 +129856184536444:129856184536445 14696:14696 MARK(name(after HIP LaunchKernel)) +129856184538159:129856186381152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :185 +129856191471466:129856192258965 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :186 +129856192260887:129856192260888 14696:14696 MARK(name(before HIP LaunchKernel)) +129856192264565:129856192265231 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :188 +129856192266936:129856192267582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :189 +129856192269493:129856192272647 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, 
dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :190 +129856192274238:129856192274239 14696:14696 MARK(name(after HIP LaunchKernel)) +129856192276014:129856194117333 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :192 PASSED! ## Iteration (71) ################# PASSED! @@ -157,30 +345,6 @@ PASSED! ## Iteration (66) ################# PASSED! ## Iteration (65) ################# -3802701908363640:3802701910282004 0:0 CopyHostToDevice:193:1991 -3802701910388686:3802701911593636 0:0 KernelExecution:197:1991 -3802701910364944:3802701913041924 0:0 CopyDeviceToHost:199:1991 -3802701920274197:3802701922171761 0:0 CopyHostToDevice:200:1991 -3802701922278125:3802701923475222 0:0 KernelExecution:204:1991 -3802701922254592:3802701924925132 0:0 CopyDeviceToHost:206:1991 -3802701932168496:3802701934142771 0:0 CopyHostToDevice:207:1991 -3802701934246976:3802701935438295 0:0 KernelExecution:211:1991 -3802701934223551:3802701937141613 0:0 CopyDeviceToHost:213:1991 -3802701944352056:3802701946257570 0:0 CopyHostToDevice:214:1991 -3802701946362997:3802701947574317 0:0 KernelExecution:218:1991 -3802701946339571:3802701949023790 0:0 CopyDeviceToHost:220:1991 -3802701956400665:3802701958316110 0:0 CopyHostToDevice:221:1991 -3802701958422590:3802701959641615 0:0 KernelExecution:225:1991 -3802701958399130:3802701961106280 0:0 CopyDeviceToHost:227:1991 -3802701968320724:3802701970208178 0:0 CopyHostToDevice:228:1991 -3802701970318670:3802701971521693 0:0 KernelExecution:232:1991 -3802701970295529:3802701972971609 0:0 CopyDeviceToHost:234:1991 -3802701980199792:3802701982142436 0:0 CopyHostToDevice:235:1991 -3802701982245928:3802701983440062 0:0 KernelExecution:239:1991 -3802701982222487:3802701985143188 0:0 CopyDeviceToHost:241:1991 -3802701992355642:3802701994267646 0:0 CopyHostToDevice:242:1991 -3802701994371730:3802701995578753 0:0 KernelExecution:246:1991 
-3802701994348667:3802701997026937 0:0 CopyDeviceToHost:248:1991 PASSED! ## Iteration (64) ################# PASSED! @@ -193,39 +357,152 @@ PASSED! ## Iteration (60) ################# PASSED! ## Iteration (59) ################# +129856199280943:129856199989681 0:0 CopyHostToDevice:193:14696 +129856200075190:129856201299831 0:0 KernelExecution:197:14696 +129856200047538:129856201850341 0:0 CopyDeviceToHost:199:14696 +129856206897412:129856207614253 0:0 CopyHostToDevice:200:14696 +129856207705498:129856208928859 0:0 KernelExecution:204:14696 +129856207676917:129856209473592 0:0 CopyDeviceToHost:206:14696 +129856214432984:129856215196409 0:0 CopyHostToDevice:207:14696 +129856215281304:129856216504825 0:0 KernelExecution:211:14696 +129856215253529:129856217050195 0:0 CopyDeviceToHost:213:14696 +129856221931666:129856222699124 0:0 CopyHostToDevice:214:14696 +129856222785050:129856224007611 0:0 KernelExecution:218:14696 +129856222756874:129856224558196 0:0 CopyDeviceToHost:220:14696 +129856229435728:129856230202586 0:0 CopyHostToDevice:221:14696 +129856230289822:129856231510942 0:0 KernelExecution:225:14696 +129856230262176:129856232049379 0:0 CopyDeviceToHost:227:14696 +129856236838217:129856237549415 0:0 CopyHostToDevice:228:14696 +129856237635376:129856238857136 0:0 KernelExecution:232:14696 +129856237607782:129856239407224 0:0 CopyDeviceToHost:234:14696 +129856244299394:129856245007567 0:0 CopyHostToDevice:235:14696 +129856245099279:129856246322159 0:0 KernelExecution:239:14696 +129856245071193:129856246864706 0:0 CopyDeviceToHost:241:14696 +129856251723187:129856252431603 0:0 CopyHostToDevice:242:14696 +129856252521404:129856253744124 0:0 KernelExecution:246:14696 +129856252493576:129856254289474 0:0 CopyDeviceToHost:248:14696 +129856259171693:129856259879626 0:0 CopyHostToDevice:249:14696 +129856259964936:129856261188937 0:0 KernelExecution:253:14696 +129856259937195:129856261731637 0:0 CopyDeviceToHost:255:14696 +129856266605795:129856267371070 0:0 
CopyHostToDevice:256:14696 +129856267455912:129856268680233 0:0 KernelExecution:260:14696 +129856267428297:129856269227260 0:0 CopyDeviceToHost:262:14696 +129856274075448:129856274840296 0:0 CopyHostToDevice:263:14696 +129856274927804:129856276150525 0:0 KernelExecution:267:14696 +129856274899679:129856276695018 0:0 CopyDeviceToHost:269:14696 +129856281565009:129856282326831 0:0 CopyHostToDevice:270:14696 +129856282411157:129856283637077 0:0 KernelExecution:274:14696 +129856282383503:129856284175523 0:0 CopyDeviceToHost:276:14696 +129856288995752:129856289705630 0:0 CopyHostToDevice:277:14696 +129856289793308:129856291014269 0:0 KernelExecution:281:14696 +129856289765547:129856291559219 0:0 CopyDeviceToHost:283:14696 +129856296360197:129856297069117 0:0 CopyHostToDevice:284:14696 +129856297157310:129856298378111 0:0 KernelExecution:288:14696 +129856297129589:129856298914568 0:0 CopyDeviceToHost:290:14696 +129856199220209:129856199993256 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :193 +129856199995165:129856199995166 14696:14696 MARK(name(before HIP LaunchKernel)) +129856199998331:129856199999016 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :195 +129856200000971:129856200001630 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :196 +129856200003348:129856200006409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :197 +129856200007997:129856200007998 14696:14696 MARK(name(after HIP LaunchKernel)) +129856200009781:129856201864796 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :199 +129856206828954:129856207617612 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :200 +129856207619342:129856207619343 14696:14696 MARK(name(before HIP 
LaunchKernel)) +129856207633427:129856207634203 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :202 +129856207635929:129856207636565 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :203 +129856207638289:129856207641619 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :204 +129856207643379:129856207643380 14696:14696 MARK(name(after HIP LaunchKernel)) +129856207645338:129856209486625 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :206 +129856214367871:129856215199634 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :207 +129856215201421:129856215201422 14696:14696 MARK(name(before HIP LaunchKernel)) +129856215205034:129856215205701 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :209 +129856215207421:129856215208068 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :210 +129856215209926:129856215213001 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :211 +129856215214576:129856215214577 14696:14696 MARK(name(after HIP LaunchKernel)) +129856215216591:129856217062762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :213 +129856221865656:129856222702390 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :214 +129856222704143:129856222704144 14696:14696 MARK(name(before HIP LaunchKernel)) +129856222707593:129856222708263 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :216 +129856222709907:129856222710533 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :217 +129856222712408:129856222715305 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :218 +129856222716820:129856222716821 14696:14696 MARK(name(after HIP LaunchKernel)) +129856222718703:129856224572291 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :220 +129856229369321:129856230206171 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :221 +129856230207933:129856230207934 14696:14696 MARK(name(before HIP LaunchKernel)) +129856230211408:129856230212070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :223 +129856230213729:129856230214356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :224 +129856230216306:129856230219552 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :225 +129856230221084:129856230221085 14696:14696 MARK(name(after HIP LaunchKernel)) +129856230222856:129856232061167 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :227 +129856236820359:129856237552651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :228 +129856237554349:129856237554350 14696:14696 MARK(name(before HIP LaunchKernel)) +129856237557958:129856237558615 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :230 +129856237560382:129856237561016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :231 +129856237562876:129856237566063 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :232 
+129856237567608:129856237567609 14696:14696 MARK(name(after HIP LaunchKernel)) +129856237569296:129856239419101 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :234 +129856244174381:129856245010977 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :235 +129856245012718:129856245012719 14696:14696 MARK(name(before HIP LaunchKernel)) +129856245025693:129856245026451 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :237 +129856245028210:129856245028855 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :238 +129856245030730:129856245034177 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :239 +129856245035805:129856245035806 14696:14696 MARK(name(after HIP LaunchKernel)) +129856245038122:129856246876538 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :241 +129856251653109:129856252435896 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :242 +129856252437833:129856252437834 14696:14696 MARK(name(before HIP LaunchKernel)) +129856252441362:129856252442017 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :244 +129856252443660:129856252444296 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :245 +129856252446165:129856252449155 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :246 +129856252450809:129856252450810 14696:14696 MARK(name(after HIP LaunchKernel)) +129856252452579:129856254303055 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :248 +129856259101952:129856259882749 
14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :249 +129856259884515:129856259884516 14696:14696 MARK(name(before HIP LaunchKernel)) +129856259886742:129856259887392 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :251 +129856259889040:129856259889671 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :252 +129856259891415:129856259894919 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :253 +129856259896631:129856259896632 14696:14696 MARK(name(after HIP LaunchKernel)) +129856259898324:129856261743974 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :255 +129856266541050:129856267374498 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :256 +129856267376266:129856267376267 14696:14696 MARK(name(before HIP LaunchKernel)) +129856267379647:129856267380320 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :258 +129856267381929:129856267382540 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :259 +129856267384409:129856267387474 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :260 +129856267389033:129856267389034 14696:14696 MARK(name(after HIP LaunchKernel)) +129856267390764:129856269239563 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :262 +129856274008890:129856274843415 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :263 +129856274845095:129856274845096 14696:14696 MARK(name(before HIP LaunchKernel)) +129856274847806:129856274848470 14696:14696 
__hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :265 +129856274850117:129856274850733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :266 +129856274852427:129856274855749 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :267 +129856274857358:129856274857359 14696:14696 MARK(name(after HIP LaunchKernel)) +129856274859228:129856276707873 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :269 +129856281498759:129856282330118 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :270 +129856282332044:129856282332045 14696:14696 MARK(name(before HIP LaunchKernel)) +129856282335358:129856282336015 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :272 +129856282338029:129856282338668 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :273 +129856282340644:129856282343485 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :274 +129856282345028:129856282345029 14696:14696 MARK(name(after HIP LaunchKernel)) +129856282347024:129856284203838 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :276 +129856288978096:129856289708673 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :277 +129856289710414:129856289710415 14696:14696 MARK(name(before HIP LaunchKernel)) +129856289714250:129856289714924 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :279 +129856289716689:129856289717305 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :280 +129856289719150:129856289722057 
14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :281 +129856289723677:129856289723678 14696:14696 MARK(name(after HIP LaunchKernel)) +129856289725380:129856291571314 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :283 +129856296341271:129856297072486 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :284 +129856297074313:129856297074314 14696:14696 MARK(name(before HIP LaunchKernel)) +129856297077733:129856297078380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :286 +129856297080109:129856297080733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :287 +129856297082729:129856297085646 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :288 +129856297087184:129856297087185 14696:14696 MARK(name(after HIP LaunchKernel)) +129856297089004:129856298926004 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :290 PASSED! ## Iteration (58) ################# PASSED! ## Iteration (57) ################# PASSED! 
## Iteration (56) ################# -3802702004315971:3802702006430907 0:0 CopyHostToDevice:249:1991 -3802702006513343:3802702007718885 0:0 KernelExecution:253:1991 -3802702006490217:3802702009041896 0:0 CopyDeviceToHost:255:1991 -3802702018262184:3802702019943876 0:0 CopyHostToDevice:256:1991 -3802702020050568:3802702021249295 0:0 KernelExecution:260:1991 -3802702020026907:3802702022584386 0:0 CopyDeviceToHost:262:1991 -3802702029541468:3802702031219270 0:0 CopyHostToDevice:263:1991 -3802702031312763:3802702032510305 0:0 KernelExecution:267:1991 -3802702031289161:3802702033843490 0:0 CopyDeviceToHost:269:1991 -3802702040805082:3802702042480244 0:0 CopyHostToDevice:270:1991 -3802702042572785:3802702043776105 0:0 KernelExecution:274:1991 -3802702042549004:3802702045110673 0:0 CopyDeviceToHost:276:1991 -3802702052065204:3802702053741167 0:0 CopyHostToDevice:277:1991 -3802702053835958:3802702055052463 0:0 KernelExecution:281:1991 -3802702053813487:3802702056374447 0:0 CopyDeviceToHost:283:1991 -3802702063333568:3802702065014061 0:0 CopyHostToDevice:284:1991 -3802702065111999:3802702066319615 0:0 KernelExecution:288:1991 -3802702065088771:3802702067654340 0:0 CopyDeviceToHost:290:1991 -3802702074618962:3802702076284625 0:0 CopyHostToDevice:291:1991 -3802702076384443:3802702077569835 0:0 KernelExecution:295:1991 -3802702076360685:3802702078904404 0:0 CopyDeviceToHost:297:1991 -3802702085881125:3802702087555758 0:0 CopyHostToDevice:298:1991 -3802702087649675:3802702088847958 0:0 KernelExecution:302:1991 -3802702087626608:3802702090183277 0:0 CopyDeviceToHost:304:1991 -3802702097151929:3802702098830722 0:0 CopyHostToDevice:305:1991 -3802702098924116:3802702100140473 0:0 KernelExecution:309:1991 -3802702098901192:3802702101472621 0:0 CopyDeviceToHost:311:1991 PASSED! ## Iteration (55) ################# PASSED! @@ -244,35 +521,140 @@ PASSED! ## Iteration (48) ################# PASSED! 
## Iteration (47) ################# -3802702108615424:3802702110296796 0:0 CopyHostToDevice:312:1991 -3802702110392443:3802702111600207 0:0 KernelExecution:316:1991 -3802702110368957:3802702112934696 0:0 CopyDeviceToHost:318:1991 -3802702119898217:3802702121579670 0:0 CopyHostToDevice:319:1991 -3802702121673899:3802702122873960 0:0 KernelExecution:323:1991 -3802702121650880:3802702124193909 0:0 CopyDeviceToHost:325:1991 -3802702131156331:3802702132834494 0:0 CopyHostToDevice:326:1991 -3802702132927702:3802702134121984 0:0 KernelExecution:330:1991 -3802702132904324:3802702135456513 0:0 CopyDeviceToHost:332:1991 -3802702142434925:3802702144099207 0:0 CopyHostToDevice:333:1991 -3802702144200141:3802702145401090 0:0 KernelExecution:337:1991 -3802702144175248:3802702146735777 0:0 CopyDeviceToHost:339:1991 -3802702153706898:3802702155385711 0:0 CopyHostToDevice:340:1991 -3802702155488005:3802702156685843 0:0 KernelExecution:344:1991 -3802702155464581:3802702158018890 0:0 CopyDeviceToHost:346:1991 -3802702164987312:3802702166668385 0:0 CopyHostToDevice:347:1991 -3802702166762069:3802702167965537 0:0 KernelExecution:351:1991 -3802702166739105:3802702169298644 0:0 CopyDeviceToHost:353:1991 -3802702176260016:3802702177933188 0:0 CopyHostToDevice:354:1991 -3802702178026430:3802702179223971 0:0 KernelExecution:358:1991 -3802702178002518:3802702180540757 0:0 CopyDeviceToHost:360:1991 -3802702187490789:3802702189167931 0:0 CopyHostToDevice:361:1991 -3802702189262737:3802702190474501 0:0 KernelExecution:365:1991 -3802702189239082:3802702191808141 0:0 CopyDeviceToHost:367:1991 -3802702198761922:3802702200425845 0:0 CopyHostToDevice:368:1991 PASSED! ## Iteration (46) ################# PASSED! 
## Iteration (45) ################# +129856303845436:129856304622018 0:0 CopyHostToDevice:291:14696 +129856304714456:129856305941176 0:0 KernelExecution:295:14696 +129856304686879:129856306490313 0:0 CopyDeviceToHost:297:14696 +129856311333818:129856312045157 0:0 CopyHostToDevice:298:14696 +129856312128568:129856313351929 0:0 KernelExecution:302:14696 +129856312100713:129856313892452 0:0 CopyDeviceToHost:304:14696 +129856318773490:129856319480599 0:0 CopyHostToDevice:305:14696 +129856319573103:129856320793904 0:0 KernelExecution:309:14696 +129856319544959:129856321343459 0:0 CopyDeviceToHost:311:14696 +129856326211019:129856326977511 0:0 CopyHostToDevice:312:14696 +129856327061875:129856328282996 0:0 KernelExecution:316:14696 +129856327034134:129856328825473 0:0 CopyDeviceToHost:318:14696 +129856333673698:129856334437330 0:0 CopyHostToDevice:319:14696 +129856334523567:129856335745168 0:0 KernelExecution:323:14696 +129856334495713:129856336293262 0:0 CopyDeviceToHost:325:14696 +129856341101442:129856341984561 0:0 CopyHostToDevice:326:14696 +129856342071670:129856343294870 0:0 KernelExecution:330:14696 +129856342043988:129856343840850 0:0 CopyDeviceToHost:332:14696 +129856348646308:129856349354803 0:0 CopyHostToDevice:333:14696 +129856349441279:129856350662399 0:0 KernelExecution:337:14696 +129856349413003:129856351203503 0:0 CopyDeviceToHost:339:14696 +129856356094471:129856356820623 0:0 CopyHostToDevice:340:14696 +129856356907355:129856358130235 0:0 KernelExecution:344:14696 +129856356879789:129856358671945 0:0 CopyDeviceToHost:346:14696 +129856363528023:129856364288036 0:0 CopyHostToDevice:347:14696 +129856364405580:129856365626380 0:0 KernelExecution:351:14696 +129856364377906:129856366172703 0:0 CopyDeviceToHost:353:14696 +129856371087592:129856371798847 0:0 CopyHostToDevice:354:14696 +129856371883929:129856373108889 0:0 KernelExecution:358:14696 +129856371855593:129856373655534 0:0 CopyDeviceToHost:360:14696 +129856378493711:129856379257336 0:0 
CopyHostToDevice:361:14696 +129856379342581:129856380565301 0:0 KernelExecution:365:14696 +129856379314699:129856381113012 0:0 CopyDeviceToHost:367:14696 +129856385977586:129856386744228 0:0 CopyHostToDevice:368:14696 +129856386831442:129856388055123 0:0 KernelExecution:372:14696 +129856386803378:129856388598263 0:0 CopyDeviceToHost:374:14696 +129856393484361:129856394251866 0:0 CopyHostToDevice:375:14696 +129856394339138:129856395561058 0:0 KernelExecution:379:14696 +129856394311639:129856396103600 0:0 CopyDeviceToHost:381:14696 +129856303774990:129856304626161 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :291 +129856304627884:129856304627885 14696:14696 MARK(name(before HIP LaunchKernel)) +129856304631072:129856304631723 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :293 +129856304633373:129856304634007 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :294 +129856304635811:129856304639104 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :295 +129856304640848:129856304640849 14696:14696 MARK(name(after HIP LaunchKernel)) +129856304642651:129856306501959 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :297 +129856311264292:129856312048766 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :298 +129856312050539:129856312050540 14696:14696 MARK(name(before HIP LaunchKernel)) +129856312053498:129856312054174 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :300 +129856312055946:129856312056653 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :301 +129856312058397:129856312061589 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :302 +129856312063201:129856312063202 14696:14696 MARK(name(after HIP LaunchKernel)) +129856312065053:129856313904746 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :304 +129856318704110:129856319483869 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :305 +129856319485543:129856319485544 14696:14696 MARK(name(before HIP LaunchKernel)) +129856319499258:129856319500048 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :307 +129856319501759:129856319502401 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :308 +129856319504307:129856319507787 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :309 +129856319509535:129856319509536 14696:14696 MARK(name(after HIP LaunchKernel)) +129856319511552:129856321356021 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :311 +129856326144210:129856326980680 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :312 +129856326982483:129856326982484 14696:14696 MARK(name(before HIP LaunchKernel)) +129856326986163:129856326986815 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :314 +129856326988581:129856326989210 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :315 +129856326991095:129856326994082 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :316 +129856326995650:129856326995651 14696:14696 MARK(name(after HIP LaunchKernel)) +129856326997461:129856328838450 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :318 +129856333608209:129856334440902 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :319 +129856334442697:129856334442698 14696:14696 MARK(name(before HIP LaunchKernel)) +129856334446427:129856334447095 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :321 +129856334448793:129856334449426 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :322 +129856334451308:129856334454120 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :323 +129856334455718:129856334455719 14696:14696 MARK(name(after HIP LaunchKernel)) +129856334457508:129856336307654 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :325 +129856341084552:129856341987761 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :326 +129856341989501:129856341989502 14696:14696 MARK(name(before HIP LaunchKernel)) +129856341992961:129856341993616 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :328 +129856341995311:129856341995915 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :329 +129856341997784:129856342000844 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :330 +129856342002457:129856342002458 14696:14696 MARK(name(after HIP LaunchKernel)) +129856342004209:129856343852827 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :332 +129856348628207:129856349358297 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :333 +129856349360014:129856349360015 
14696:14696 MARK(name(before HIP LaunchKernel)) +129856349363641:129856349364301 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :335 +129856349365955:129856349366590 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :336 +129856349368410:129856349371392 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :337 +129856349373001:129856349373002 14696:14696 MARK(name(after HIP LaunchKernel)) +129856349374736:129856351215163 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :339 +129856356026231:129856356823939 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :340 +129856356825939:129856356825940 14696:14696 MARK(name(before HIP LaunchKernel)) +129856356829316:129856356829967 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :342 +129856356831607:129856356832235 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :343 +129856356834103:129856356837300 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :344 +129856356838880:129856356838881 14696:14696 MARK(name(after HIP LaunchKernel)) +129856356840997:129856358683474 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :346 +129856363457621:129856364292098 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :347 +129856364293909:129856364293910 14696:14696 MARK(name(before HIP LaunchKernel)) +129856364296242:129856364296921 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :349 +129856364298665:129856364299325 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :350 +129856364301137:129856364304805 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :351 +129856364306614:129856364306615 14696:14696 MARK(name(after HIP LaunchKernel)) +129856364308432:129856366185192 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :353 +129856371019019:129856371802348 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :354 +129856371804072:129856371804073 14696:14696 MARK(name(before HIP LaunchKernel)) +129856371807407:129856371808089 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :356 +129856371809769:129856371810408 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :357 +129856371812409:129856371815399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :358 +129856371816938:129856371816939 14696:14696 MARK(name(after HIP LaunchKernel)) +129856371818730:129856373668223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :360 +129856378427685:129856379260530 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :361 +129856379262413:129856379262414 14696:14696 MARK(name(before HIP LaunchKernel)) +129856379266028:129856379266680 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :363 +129856379268334:129856379268974 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :364 +129856379270951:129856379274011 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) 
kernel=matrixTranspose(float*, float*, int) :365 +129856379275576:129856379275577 14696:14696 MARK(name(after HIP LaunchKernel)) +129856379277516:129856381125442 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :367 +129856385912709:129856386747747 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :368 +129856386749617:129856386749618 14696:14696 MARK(name(before HIP LaunchKernel)) +129856386753015:129856386753700 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :370 +129856386755603:129856386756230 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :371 +129856386758107:129856386761145 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :372 +129856386762828:129856386762829 14696:14696 MARK(name(after HIP LaunchKernel)) +129856386764527:129856388613300 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :374 +129856393418103:129856394255127 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :375 +129856394257084:129856394257085 14696:14696 MARK(name(before HIP LaunchKernel)) +129856394260727:129856394261393 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :377 +129856394263117:129856394263752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :378 +129856394266100:129856394269007 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :379 +129856394270594:129856394270595 14696:14696 MARK(name(after HIP LaunchKernel)) +129856394272528:129856396115719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, 
kind=2) :381 PASSED! ## Iteration (44) ################# PASSED! @@ -287,33 +669,6 @@ PASSED! ## Iteration (39) ################# PASSED! ## Iteration (38) ################# -3802702200526879:3802702201738792 0:0 KernelExecution:372:1991 -3802702200502735:3802702203073505 0:0 CopyDeviceToHost:374:1991 -3802702210101096:3802702211781099 0:0 CopyHostToDevice:375:1991 -3802702211874278:3802702213070339 0:0 KernelExecution:379:1991 -3802702211851149:3802702214405528 0:0 CopyDeviceToHost:381:1991 -3802702221371170:3802702223046872 0:0 CopyHostToDevice:382:1991 -3802702223141537:3802702224348264 0:0 KernelExecution:386:1991 -3802702223118273:3802702225680452 0:0 CopyDeviceToHost:388:1991 -3802702232644404:3802702234313936 0:0 CopyHostToDevice:389:1991 -3802702234409358:3802702235607788 0:0 KernelExecution:393:1991 -3802702234385867:3802702236943196 0:0 CopyDeviceToHost:395:1991 -3802702243900787:3802702245580279 0:0 CopyHostToDevice:396:1991 -3802702245674663:3802702246878279 0:0 KernelExecution:400:1991 -3802702245651760:3802702248198969 0:0 CopyDeviceToHost:402:1991 -3802702255168930:3802702256847073 0:0 CopyHostToDevice:403:1991 -3802702256941454:3802702258155589 0:0 KernelExecution:407:1991 -3802702256918733:3802702259489683 0:0 CopyDeviceToHost:409:1991 -3802702266456174:3802702268121957 0:0 CopyHostToDevice:410:1991 -3802702268222984:3802702269419637 0:0 KernelExecution:414:1991 -3802702268198287:3802702270718936 0:0 CopyDeviceToHost:416:1991 -3802702277684438:3802702279355020 0:0 CopyHostToDevice:417:1991 -3802702279449065:3802702280656977 0:0 KernelExecution:421:1991 -3802702279425380:3802702281990519 0:0 CopyDeviceToHost:423:1991 -3802702288963001:3802702290626813 0:0 CopyHostToDevice:424:1991 -3802702290725647:3802702291916077 0:0 KernelExecution:428:1991 -3802702290702274:3802702293249973 0:0 CopyDeviceToHost:430:1991 -3802702300213905:3802702301888607 0:0 CopyHostToDevice:431:1991 PASSED! ## Iteration (37) ################# PASSED! 
@@ -326,38 +681,152 @@ PASSED! ## Iteration (33) ################# PASSED! ## Iteration (32) ################# +129856400949298:129856401690102 0:0 CopyHostToDevice:382:14696 +129856401774737:129856402998097 0:0 KernelExecution:386:14696 +129856401746598:129856403538591 0:0 CopyDeviceToHost:388:14696 +129856408364229:129856409075649 0:0 CopyHostToDevice:389:14696 +129856409181579:129856410405739 0:0 KernelExecution:393:14696 +129856409154049:129856410946890 0:0 CopyDeviceToHost:395:14696 +129856415833858:129856416545026 0:0 CopyHostToDevice:396:14696 +129856416634688:129856417856288 0:0 KernelExecution:400:14696 +129856416607076:129856418397645 0:0 CopyDeviceToHost:402:14696 +129856423255064:129856423962733 0:0 CopyHostToDevice:403:14696 +129856424049344:129856425272224 0:0 KernelExecution:407:14696 +129856424021555:129856425837337 0:0 CopyDeviceToHost:409:14696 +129856430719717:129856431443207 0:0 CopyHostToDevice:410:14696 +129856431530370:129856432753411 0:0 KernelExecution:414:14696 +129856431502760:129856433290891 0:0 CopyDeviceToHost:416:14696 +129856438127461:129856438893077 0:0 CopyHostToDevice:417:14696 +129856438981153:129856440204834 0:0 KernelExecution:421:14696 +129856438953062:129856440755527 0:0 CopyDeviceToHost:423:14696 +129856445658301:129856446425541 0:0 CopyHostToDevice:424:14696 +129856446512512:129856447734433 0:0 KernelExecution:428:14696 +129856446484748:129856448303143 0:0 CopyDeviceToHost:430:14696 +129856453131279:129856453895371 0:0 CopyHostToDevice:431:14696 +129856453982502:129856455205222 0:0 KernelExecution:435:14696 +129856453954390:129856455747092 0:0 CopyDeviceToHost:437:14696 +129856460549446:129856461267384 0:0 CopyHostToDevice:438:14696 +129856461354488:129856462578648 0:0 KernelExecution:442:14696 +129856461327009:129856463119514 0:0 CopyDeviceToHost:444:14696 +129856467954463:129856468665082 0:0 CopyHostToDevice:445:14696 +129856468756966:129856469978566 0:0 KernelExecution:449:14696 +129856468728958:129856470519550 0:0 
CopyDeviceToHost:451:14696 +129856475396016:129856476106990 0:0 CopyHostToDevice:452:14696 +129856476191506:129856477415026 0:0 KernelExecution:456:14696 +129856476164143:129856477979522 0:0 CopyDeviceToHost:458:14696 +129856482841902:129856483550322 0:0 CopyHostToDevice:459:14696 +129856483636804:129856484858245 0:0 KernelExecution:463:14696 +129856483608842:129856485404598 0:0 CopyDeviceToHost:465:14696 +129856490264533:129856491036044 0:0 CopyHostToDevice:466:14696 +129856491121979:129856492348219 0:0 KernelExecution:470:14696 +129856491094217:129856492893929 0:0 CopyDeviceToHost:472:14696 +129856497730065:129856498496809 0:0 CopyHostToDevice:473:14696 +129856498583201:129856499806882 0:0 KernelExecution:477:14696 +129856498555486:129856500349740 0:0 CopyDeviceToHost:479:14696 +129856400931528:129856401693841 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :382 +129856401695697:129856401695698 14696:14696 MARK(name(before HIP LaunchKernel)) +129856401698086:129856401698763 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :384 +129856401700644:129856401701356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :385 +129856401703387:129856401706670 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :386 +129856401708283:129856401708284 14696:14696 MARK(name(after HIP LaunchKernel)) +129856401710202:129856403550731 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :388 +129856408346178:129856409079144 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :389 +129856409080946:129856409080947 14696:14696 MARK(name(before HIP LaunchKernel)) +129856409119575:129856409120361 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) 
:391 +129856409122350:129856409122982 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :392 +129856409124716:129856409127974 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :393 +129856409129722:129856409129723 14696:14696 MARK(name(after HIP LaunchKernel)) +129856409131595:129856410958682 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :395 +129856415764088:129856416549283 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :396 +129856416551147:129856416551148 14696:14696 MARK(name(before HIP LaunchKernel)) +129856416554753:129856416555457 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :398 +129856416557440:129856416558065 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :399 +129856416560077:129856416563543 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :400 +129856416565220:129856416565221 14696:14696 MARK(name(after HIP LaunchKernel)) +129856416567086:129856418410890 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :402 +129856423185992:129856423965984 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :403 +129856423967686:129856423967687 14696:14696 MARK(name(before HIP LaunchKernel)) +129856423971156:129856423971813 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :405 +129856423973453:129856423974058 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :406 +129856423975959:129856423979023 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, 
dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :407 +129856423980620:129856423980621 14696:14696 MARK(name(after HIP LaunchKernel)) +129856423982481:129856425851437 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :409 +129856430649566:129856431446819 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :410 +129856431448647:129856431448648 14696:14696 MARK(name(before HIP LaunchKernel)) +129856431451980:129856431452627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :412 +129856431454467:129856431455103 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :413 +129856431457061:129856431460021 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :414 +129856431461633:129856431461634 14696:14696 MARK(name(after HIP LaunchKernel)) +129856431463427:129856433305223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :416 +129856438060199:129856438896337 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :417 +129856438898056:129856438898057 14696:14696 MARK(name(before HIP LaunchKernel)) +129856438901614:129856438902293 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :419 +129856438903944:129856438904582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :420 +129856438906471:129856438909460 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :421 +129856438910995:129856438910996 14696:14696 MARK(name(after HIP LaunchKernel)) +129856438913099:129856440770029 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :423 +129856445589904:129856446428787 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :424 +129856446430525:129856446430526 14696:14696 MARK(name(before HIP LaunchKernel)) +129856446434097:129856446434755 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :426 +129856446436446:129856446437074 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :427 +129856446438958:129856446442103 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :428 +129856446443705:129856446443706 14696:14696 MARK(name(after HIP LaunchKernel)) +129856446445611:129856448319675 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :430 +129856453113306:129856453898651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :431 +129856453900443:129856453900444 14696:14696 MARK(name(before HIP LaunchKernel)) +129856453903924:129856453904588 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :433 +129856453906239:129856453906854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :434 +129856453908740:129856453911874 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :435 +129856453913486:129856453913487 14696:14696 MARK(name(after HIP LaunchKernel)) +129856453915356:129856455761272 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :437 +129856460531599:129856461270590 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :438 +129856461272368:129856461272369 
14696:14696 MARK(name(before HIP LaunchKernel)) +129856461275845:129856461276515 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :440 +129856461278198:129856461278850 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :441 +129856461280791:129856461283899 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :442 +129856461285595:129856461285596 14696:14696 MARK(name(after HIP LaunchKernel)) +129856461287388:129856463133280 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :444 +129856467884995:129856468668564 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :445 +129856468670291:129856468670292 14696:14696 MARK(name(before HIP LaunchKernel)) +129856468673055:129856468673710 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :447 +129856468675408:129856468676048 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :448 +129856468677942:129856468681455 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :449 +129856468683148:129856468683149 14696:14696 MARK(name(after HIP LaunchKernel)) +129856468685101:129856470532724 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :451 +129856475326269:129856476110399 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :452 +129856476112220:129856476112221 14696:14696 MARK(name(before HIP LaunchKernel)) +129856476115691:129856476116355 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :454 +129856476118083:129856476118692 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :455 +129856476120553:129856476123478 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :456 +129856476125144:129856476125145 14696:14696 MARK(name(after HIP LaunchKernel)) +129856476126929:129856477993159 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :458 +129856482771986:129856483553655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :459 +129856483555435:129856483555436 14696:14696 MARK(name(before HIP LaunchKernel)) +129856483559048:129856483559715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :461 +129856483561368:129856483561995 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :462 +129856483563875:129856483567045 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :463 +129856483569037:129856483569038 14696:14696 MARK(name(after HIP LaunchKernel)) +129856483570875:129856485418803 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :465 +129856490199703:129856491039451 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :466 +129856491041225:129856491041226 14696:14696 MARK(name(before HIP LaunchKernel)) +129856491044551:129856491045204 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :468 +129856491046844:129856491047481 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :469 +129856491049291:129856491052245 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) 
kernel=matrixTranspose(float*, float*, int) :470 +129856491053805:129856491053806 14696:14696 MARK(name(after HIP LaunchKernel)) +129856491055528:129856492907612 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :472 +129856497665310:129856498500405 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :473 +129856498502066:129856498502067 14696:14696 MARK(name(before HIP LaunchKernel)) +129856498505506:129856498506141 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :475 +129856498507858:129856498508491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :476 +129856498510523:129856498513554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :477 +129856498515137:129856498515138 14696:14696 MARK(name(after HIP LaunchKernel)) +129856498517011:129856500365762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :479 PASSED! ## Iteration (31) ################# PASSED! ## Iteration (30) ################# PASSED! 
## Iteration (29) ################# -3802702301982442:3802702303186799 0:0 KernelExecution:435:1991 -3802702301959178:3802702304503997 0:0 CopyDeviceToHost:437:1991 -3802702311466108:3802702313146390 0:0 CopyHostToDevice:438:1991 -3802702313238825:3802702314439626 0:0 KernelExecution:442:1991 -3802702313215791:3802702315773720 0:0 CopyDeviceToHost:444:1991 -3802702322736361:3802702324399864 0:0 CopyHostToDevice:445:1991 -3802702324503098:3802702325721085 0:0 KernelExecution:449:1991 -3802702324478794:3802702327055594 0:0 CopyDeviceToHost:451:1991 -3802702334030715:3802702335709388 0:0 CopyHostToDevice:452:1991 -3802702335806620:3802702337014532 0:0 KernelExecution:456:1991 -3802702335783968:3802702338348468 0:0 CopyDeviceToHost:458:1991 -3802702345327399:3802702347004601 0:0 CopyHostToDevice:459:1991 -3802702347117082:3802702348318476 0:0 KernelExecution:463:1991 -3802702347074012:3802702349651691 0:0 CopyDeviceToHost:465:1991 -3802702356616483:3802702358289405 0:0 CopyHostToDevice:466:1991 -3802702358382881:3802702359585164 0:0 KernelExecution:470:1991 -3802702358359406:3802702360920335 0:0 CopyDeviceToHost:472:1991 -3802702367904117:3802702369585909 0:0 CopyHostToDevice:473:1991 -3802702369679903:3802702370875371 0:0 KernelExecution:477:1991 -3802702369656630:3802702372196308 0:0 CopyDeviceToHost:479:1991 -3802702379156600:3802702380837792 0:0 CopyHostToDevice:480:1991 -3802702380930326:3802702382134830 0:0 KernelExecution:484:1991 -3802702380907402:3802702383472292 0:0 CopyDeviceToHost:486:1991 -3802702390441713:3802702392106996 0:0 CopyHostToDevice:487:1991 -3802702392207713:3802702393417847 0:0 KernelExecution:491:1991 -3802702392183556:3802702394752325 0:0 CopyDeviceToHost:493:1991 PASSED! ## Iteration (28) ################# PASSED! @@ -376,37 +845,147 @@ PASSED! ## Iteration (21) ################# PASSED! 
## Iteration (20) ################# -3802702401709737:3802702403387670 0:0 CopyHostToDevice:494:1991 -3802702403489293:3802702404695872 0:0 KernelExecution:498:1991 -3802702403465280:3802702406030229 0:0 CopyDeviceToHost:500:1991 -3802702412982171:3802702414646163 0:0 CopyHostToDevice:501:1991 -3802702414739104:3802702415951461 0:0 KernelExecution:505:1991 -3802702414715973:3802702417282642 0:0 CopyDeviceToHost:507:1991 -3802702424250984:3802702425925207 0:0 CopyHostToDevice:508:1991 -3802702426022614:3802702427240009 0:0 KernelExecution:512:1991 -3802702425999277:3802702428556726 0:0 CopyDeviceToHost:514:1991 -3802702435521608:3802702437497583 0:0 CopyHostToDevice:515:1991 -3802702437591756:3802702438798483 0:0 KernelExecution:519:1991 -3802702437567843:3802702440117692 0:0 CopyDeviceToHost:521:1991 -3802702447076184:3802702448752496 0:0 CopyHostToDevice:522:1991 -3802702448844326:3802702450040979 0:0 KernelExecution:526:1991 -3802702448821457:3802702451374905 0:0 CopyDeviceToHost:528:1991 -3802702458338087:3802702460115460 0:0 CopyHostToDevice:529:1991 -3802702460219046:3802702461421625 0:0 KernelExecution:533:1991 -3802702460192041:3802702462758090 0:0 CopyDeviceToHost:535:1991 -3802702469730872:3802702471408304 0:0 CopyHostToDevice:536:1991 -3802702471502923:3802702472699724 0:0 KernelExecution:540:1991 -3802702471478905:3802702474035724 0:0 CopyDeviceToHost:542:1991 -3802702481000815:3802702482659947 0:0 CopyHostToDevice:543:1991 -3802702482757759:3802702483952190 0:0 KernelExecution:547:1991 -3802702482734898:3802702485283566 0:0 CopyDeviceToHost:549:1991 -3802702492244298:3802702493917401 0:0 CopyHostToDevice:550:1991 -3802702494011385:3802702495222705 0:0 KernelExecution:554:1991 -3802702493988441:3802702496538570 0:0 CopyDeviceToHost:556:1991 PASSED! ## Iteration (19) ################# PASSED! 
## Iteration (18) ################# +129856505198157:129856505972108 0:0 CopyHostToDevice:480:14696 +129856506058878:129856507279678 0:0 KernelExecution:484:14696 +129856506031181:129856507818608 0:0 CopyDeviceToHost:486:14696 +129856512668452:129856513378344 0:0 CopyHostToDevice:487:14696 +129856513463906:129856514683906 0:0 KernelExecution:491:14696 +129856513435880:129856515225665 0:0 CopyDeviceToHost:493:14696 +129856520057898:129856520789533 0:0 CopyHostToDevice:494:14696 +129856520877018:129856522100858 0:0 KernelExecution:498:14696 +129856520849406:129856522643928 0:0 CopyDeviceToHost:500:14696 +129856527495540:129856528214422 0:0 CopyHostToDevice:501:14696 +129856528300948:129856529522228 0:0 KernelExecution:505:14696 +129856528273469:129856530060374 0:0 CopyDeviceToHost:507:14696 +129856534970413:129856535678341 0:0 CopyHostToDevice:508:14696 +129856535767312:129856536986193 0:0 KernelExecution:512:14696 +129856535739484:129856537527830 0:0 CopyDeviceToHost:514:14696 +129856542452848:129856543222239 0:0 CopyHostToDevice:515:14696 +129856543308707:129856544531907 0:0 KernelExecution:519:14696 +129856543281047:129856545069937 0:0 CopyDeviceToHost:521:14696 +129856549924160:129856550693828 0:0 CopyHostToDevice:522:14696 +129856550779510:129856552004150 0:0 KernelExecution:526:14696 +129856550751409:129856552552270 0:0 CopyDeviceToHost:528:14696 +129856557413139:129856558179223 0:0 CopyHostToDevice:529:14696 +129856558266309:129856559487269 0:0 KernelExecution:533:14696 +129856558237736:129856560027323 0:0 CopyDeviceToHost:535:14696 +129856564827841:129856565542599 0:0 CopyHostToDevice:536:14696 +129856565630041:129856566854841 0:0 KernelExecution:540:14696 +129856565602389:129856567397324 0:0 CopyDeviceToHost:542:14696 +129856572247710:129856572954375 0:0 CopyHostToDevice:543:14696 +129856573041963:129856574264203 0:0 KernelExecution:547:14696 +129856573013452:129856574809983 0:0 CopyDeviceToHost:549:14696 +129856579656436:129856580368439 0:0 
CopyHostToDevice:550:14696 +129856580456039:129856581680039 0:0 KernelExecution:554:14696 +129856580428344:129856582226693 0:0 CopyDeviceToHost:556:14696 +129856587092681:129856587802199 0:0 CopyHostToDevice:557:14696 +129856587888587:129856589111627 0:0 KernelExecution:561:14696 +129856587861029:129856589654526 0:0 CopyDeviceToHost:563:14696 +129856594498640:129856595270698 0:0 CopyHostToDevice:564:14696 +129856595356053:129856596579733 0:0 KernelExecution:568:14696 +129856595328424:129856597128257 0:0 CopyDeviceToHost:570:14696 +129856601984341:129856602751266 0:0 CopyHostToDevice:571:14696 +129856505180003:129856505975222 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :480 +129856505976980:129856505976981 14696:14696 MARK(name(before HIP LaunchKernel)) +129856505980587:129856505981234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :482 +129856505982935:129856505983566 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :483 +129856505985434:129856505988514 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :484 +129856505990096:129856505990097 14696:14696 MARK(name(after HIP LaunchKernel)) +129856505991997:129856507832334 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :486 +129856512649603:129856513382084 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :487 +129856513384599:129856513384600 14696:14696 MARK(name(before HIP LaunchKernel)) +129856513388119:129856513389080 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :489 +129856513391435:129856513392275 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :490 +129856513394697:129856513399367 14696:14696 
hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :491 +129856513401523:129856513401524 14696:14696 MARK(name(after HIP LaunchKernel)) +129856513404257:129856515239416 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :493 +129856519992571:129856520793180 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :494 +129856520794974:129856520794975 14696:14696 MARK(name(before HIP LaunchKernel)) +129856520798420:129856520799070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :496 +129856520800911:129856520801530 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :497 +129856520803611:129856520806841 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :498 +129856520808737:129856520808738 14696:14696 MARK(name(after HIP LaunchKernel)) +129856520810545:129856522657358 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :500 +129856527425346:129856528218117 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :501 +129856528219874:129856528219875 14696:14696 MARK(name(before HIP LaunchKernel)) +129856528221975:129856528222627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :503 +129856528224439:129856528225291 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :504 +129856528227108:129856528230172 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :505 +129856528231752:129856528231753 14696:14696 MARK(name(after HIP 
LaunchKernel)) +129856528233473:129856530074548 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :507 +129856534899214:129856535681957 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :508 +129856535683676:129856535683677 14696:14696 MARK(name(before HIP LaunchKernel)) +129856535686401:129856535687061 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :510 +129856535688790:129856535689423 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :511 +129856535691153:129856535694294 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :512 +129856535695868:129856535695869 14696:14696 MARK(name(after HIP LaunchKernel)) +129856535697671:129856537541753 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :514 +129856542387175:129856543225418 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :515 +129856543227192:129856543227193 14696:14696 MARK(name(before HIP LaunchKernel)) +129856543230911:129856543231570 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :517 +129856543233243:129856543233871 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :518 +129856543235930:129856543238762 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :519 +129856543240359:129856543240360 14696:14696 MARK(name(after HIP LaunchKernel)) +129856543242179:129856545084137 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :521 +129856549857104:129856550696919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, 
sizeBytes=4194304, kind=1) :522 +129856550698874:129856550698875 14696:14696 MARK(name(before HIP LaunchKernel)) +129856550702196:129856550702852 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :524 +129856550704612:129856550705254 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :525 +129856550707079:129856550709869 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :526 +129856550711442:129856550711443 14696:14696 MARK(name(after HIP LaunchKernel)) +129856550713182:129856552568840 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :528 +129856557336788:129856558182426 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :529 +129856558184195:129856558184196 14696:14696 MARK(name(before HIP LaunchKernel)) +129856558187727:129856558188380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :531 +129856558190122:129856558190752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :532 +129856558192774:129856558195554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :533 +129856558197324:129856558197325 14696:14696 MARK(name(after HIP LaunchKernel)) +129856558199234:129856560041419 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :535 +129856564809360:129856565545640 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :536 +129856565547393:129856565547394 14696:14696 MARK(name(before HIP LaunchKernel)) +129856565549636:129856565550299 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :538 
+129856565551969:129856565552581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :539 +129856565554301:129856565557438 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :540 +129856565559047:129856565559048 14696:14696 MARK(name(after HIP LaunchKernel)) +129856565560847:129856567411065 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :542 +129856572215770:129856572957492 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :543 +129856572959234:129856572959235 14696:14696 MARK(name(before HIP LaunchKernel)) +129856572962526:129856572963184 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :545 +129856572964912:129856572965546 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :546 +129856572967421:129856572970453 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :547 +129856572972097:129856572972098 14696:14696 MARK(name(after HIP LaunchKernel)) +129856572974076:129856574823083 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :549 +129856579588261:129856580372449 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :550 +129856580374262:129856580374263 14696:14696 MARK(name(before HIP LaunchKernel)) +129856580376547:129856580377227 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :552 +129856580378975:129856580379619 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :553 +129856580381546:129856580384467 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :554 +129856580386225:129856580386226 14696:14696 MARK(name(after HIP LaunchKernel)) +129856580388205:129856582240020 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :556 +129856587022783:129856587805709 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :557 +129856587807440:129856587807441 14696:14696 MARK(name(before HIP LaunchKernel)) +129856587811171:129856587811825 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :559 +129856587813530:129856587814170 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :560 +129856587816040:129856587819243 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :561 +129856587820912:129856587820913 14696:14696 MARK(name(after HIP LaunchKernel)) +129856587822927:129856589666874 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :563 +129856594433516:129856595273993 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :564 +129856595275800:129856595275801 14696:14696 MARK(name(before HIP LaunchKernel)) +129856595278990:129856595279652 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :566 +129856595281384:129856595282018 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :567 +129856595283991:129856595287449 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :568 +129856595289101:129856595289102 14696:14696 MARK(name(after HIP LaunchKernel)) +129856595291045:129856597140491 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :570 +129856601919460:129856602754655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :571 +129856602756445:129856602756446 14696:14696 MARK(name(before HIP LaunchKernel)) +129856602769740:129856602770661 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :573 +129856602772396:129856602773016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :574 +129856602775079:129856602778192 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :575 +129856602779755:129856602779756 14696:14696 MARK(name(after HIP LaunchKernel)) PASSED! ## Iteration (17) ################# PASSED! @@ -419,33 +998,6 @@ PASSED! ## Iteration (13) ################# PASSED! ## Iteration (12) ################# -3802702503514682:3802702505171794 0:0 CopyHostToDevice:557:1991 -3802702505267652:3802702506468157 0:0 KernelExecution:561:1991 -3802702505243905:3802702507802814 0:0 CopyDeviceToHost:563:1991 -3802702514761386:3802702516425008 0:0 CopyHostToDevice:564:1991 -3802702516524080:3802702517731252 0:0 KernelExecution:568:1991 -3802702516500289:3802702519068477 0:0 CopyDeviceToHost:570:1991 -3802702526022439:3802702527700141 0:0 CopyHostToDevice:571:1991 -3802702527794104:3802702528982164 0:0 KernelExecution:575:1991 -3802702527771042:3802702530315231 0:0 CopyDeviceToHost:577:1991 -3802702537274272:3802702538953635 0:0 CopyHostToDevice:578:1991 -3802702539050334:3802702540254987 0:0 KernelExecution:582:1991 -3802702539025425:3802702541587134 0:0 CopyDeviceToHost:584:1991 -3802702548553016:3802702550225609 0:0 CopyHostToDevice:585:1991 -3802702550319337:3802702551518805 0:0 KernelExecution:589:1991 -3802702550295569:3802702552853758 0:0 CopyDeviceToHost:591:1991 -3802702559816479:3802702561495482 
0:0 CopyHostToDevice:592:1991 -3802702561589855:3802702562788137 0:0 KernelExecution:596:1991 -3802702561565542:3802702564108601 0:0 CopyDeviceToHost:598:1991 -3802702571069693:3802702572746995 0:0 CopyHostToDevice:599:1991 -3802702572840650:3802702574041747 0:0 KernelExecution:603:1991 -3802702572817856:3802702575375565 0:0 CopyDeviceToHost:605:1991 -3802702582343137:3802702584028249 0:0 CopyHostToDevice:606:1991 -3802702584131111:3802702585314874 0:0 KernelExecution:610:1991 -3802702584098390:3802702586648988 0:0 CopyDeviceToHost:612:1991 -3802702593620890:3802702595300582 0:0 CopyHostToDevice:613:1991 -3802702595394737:3802702596603391 0:0 KernelExecution:617:1991 -3802702595371233:3802702597936882 0:0 CopyDeviceToHost:619:1991 PASSED! ## Iteration (11) ################# PASSED! @@ -460,37 +1012,143 @@ PASSED! ## Iteration (6) ################# PASSED! ## Iteration (5) ################# +129856602843500:129856604064780 0:0 KernelExecution:575:14696 +129856602815760:129856604621212 0:0 CopyDeviceToHost:577:14696 +129856609545997:129856610317997 0:0 CopyHostToDevice:578:14696 +129856610406851:129856611631491 0:0 KernelExecution:582:14696 +129856610379025:129856612168754 0:0 CopyDeviceToHost:584:14696 +129856616987475:129856617701731 0:0 CopyHostToDevice:585:14696 +129856617790176:129856619014496 0:0 KernelExecution:589:14696 +129856617761809:129856619559063 0:0 CopyDeviceToHost:591:14696 +129856624349579:129856625289209 0:0 CopyHostToDevice:592:14696 +129856625377836:129856626603916 0:0 KernelExecution:596:14696 +129856625350001:129856627147692 0:0 CopyDeviceToHost:598:14696 +129856632033149:129856632742303 0:0 CopyHostToDevice:599:14696 +129856632836527:129856634057647 0:0 KernelExecution:603:14696 +129856632808948:129856634598487 0:0 CopyDeviceToHost:605:14696 +129856639443412:129856640151030 0:0 CopyHostToDevice:606:14696 +129856640260250:129856641484890 0:0 KernelExecution:610:14696 +129856640232509:129856642041965 0:0 CopyDeviceToHost:612:14696 
+129856646912100:129856647619752 0:0 CopyHostToDevice:613:14696 +129856647705914:129856648930874 0:0 KernelExecution:617:14696 +129856647678197:129856649476287 0:0 CopyDeviceToHost:619:14696 +129856654338593:129856655101879 0:0 CopyHostToDevice:620:14696 +129856655189659:129856656412699 0:0 KernelExecution:624:14696 +129856655161891:129856656960409 0:0 CopyDeviceToHost:626:14696 +129856661822483:129856662586330 0:0 CopyHostToDevice:627:14696 +129856662679432:129856663900712 0:0 KernelExecution:631:14696 +129856662650940:129856664447428 0:0 CopyDeviceToHost:633:14696 +129856669274444:129856670036595 0:0 CopyHostToDevice:634:14696 +129856670129015:129856671350615 0:0 KernelExecution:638:14696 +129856670101388:129856671895354 0:0 CopyDeviceToHost:640:14696 +129856676687339:129856677401038 0:0 CopyHostToDevice:641:14696 +129856677491350:129856678712950 0:0 KernelExecution:645:14696 +129856677463387:129856679258027 0:0 CopyDeviceToHost:647:14696 +129856684088485:129856684823542 0:0 CopyHostToDevice:648:14696 +129856684910895:129856686132975 0:0 KernelExecution:652:14696 +129856684882539:129856686675228 0:0 CopyDeviceToHost:654:14696 +129856691574066:129856692284982 0:0 CopyHostToDevice:655:14696 +129856692371897:129856693594617 0:0 KernelExecution:659:14696 +129856692344278:129856694142257 0:0 CopyDeviceToHost:661:14696 +129856699000899:129856699713058 0:0 CopyHostToDevice:662:14696 +129856699797526:129856701023446 0:0 KernelExecution:666:14696 +129856699769937:129856701569372 0:0 CopyDeviceToHost:668:14696 +129856602781709:129856604636152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :577 +129856609479851:129856610321075 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :578 +129856610323078:129856610323079 14696:14696 MARK(name(before HIP LaunchKernel)) +129856610326500:129856610327162 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :580 
+129856610328857:129856610329498 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :581 +129856610331492:129856610334664 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :582 +129856610336290:129856610336291 14696:14696 MARK(name(after HIP LaunchKernel)) +129856610338048:129856612222255 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :584 +129856616969217:129856617705105 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :585 +129856617706989:129856617706990 14696:14696 MARK(name(before HIP LaunchKernel)) +129856617710485:129856617711142 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :587 +129856617712846:129856617713491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :588 +129856617715518:129856617718644 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :589 +129856617720274:129856617720275 14696:14696 MARK(name(after HIP LaunchKernel)) +129856617722118:129856619570993 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :591 +129856624331436:129856625292310 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :592 +129856625294207:129856625294208 14696:14696 MARK(name(before HIP LaunchKernel)) +129856625297113:129856625297761 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :594 +129856625299459:129856625300093 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :595 +129856625301835:129856625305409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :596 +129856625307116:129856625307117 14696:14696 MARK(name(after HIP LaunchKernel)) +129856625309051:129856627159676 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :598 +129856631962417:129856632745795 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :599 +129856632747622:129856632747623 14696:14696 MARK(name(before HIP LaunchKernel)) +129856632761013:129856632761762 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :601 +129856632763565:129856632764219 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :602 +129856632766094:129856632769110 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :603 +129856632770707:129856632770708 14696:14696 MARK(name(after HIP LaunchKernel)) +129856632772662:129856634610068 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :605 +129856639375744:129856640154106 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :606 +129856640155933:129856640155934 14696:14696 MARK(name(before HIP LaunchKernel)) +129856640159565:129856640160216 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :608 +129856640161841:129856640162476 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :609 +129856640164410:129856640167293 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :610 +129856640168886:129856640168887 14696:14696 MARK(name(after HIP LaunchKernel)) +129856640170703:129856642054780 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :612 +129856646841774:129856647623131 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :613 +129856647624849:129856647624850 14696:14696 MARK(name(before HIP LaunchKernel)) +129856647628076:129856647628742 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :615 +129856647630426:129856647631050 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :616 +129856647632957:129856647636281 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :617 +129856647637872:129856647637873 14696:14696 MARK(name(after HIP LaunchKernel)) +129856647639599:129856649488719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :619 +129856654273909:129856655105030 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :620 +129856655106878:129856655106879 14696:14696 MARK(name(before HIP LaunchKernel)) +129856655109847:129856655110497 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :622 +129856655112292:129856655112914 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :623 +129856655114757:129856655118162 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :624 +129856655119835:129856655119836 14696:14696 MARK(name(after HIP LaunchKernel)) +129856655121792:129856656973292 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :626 PASSED! 
## Iteration (4) ################# +129856661755424:129856662589447 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :627 +129856662591236:129856662591237 14696:14696 MARK(name(before HIP LaunchKernel)) +129856662604066:129856662604831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :629 +129856662606611:129856662607261 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :630 +129856662608995:129856662611988 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :631 +129856662613644:129856662613645 14696:14696 MARK(name(after HIP LaunchKernel)) +129856662615584:129856664462467 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :633 +129856669256336:129856670039683 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :634 +129856670041634:129856670041635 14696:14696 MARK(name(before HIP LaunchKernel)) +129856670054499:129856670055254 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :636 +129856670056982:129856670057615 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :637 +129856670059351:129856670062513 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :638 +129856670064113:129856670064114 14696:14696 MARK(name(after HIP LaunchKernel)) +129856670066200:129856671906923 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :640 +129856676668791:129856677404223 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :641 +129856677406068:129856677406069 14696:14696 MARK(name(before HIP LaunchKernel)) 
+129856677408812:129856677409484 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :643 +129856677411095:129856677411722 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :644 +129856677413461:129856677416941 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :645 +129856677418503:129856677418504 14696:14696 MARK(name(after HIP LaunchKernel)) +129856677420242:129856679269939 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :647 +129856684019418:129856684826552 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :648 +129856684828363:129856684828364 14696:14696 MARK(name(before HIP LaunchKernel)) +129856684832034:129856684832695 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :650 +129856684834368:129856684834970 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :651 +129856684836877:129856684839963 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :652 +129856684841560:129856684841561 14696:14696 MARK(name(after HIP LaunchKernel)) +129856684843320:129856686688518 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :654 +129856691504696:129856692288950 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :655 +129856692290798:129856692290799 14696:14696 MARK(name(before HIP LaunchKernel)) +129856692292859:129856692293513 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :657 +129856692295227:129856692295860 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) 
:658 +129856692297819:129856692300821 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :659 +129856692302355:129856692302356 14696:14696 MARK(name(after HIP LaunchKernel)) +129856692304530:129856694153679 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :661 +129856698928289:129856699716162 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :662 +129856699717890:129856699717891 14696:14696 MARK(name(before HIP LaunchKernel)) +129856699720061:129856699720715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :664 +129856699722330:129856699722941 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :665 +129856699724836:129856699728198 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :666 +129856699729953:129856699729954 14696:14696 MARK(name(after HIP LaunchKernel)) +129856699731887:129856701581422 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :668 PASSED! 
## Iteration (3) ################# -3802702605023015:3802702606699537 0:0 CopyHostToDevice:620:1991 -3802702606793386:3802702607994484 0:0 KernelExecution:624:1991 -3802702606770418:3802702609331847 0:0 CopyDeviceToHost:626:1991 -3802702616295619:3802702617971351 0:0 CopyHostToDevice:627:1991 -3802702618064582:3802702619276198 0:0 KernelExecution:631:1991 -3802702618041252:3802702620593170 0:0 CopyDeviceToHost:633:1991 -3802702627572022:3802702629249514 0:0 CopyHostToDevice:634:1991 -3802702629343204:3802702630550228 0:0 KernelExecution:638:1991 -3802702629319715:3802702631886524 0:0 CopyDeviceToHost:640:1991 -3802702638854896:3802702640514568 0:0 CopyHostToDevice:641:1991 -3802702640601153:3802702641794250 0:0 KernelExecution:645:1991 -3802702640583338:3802702643131137 0:0 CopyDeviceToHost:647:1991 -3802702650106259:3802702651784942 0:0 CopyHostToDevice:648:1991 -3802702651876671:3802702653079250 0:0 KernelExecution:652:1991 -3802702651853582:3802702654414351 0:0 CopyDeviceToHost:654:1991 -3802702661383522:3802702663061155 0:0 CopyHostToDevice:655:1991 -3802702663154356:3802702664347453 0:0 KernelExecution:659:1991 -3802702663130645:3802702665680984 0:0 CopyDeviceToHost:661:1991 -3802702672630496:3802702674303238 0:0 CopyHostToDevice:662:1991 -3802702674398093:3802702675599190 0:0 KernelExecution:666:1991 -3802702674374489:3802702676932868 0:0 CopyDeviceToHost:668:1991 -3802702683898880:3802702685606503 0:0 CopyHostToDevice:669:1991 -3802702685701165:3802702686898410 0:0 KernelExecution:673:1991 -3802702685678193:3802702688219002 0:0 CopyDeviceToHost:675:1991 -3802702695162453:3802702696838515 0:0 CopyHostToDevice:676:1991 -3802702696932444:3802702698137097 0:0 KernelExecution:680:1991 -3802702696909796:3802702699473165 0:0 CopyDeviceToHost:682:1991 PASSED! ## Iteration (2) ################# PASSED! @@ -498,12 +1156,55 @@ PASSED! PASSED! ## Iteration (0) ################# PASSED! 
-3802702706580728:3802702708245350 0:0 CopyHostToDevice:683:1991 -3802702708346791:3802702709549370 0:0 KernelExecution:687:1991 -3802702708322181:3802702710885410 0:0 CopyDeviceToHost:689:1991 -3802702717849822:3802702719525044 0:0 CopyHostToDevice:690:1991 -3802702719618857:3802702720813139 0:0 KernelExecution:694:1991 -3802702719594825:3802702722149644 0:0 CopyDeviceToHost:696:1991 -3802702729111215:3802702730788167 0:0 CopyHostToDevice:697:1991 -3802702730881622:3802702732076497 0:0 KernelExecution:701:1991 -3802702730858498:3802702733412517 0:0 CopyDeviceToHost:703:1991 +129856706468741:129856707235310 0:0 CopyHostToDevice:669:14696 +129856707327230:129856708548510 0:0 KernelExecution:673:14696 +129856707299810:129856709098218 0:0 CopyDeviceToHost:675:14696 +129856713958124:129856714730788 0:0 CopyHostToDevice:676:14696 +129856714818472:129856716040872 0:0 KernelExecution:680:14696 +129856714790211:129856716592662 0:0 CopyDeviceToHost:682:14696 +129856721429109:129856722193080 0:0 CopyHostToDevice:683:14696 +129856722282194:129856723505714 0:0 KernelExecution:687:14696 +129856722254384:129856724056420 0:0 CopyDeviceToHost:689:14696 +129856728891611:129856729607012 0:0 CopyHostToDevice:690:14696 +129856729693911:129856730917431 0:0 KernelExecution:694:14696 +129856729665766:129856731460761 0:0 CopyDeviceToHost:696:14696 +129856736249266:129856736963101 0:0 CopyHostToDevice:697:14696 +129856737053267:129856738276147 0:0 KernelExecution:701:14696 +129856737025461:129856738822547 0:0 CopyDeviceToHost:703:14696 +129856706409352:129856707238410 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :669 +129856707240341:129856707240342 14696:14696 MARK(name(before HIP LaunchKernel)) +129856707253495:129856707254390 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :671 +129856707256214:129856707256878 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :672 
+129856707258659:129856707261885 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :673 +129856707263518:129856707263519 14696:14696 MARK(name(after HIP LaunchKernel)) +129856707265698:129856709110388 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :675 +129856713891418:129856714734007 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :676 +129856714735794:129856714735795 14696:14696 MARK(name(before HIP LaunchKernel)) +129856714739058:129856714739715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :678 +129856714741339:129856714741972 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :679 +129856714743986:129856714747316 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :680 +129856714748993:129856714748994 14696:14696 MARK(name(after HIP LaunchKernel)) +129856714750976:129856716607126 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :682 +129856721364192:129856722196489 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :683 +129856722198322:129856722198323 14696:14696 MARK(name(before HIP LaunchKernel)) +129856722202102:129856722202759 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :685 +129856722204452:129856722205080 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :686 +129856722207098:129856722210100 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :687 
+129856722211652:129856722211653 14696:14696 MARK(name(after HIP LaunchKernel)) +129856722213452:129856724068250 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :689 +129856728873958:129856729610520 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :690 +129856729612474:129856729612475 14696:14696 MARK(name(before HIP LaunchKernel)) +129856729615953:129856729616618 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :692 +129856729618275:129856729618880 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :693 +129856729620844:129856729623983 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :694 +129856729625525:129856729625526 14696:14696 MARK(name(after HIP LaunchKernel)) +129856729627363:129856731472859 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :696 +129856736212718:129856736966611 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :697 +129856736968384:129856736968385 14696:14696 MARK(name(before HIP LaunchKernel)) +129856736971498:129856736972186 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :699 +129856736973934:129856736974581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :700 +129856736976433:129856736979849 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :701 +129856736981559:129856736981560 14696:14696 MARK(name(after HIP LaunchKernel)) +129856736983603:129856738834349 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :703 +129856743571751:129856743607276 
14696:14696 hipFree(ptr=0x7fd65ce00000) :704 +129856743609591:129856743621235 14696:14696 hipFree(ptr=0x7fd65c800000) :705 diff --git a/test/golden_traces/tests_trace_cmp_levels.txt b/test/golden_traces/tests_trace_cmp_levels.txt index 5e6dbaa7..5311d813 100644 --- a/test/golden_traces/tests_trace_cmp_levels.txt +++ b/test/golden_traces/tests_trace_cmp_levels.txt @@ -10,3 +10,5 @@ MatrixTranspose_hip_flush_trace --check-order .* MatrixTranspose_kfd_trace --check-events .* ctrl_hsa_trace --check-event .* ctrl_hsa_input_trace --check-event .* +hsa_co_trace --check-none +code_obj_trace --check-none diff --git a/test/run.sh b/test/run.sh index c5931061..577bdd30 100755 --- a/test/run.sh +++ b/test/run.sh @@ -151,6 +151,13 @@ echo " From c1632440e704aa3f62ede3d7a979ee247beabd1d Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 13 Oct 2020 16:43:18 -0500 Subject: [PATCH 19/38] SWDEV-254329 : extending debug trace with timestamps, cmake option '-DCMAKE_DEBUG_TRACE=1' Change-Id: Id16c01a6c00f6384c37fa9b5a9709a5e98e1fb57 --- src/core/roctracer.cpp | 23 ++++++++++++++--------- test/tool/tracer_tool.cpp | 12 ++++++------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 21203f91..891b005e 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -93,7 +93,6 @@ THE SOFTWARE. 
#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") - static inline uint32_t GetPid() { return syscall(__NR_getpid); } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -172,6 +171,9 @@ void RestoreHsaApi() { } namespace roctracer { +// timestamp definitino +typedef hsa_rt_utils::Timer::timestamp_t timestamp_t; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; @@ -347,6 +349,8 @@ void* HIP_SyncApiDataCallback( const void* callback_data, void* arg) { + static hsa_rt_utils::Timer timer; + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); @@ -392,8 +396,8 @@ void* HIP_SyncApiDataCallback( } const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); - DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0); + DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) time_ns(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timer.timestamp_ns()); return ret; } @@ -405,6 +409,7 @@ void* HIP_SyncActivityCallback( void* arg) { static hsa_rt_utils::Timer timer; + const timestamp_t timestamp_ns = timer.timestamp_ns(); void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); @@ -435,7 +440,7 @@ void* HIP_SyncActivityCallback( // Filing record info record->domain = ACTIVITY_DOMAIN_HIP_API; record->op = op_id; - record->begin_ns = timer.timestamp_ns(); + record->begin_ns = timestamp_ns; // Correlation ID generating uint64_t correlation_id = data->correlation_id; @@ -460,7 +465,7 @@ void* HIP_SyncActivityCallback( } // Filing record info - record->end_ns = timer.timestamp_ns(); + record->end_ns = timestamp_ns; record->process_id = syscall(__NR_getpid); record->thread_id = syscall(__NR_gettid); @@ -484,8 +489,8 @@ void* HIP_SyncActivityCallback( } const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); - DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0); + DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timestamp_ns); return ret; } @@ -502,8 +507,8 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { pool->Write(*record_ptr); const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind); - DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d)\n", - name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id); + DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d) beg_ns(%lu) end_ns(%lu)\n", + name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id, record_ptr->begin_ns, record_ptr->end_ns); } // Open output file diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index d6be6f4f..1aeb75a3 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -454,8 +454,8 @@ void hip_api_callback( } const char * name = roctracer_op_string(domain, cid, 0); - DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu)\n", - name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id); + DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu) timestamp(%lu)\n", + name, data->phase, cid, data, entry, (entry) ? 
entry->name : NULL, data->correlation_id, timestamp); } void mark_api_callback( @@ -500,8 +500,8 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { oss << std::dec << rec_ss.str() << " " << str; const char * name = roctracer_op_string(entry->domain, entry->cid, 0); - DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu))\n", - name, entry->domain, entry->cid, entry, entry->name, correlation_id); + DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu) beg(%lu) end(%lu))\n", + name, entry->domain, entry->cid, entry, entry->name, correlation_id, begin_timestamp, end_timestamp); if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING @@ -631,8 +631,8 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { while (record < end_record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); - DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu)\n", - name, record->domain, record->op, record->kind, record, record->correlation_id); + DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu) beg(%lu) end(%lu)\n", + name, record->domain, record->op, record->kind, record, record->correlation_id, record->begin_ns, record->end_ns); switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: From 2293663e097cfe806412b7cbb8a464b492fdb97a Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 15 Oct 2020 19:25:09 -0400 Subject: [PATCH 20/38] code obj tests: adding load base Change-Id: I5fdb25b67eaae43b3c01cd8de3824f9343c37794 --- test/app/codeobj_test.cpp | 8 ++++---- test/app/hsaco_test.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/app/codeobj_test.cpp b/test/app/codeobj_test.cpp index 086bcfb6..124715cd 100644 --- a/test/app/codeobj_test.cpp +++ b/test/app/codeobj_test.cpp @@ -44,15 +44,15 @@ void 
check_status(roctracer_status_t status) { // codeobj callback void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) { const hsa_evt_data_t* evt_data = reinterpret_cast(data); - const uint32_t uri_length = evt_data->codeobj.uri_length; const char* uri = evt_data->codeobj.uri; - printf("codeobj_callback domain(%u) cid(%u): load_delta(0x%lx) load_size(0x%lx) uri_length(%u) uri(\"%s\")\n", + printf("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n", domain, cid, - evt_data->codeobj.load_delta, + evt_data->codeobj.load_base, evt_data->codeobj.load_size, - uri_length, + evt_data->codeobj.load_delta, uri); + free((void*)uri); fflush(stdout); } diff --git a/test/app/hsaco_test.cpp b/test/app/hsaco_test.cpp index 0f2e42ad..23200137 100644 --- a/test/app/hsaco_test.cpp +++ b/test/app/hsaco_test.cpp @@ -50,11 +50,16 @@ hsa_status_t code_object_callback( { printf("code_object_callback\n"); fflush(stdout); + uint64_t load_base = 0; uint64_t load_size = 0; uint64_t load_delta = 0; uint32_t uri_len = 0; char* uri_str = NULL; + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &load_base)); HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, @@ -79,11 +84,14 @@ hsa_status_t code_object_callback( HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, uri_str)); + printf("load_base(0x%lx)\n", load_base); fflush(stdout); printf("load_size(0x%lx)\n", load_size); fflush(stdout); printf("load_delta(0x%lx)\n", load_delta); fflush(stdout); printf("uri_len(%u)\n", uri_len); fflush(stdout); printf("uri_str(\"%s\")\n", uri_str); fflush(stdout); + free(uri_str); + return HSA_STATUS_SUCCESS; } From 68b1850c9267368719d63ab7a903df830641fd0a Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Wed, 18 Nov 2020 16:47:45 -0500 
Subject: [PATCH 21/38] SWDEV-255938 NEW - added regex and depth support to ostream ops Change-Id: I292255adab3a70fa00a1dd5685b788521687f35b --- inc/roctracer_hip.h | 2 +- inc/roctracer_hsa.h | 17 --- script/gen_ostream_ops.py | 215 ++++++++++++++++---------------------- script/hsaap.py | 5 +- script/kfdap.py | 4 +- src/CMakeLists.txt | 5 +- 6 files changed, 98 insertions(+), 150 deletions(-) diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 091f3279..553ec1b8 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -37,8 +37,8 @@ inline static std::ostream& operator<<(std::ostream& out, const char& v) { } #endif // __cplusplus -#include #include +#include #include #include diff --git a/inc/roctracer_hsa.h b/inc/roctracer_hsa.h index d9daa5e5..1e50c3ab 100644 --- a/inc/roctracer_hsa.h +++ b/inc/roctracer_hsa.h @@ -66,23 +66,6 @@ typedef hsa_support::ops_properties_t hsa_ops_properties_t; #include "hsa_ostream_ops.h" -std::ostream& operator<<(std::ostream& out, const hsa_amd_memory_pool_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} - -std::ostream& operator<<(std::ostream& out, const hsa_ext_image_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} - -std::ostream& operator<<(std::ostream& out, const hsa_ext_sampler_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} #else // !__cplusplus typedef void* hsa_amd_queue_intercept_handler; diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index c8f23629..fcc379a6 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import os, sys, re import CppHeaderParser @@ -29,11 +29,6 @@ '*/\n' -header = 'template \n' + \ -'struct output_streamer {\n' + \ -' inline static std::ostream& put(std::ostream& out, const T& v) { return out; }\n' + \ -'};\n\n' - header_basic = \ 'template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + 
\ @@ -43,8 +38,10 @@ ' return out; }\n' structs_analyzed = {} -global_ops_hip = '' +global_ops = '' global_str = '' +output_filename_h = None +apiname = "" # process_struct traverses recursively all structs to extract all fields def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname): @@ -61,7 +58,6 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a return if cppHeader_struct in structs_analyzed: return - structs_analyzed[cppHeader_struct] = 1 for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))): key = 'name' @@ -90,17 +86,13 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a str = '' if "union" not in mtype: - if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + " = \");\n" - str += " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" - str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" - else: - str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \"" + name + " = \");\n" - if array_size == "": - str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + ">::put(out, v." + name + ");\n" - else: - str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + "[" + array_size + "]>::put(out, v." + name + ");\n" - str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \", \");\n" + indent = "" + str += " if (regex_match (\"" + cppHeader_struct + "::" + name + "\", std::regex(" + apiname.upper() + "_structs_regex))) {\n" + indent = " " + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + "=\");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, v." 
+ name + ");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" + str += " }\n" if "void" not in mtype: global_str += str else: @@ -113,133 +105,104 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) # Parses API header file and generates ostream ops files ostream_ops.h -def gen_cppheader(infilepath, outfilepath, structs_depth): +def gen_cppheader(infilepath, outfilepath, rank): # infilepath: API Header file to be parsed # outfilepath: Output file where ostream operators are written - global_ops_hip = '' - global_ops_hsa = '' + global global_ops + global output_filename_h + global apiname global global_str try: cppHeader = CppHeaderParser.CppHeader(infilepath) except CppHeaderParser.CppParseError as e: print(e) sys.exit(1) - mpath = os.path.dirname(outfilepath) - if mpath == "": - mpath = os.getcwd() - apiname = outfilepath.replace(mpath+"/","") - apiname = apiname.replace("_ostream_ops.h","") - apiname = apiname.upper() - f = open(outfilepath,"w+") - f.write("// automatically generated\n") - f.write(LICENSE + '\n') - header_s = \ - '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#ifdef __cplusplus\n' + \ - '#include \n' + \ - '\n' + \ - '#include "roctracer.h"\n' - if apiname.lower() == 'hip': - header_s = header_s + '\n' + \ - '#include "hip/hip_runtime_api.h"\n' + \ - '#include "hip/hcc_detail/hip_vector_types.h"\n\n' - - f.write(header_s) - f.write('\n') - f.write('namespace roctracer {\n') - f.write('namespace ' + apiname.lower() + '_support {\n') - if structs_depth != -1: - f.write('static int ' + apiname.upper() + '_depth_max = ' + str(structs_depth) + ';\n') - f.write('// begin ostream ops for '+ apiname + ' \n') - if apiname.lower() == "hip" or apiname.lower() == "hsa": - f.write("// basic ostream ops\n") - f.write(header_basic) - f.write("// End 
of basic ostream ops\n\n") - else: - f.write(header) + if rank == 0 or rank == 2: + mpath = os.path.dirname(outfilepath) + if mpath == "": + mpath = os.getcwd() + apiname = outfilepath.replace(mpath + "/","") + output_filename_h = open(outfilepath,"w+") + apiname = apiname.replace("_ostream_ops.h","") + apiname = apiname.upper() + output_filename_h.write("// automatically generated\n") + output_filename_h.write(LICENSE + '\n') + header_s = \ + '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '#ifdef __cplusplus\n' + \ + '#include \n' + \ + '\n' + \ + '#include "roctracer.h"\n' + header_s += '#include \n#include \n' + + output_filename_h.write(header_s) + output_filename_h.write('\n') + output_filename_h.write('namespace roctracer {\n') + output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') + output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') + output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') + output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \".*\";\n') + output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') + output_filename_h.write("// basic ostream ops\n") + output_filename_h.write(header_basic) + output_filename_h.write("// End of basic ostream ops\n\n") for c in cppHeader.classes: if "union" in c: continue - if apiname.lower() == 'hsa': - if c == 'max_align_t' or c == '__fsid_t': #already defined for hip + if c in structs_analyzed: continue - if len(cppHeader.classes[c]["properties"]["public"])!=0: - if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - f.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") - f.write("{\n") - f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") - if structs_depth != -1: - f.write(" " + apiname.upper() + "_depth_max++;\n") - f.write(" if (" + apiname.upper() + "_depth_max <= 
" + str(structs_depth) + ") {\n" ) - process_struct(f, c, cppHeader, "", apiname) - global_str = "\n".join(global_str.split("\n")[0:-2]) - if structs_depth != -1: #reindent - global_str = global_str.split('\n') - global_str = [' ' + line.lstrip() for line in global_str] - global_str = "\n".join(global_str) - f.write(global_str+"\n") - if structs_depth != -1: - f.write(" };\n") - f.write(" " + apiname.upper() + "_depth_max--;\n") - f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") - f.write(" return out;\n") - f.write("}\n") - global_str = '' - else: - f.write("\ntemplate<>\n") - f.write("struct output_streamer<" + c + "&> {\n") - f.write(" inline static std::ostream& put(std::ostream& out, "+c+"& v)\n") - f.write("{\n") - f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '{');\n") - if structs_depth != -1: - f.write(apiname.upper() + "_depth_max++;\n") - f.write(" if (" + apiname.upper() + "_depth_max <= " + str(structs_depth) + ") {\n" ) - process_struct(f, c, cppHeader, "", apiname) - global_str = "\n".join(global_str.split("\n")[0:-2]) - if structs_depth != -1: #reindent - global_str = global_str.split('\n') - global_str = [' ' + line.lstrip() for line in global_str] - global_str = "\n".join(global_str) - f.write(global_str+"\n") - if structs_depth != -1: - f.write(" };\n") - f.write(" " + apiname.upper() + "_depth_max--;\n") - f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '}');\n") - f.write(" return out;\n") - f.write("}\n") - f.write("};\n") - global_str = '' - if apiname.lower() == 'hip': - global_ops_hip += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" - if apiname.lower() == 'hsa': - global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + 
" return out;\n" + "}\n\n" - - footer = \ - '// end ostream ops for '+ apiname + ' \n' - footer += '};};\n\n' - f.write(footer) - f.write(global_ops_hip) - f.write(global_ops_hsa) - footer = '#endif //__cplusplus\n' + \ - '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - ' \n' - f.write(footer) - f.close() - print('File ' + outfilepath + ' generated') + if c == 'max_align_t' or c == '__fsid_t': # Skipping as it is defined in multiple domains + continue + if len(cppHeader.classes[c]["properties"]["public"]) != 0: + output_filename_h.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") + output_filename_h.write("{\n") + output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") + output_filename_h.write(" if (" + apiname.upper() + "_depth_max == -1 || " + apiname.upper() + "_depth_max_cnt <= " + apiname.upper() + "_depth_max" + ") {\n" ) + process_struct(output_filename_h, c, cppHeader, "", apiname) + global_str = "\n".join(global_str.split("\n")[0:-3]) + if global_str != '': global_str += "\n }\n" + output_filename_h.write(global_str) + output_filename_h.write(" };\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") + output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") + output_filename_h.write(" return out;\n") + output_filename_h.write("}\n") + global_str = '' + global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + + if rank == 1 or rank == 2: + footer = '// end ostream ops for '+ apiname + ' \n' + footer += '};};\n\n' + output_filename_h.write(footer) + output_filename_h.write(global_ops) + footer = '#endif //__cplusplus\n' + \ + '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + ' \n' + 
output_filename_h.write(footer) + output_filename_h.close() + print('File ' + outfilepath + ' generated') return parser = argparse.ArgumentParser(description='genOstreamOps.py: generates ostream operators for all typedefs in provided input file.') requiredNamed = parser.add_argument_group('Required arguments') -requiredNamed.add_argument('-in', metavar='file', help='Header file to be parsed', required=True) +requiredNamed.add_argument('-in', metavar='fileList', help='Comma separated list of header files to be parsed', required=True) requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) -requiredNamed.add_argument('-depth', metavar='N', type=int, help='Depth for nested structs', required=False) -structs_depth = 0 args = vars(parser.parse_args()) if __name__ == '__main__': - if args['depth'] != None: structs_depth = args['depth'] - gen_cppheader(args['in'], args['out'], structs_depth) + flist = args['in'].split(',') + if len(flist) == 1: + gen_cppheader(flist[0], args['out'],2) + else: + for i in range(len(flist)): + if i == 0: + gen_cppheader(flist[i], args['out'],0) + elif i == len(flist)-1: + gen_cppheader(flist[i], args['out'],1) + else: + gen_cppheader(flist[i], args['out'],-1) diff --git a/script/hsaap.py b/script/hsaap.py index 84ee9bbf..f9458ee9 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -456,7 +456,10 @@ def gen_out_stream(self, n, name, call, struct): for ind in range(len(arg_list)): arg_var = arg_list[ind] arg_val = 'api_data.args.' + call + '.' 
+ arg_var - self.content += ' out << ' + arg_val + if re.search(r'char\* ', struct['astr'][arg_var]): + self.content += ' out << "0x" << std::hex << (uint64_t)' + arg_val + else: + self.content += ' out << ' + arg_val ''' arg_item = struct['tlst'][ind] if re.search(r'\(\* ', arg_item): arg_pref = '' diff --git a/script/kfdap.py b/script/kfdap.py index 9f560a35..378ea126 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -494,8 +494,7 @@ def gen_out_stream(self, n, name, call, struct): arg_val = 'api_data.args.' + call + '.' + arg_var if re.search(r'MemFlags',arg_var): continue - self.content_h += ' typedef decltype(' + arg_val.replace("[]","") + ') arg_val_type_t' + str(ind) + ';\n' - self.content_h += ' roctracer::kfd_support::output_streamer::put(out, ' + arg_val.replace("[]","") + ')' + self.content_h += ' out << ' + arg_val.replace("[]","") if ind < len(arg_list)-1: self.content_h += ' << ", ";\n' else: self.content_h += ';\n' if struct['ret'] != 'void': @@ -512,7 +511,6 @@ def gen_out_stream(self, n, name, call, struct): self.content_h += ' return out;\n' self.content_h += '}\n' self.content_h += '#endif\n' - self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' # generate PUBLIC_API for all API fcts def gen_public_api(self, n, name, call, struct): diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e9c72f84..689ed637 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,14 +3,15 @@ set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) set ( GEN_SRC_DIR ${PROJECT_BINARY_DIR}/src ) execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h > ${GEN_INC_DIR}/hsa_ext_amd_pp.h" ) +execute_process ( 
COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) From ff8ac19b2c8d4414b35576d56e793bf43b1b01ac Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 23 Nov 2020 13:09:46 -0600 Subject: [PATCH 22/38] adding tests dry run to check if platform is in working state Change-Id: Ic430e3f959119983a65929fc70332e293cc3448d --- test/golden_traces/MatrixTranspose_dryrun_trace.txt | 0 test/golden_traces/ctrl_dryrun_trace.txt | 0 
test/golden_traces/tests_trace_cmp_levels.txt | 2 ++ test/run.sh | 4 ++++ 4 files changed, 6 insertions(+) create mode 100644 test/golden_traces/MatrixTranspose_dryrun_trace.txt create mode 100644 test/golden_traces/ctrl_dryrun_trace.txt diff --git a/test/golden_traces/MatrixTranspose_dryrun_trace.txt b/test/golden_traces/MatrixTranspose_dryrun_trace.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/golden_traces/ctrl_dryrun_trace.txt b/test/golden_traces/ctrl_dryrun_trace.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/golden_traces/tests_trace_cmp_levels.txt b/test/golden_traces/tests_trace_cmp_levels.txt index 5311d813..ed27e868 100644 --- a/test/golden_traces/tests_trace_cmp_levels.txt +++ b/test/golden_traces/tests_trace_cmp_levels.txt @@ -1,4 +1,6 @@ # dummy +MatrixTranspose_dryrun_trace --check-none +ctrl_dryrun_trace --check-none MatrixTranspose_ctest_trace --check-count .* MatrixTranspose_test_trace --check-count .* --ignore-count hsaKmt.* MatrixTranspose_hipaact_test_trace --check-count .* --ignore-count hsaKmt.*|hipMemcpy|__hipPushCallConfiguration|hipLaunchKernel|__hipPopCallConfiguration diff --git a/test/run.sh b/test/run.sh index 577bdd30..75a5a680 100755 --- a/test/run.sh +++ b/test/run.sh @@ -106,6 +106,10 @@ eval_test() { test_number=$((test_number + 1)) } +# Tests dry run +eval_test "MatrixTranspose dry run" ./test/MatrixTranspose MatrixTranspose_dryrun_trace +eval_test "ctrl dry run" ./test/hsa/ctrl ctrl_dryrun_trace + # Standalone test # rocTrecer is used explicitely by test eval_test "standalone C test" "LD_PRELOAD=libkfdwrapper64.so ./test/MatrixTranspose_ctest" MatrixTranspose_ctest_trace From d4cd09139b3ec9e38cc18e83627dc9ab6e793ca1 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 3 Nov 2020 10:39:33 -0600 Subject: [PATCH 23/38] Adding dumping of HSA handles Change-Id: I18e2cfdf2574110bffa09d30c7ac1d3941252939 --- src/util/hsa_rsrc_factory.cpp | 13 +++++++++++++ src/util/hsa_rsrc_factory.h | 2 ++ 
test/CMakeLists.txt | 2 +- test/tool/tracer_tool.cpp | 5 +++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index e1ef9268..cf172cae 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -742,6 +742,19 @@ hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t return hsa_api_.hsa_executable_freeze(executable, options);; } +void HsaRsrcFactory::DumpHandles(FILE* file) { + auto beg = agent_map_.begin(); + auto end = agent_map_.end(); + for (auto it = beg; it != end; ++it) { + const AgentInfo* agent_info = it->second; + fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu"); + if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle); + if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle); + if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle); + } + fflush(file); +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 466ccf1f..3bfeda68 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -439,6 +439,8 @@ class HsaRsrcFactory { return HSA_STATUS_SUCCESS; } + void DumpHandles(FILE* output_file); + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 148c60b0..a7511789 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,7 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "19b1191" ) +set ( HSA_REV "a4fcdae" ) ## test 
run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 1aeb75a3..218652e9 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1127,6 +1127,11 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1)); } + // Dumping HSA handles for agents and pools + FILE* handles_file_handle = open_output_file(output_prefix, "hsa_handles.txt"); + HsaRsrcFactory::Instance().DumpHandles(handles_file_handle); + close_output_file(handles_file_handle); + ONLOAD_TRACE_END(); return true; } From bf70a1a64555287eaef1755fb014297157764345 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 9 Dec 2020 22:16:43 -0500 Subject: [PATCH 24/38] SWDEV-264282 : fixing tracer_tool linking Change-Id: I0fd78c01595bbd506f42cf9dfb45f62b2124f704 --- test/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a7511789..e07e7d8b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -69,6 +69,14 @@ add_custom_target( mytest COMMAND sh -xc "cp ${TEST_DIR}/golden_traces/tests_trace_cmp_levels.txt ${PROJECT_BINARY_DIR}/test/" ) +## Build HSA test +execute_process ( COMMAND sh -xc "if [ ! 
-e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) +execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) +set ( TMP ${TEST_DIR} ) +set ( TEST_DIR ${HSA_TEST_DIR} ) +add_subdirectory ( ${HSA_TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) +set ( TEST_DIR ${TMP} ) + ## Util sources file( GLOB UTIL_SRC "${HSA_TEST_DIR}/util/*.cpp" ) @@ -95,12 +103,6 @@ add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) target_include_directories ( ${CO_LIB_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${GEN_INC_DIR} ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ) target_link_libraries ( ${CO_LIB_NAME} ${ROCTRACER_TARGET} c stdc++ ) -## Build HSA test -execute_process ( COMMAND sh -xc "if [ ! -e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) -execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) -set ( TEST_DIR ${HSA_TEST_DIR} ) -add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) - ## copying run script execute_process ( COMMAND sh -xc "cp ${RUN_SCRIPT} ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "ln -s run.sh ${PROJECT_BINARY_DIR}/run_ci.sh" ) From bfb90954b7e2f957d996489a087599dd124ec383 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 9 Dec 2020 23:07:41 -0500 Subject: [PATCH 25/38] calling python3 explictely Change-Id: I3dda55865bafa41cc6670e414b213f13a2a2a7ac --- script/check_trace.py | 10 ++++------ script/gen_ostream_ops.py | 2 -- script/hsaap.py | 1 - script/kfdap.py | 5 ++--- src/CMakeLists.txt | 10 +++++----- test/run.sh | 4 ++-- 6 files changed, 13 insertions(+), 19 deletions(-) diff --git a/script/check_trace.py b/script/check_trace.py index a4506a12..c10eb3c5 100644 --- a/script/check_trace.py +++ b/script/check_trace.py @@ -1,5 +1,3 @@ 
-#!/usr/bin/python - #Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. # #Permission is hereby granted, free of charge, to any person obtaining a copy @@ -200,10 +198,10 @@ def check_trace_status(tracename, verbose, check_trace_flag): events_order_r[tid] = events_order[tid] cnt = gen_events_info(trace,trace_level,no_events_cnt,events2ignore,events2chkcnt,events2chkord,verbose) if verbose: - print '\n' + rtrace + ':\n' - print cnt_r - print '\n' + trace + ':\n' - print cnt + print ('\n' + rtrace + ':\n') + print (cnt_r) + print ('\n' + trace + ':\n') + print (cnt) diff_strings(cnt_r, cnt, metric) if cnt_r == cnt: diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index fcc379a6..ae60be84 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import os, sys, re import CppHeaderParser import argparse diff --git a/script/hsaap.py b/script/hsaap.py index f9458ee9..e1a3d717 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -1,4 +1,3 @@ -#!/usr/bin/python from __future__ import print_function import os, sys, re diff --git a/script/kfdap.py b/script/kfdap.py index 378ea126..8de1d19e 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -1,4 +1,3 @@ -#!/usr/bin/python from __future__ import print_function import os, sys, re @@ -80,7 +79,7 @@ def __init__(self, header, name, full_fct): if not os.path.isfile(header): self.fatal("file '" + header + "' not found") - self.inp = open(header, 'r') + self.inp = open(header, 'r', encoding='utf-8') self.beg_pattern = re.compile(name) self.end_pattern = re.compile('.*\)\s*;\s*$'); @@ -146,7 +145,7 @@ def __init__(self, header, array, data, full_fct): if not os.path.isfile(header): self.fatal("file '" + header + "' not found") - self.inp = open(header, 'r') + self.inp = open(header, 'r', encoding='utf-8') self.end_pattern = re.compile('\)\s*;\s*$') self.data = data diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
689ed637..556ea16d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,13 +5,13 @@ execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h > ${GEN_INC_DIR}/hsa_ext_amd_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in 
${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) diff --git a/test/run.sh b/test/run.sh index 75a5a680..e8d89095 100755 --- a/test/run.sh +++ b/test/run.sh @@ -86,11 +86,11 @@ eval_test() { is_failed=0; else if [ $is_failed = 0 ] ; then - python ./test/check_trace.py -in $test_name -ck $check_trace_flag + python3 ./test/check_trace.py -in $test_name -ck $check_trace_flag is_failed=$? if [ $is_failed != 0 ] ; then echo "Trace checker error:" - python ./test/check_trace.py -v -in $test_name -ck $check_trace_flag + python3 ./test/check_trace.py -v -in $test_name -ck $check_trace_flag fi fi fi From 3c1a4b38385dd944257c279d5b689cddd599dc48 Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Wed, 18 Nov 2020 10:39:33 -0500 Subject: [PATCH 26/38] SWDEV-259683 HIP API records filtering Change-Id: I43ca5e022d2c055b6a9bc2c09b4276b490a4b986 --- src/core/roctracer.cpp | 14 ++++++++++++++ test/run.sh | 5 +++++ test/tool/tracer_tool.cpp | 16 ++++++++++++++-- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 891b005e..8bc5ea29 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -719,11 +719,25 @@ PUBLIC_API roctracer_status_t roctracer_op_code( switch (domain) { case ACTIVITY_DOMAIN_HSA_API: { *op = roctracer::hsa_support::GetApiCode(str); + if (*op == HSA_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } 
if (kind != NULL) *kind = 0; break; } case ACTIVITY_DOMAIN_KFD_API: { *op = roctracer::kfd_support::GetApiCode(str); + if (*op == KFD_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } + if (kind != NULL) *kind = 0; + break; + } + case ACTIVITY_DOMAIN_HIP_API: { + *op = hipApiIdByName(str); + if (*op == HIP_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } if (kind != NULL) *kind = 0; break; } diff --git a/test/run.sh b/test/run.sh index e8d89095..9a7ffc74 100755 --- a/test/run.sh +++ b/test/run.sh @@ -134,6 +134,11 @@ export ROCTRACER_DOMAIN="hip" eval_test "tool period test" "ROCP_CTRL_RATE=10:100000:1000000 ./test/MatrixTranspose" MatrixTranspose_hip_period_trace eval_test "tool flushing test" "ROCP_FLUSH_RATE=100000 ./test/MatrixTranspose" MatrixTranspose_hip_flush_trace +#API records filtering +echo "" > input.xml +export ROCP_INPUT=input.xml +eval_test "tool HIP test input" ./test/MatrixTranspose hip_input_trace + # HSA test export ROCTRACER_DOMAIN="hsa" # test trace diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 218652e9..cc5367d1 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -112,6 +112,7 @@ bool trace_pcs = false; // API trace vector std::vector hsa_api_vec; std::vector kfd_api_vec; +std::vector hip_api_vec; LOADER_INSTANTIATE(); TRACE_BUFFER_INSTANTIATE(); @@ -921,6 +922,7 @@ void tool_load() { found = true; trace_hip_api = true; trace_hip_activity = true; + hip_api_vec = api_vec; } if (name == "KFD") { found = true; @@ -1091,9 +1093,19 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, // Enable tracing if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); - ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + if 
(hip_api_vec.size() != 0) { + for (unsigned i = 0; i < hip_api_vec.size(); ++i) { + uint32_t cid = HIP_API_ID_NUMBER; + const char* api = hip_api_vec[i].c_str(); + ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL)); + ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); + printf(" %s", api); + } + } + else { + ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + } ROCTRACER_CALL(roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipModuleUnload)); - if (is_stats_opt) { const char* path = NULL; FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); From 36b1b2fad306968a2463d5b1f6526fa3b905968b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 22 Dec 2020 04:42:30 -0500 Subject: [PATCH 27/38] SWDEV-251491 : disabling tracing on exit Change-Id: Ifd5f0fbad70afa1e79da8b4b9aa639d899cbea76 --- test/tool/tracer_tool.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index cc5367d1..e9522f0b 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1024,10 +1024,18 @@ void tool_load() { ONLOAD_TRACE_END(); } +void exit_handler(int status, void* arg) { + ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); + tool_unload(); + ONLOAD_TRACE_END(); +} + // HSA-runtime tool on-load method extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { ONLOAD_TRACE_BEG(); + on_exit(exit_handler, NULL); + timer = new hsa_rt_utils::Timer(table->core_->hsa_system_get_info_fn); const char* output_prefix = getenv("ROCP_OUTPUT_DIR"); @@ -1101,11 +1109,10 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); printf(" 
%s", api); } - } - else { + } else { ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); } - ROCTRACER_CALL(roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipModuleUnload)); + if (is_stats_opt) { const char* path = NULL; FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); @@ -1116,6 +1123,7 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, } } } + if (trace_hip_activity) { hcc_activity_file_handle = open_output_file(output_prefix, "hcc_ops_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); @@ -1166,8 +1174,8 @@ extern "C" CONSTRUCTOR_API void constructor() { } extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); - roctracer_flush_buf(); tool_unload(); + roctracer_flush_buf(); if (hip_api_stats) hip_api_stats->dump(); if (hip_kernel_stats) hip_kernel_stats->dump(); From 7ea1e9a369782bbc4f4d1434d7b38624e27d6e0b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 25 Dec 2020 01:49:44 -0500 Subject: [PATCH 28/38] SWDEV-251491 : fixing tracing on exit Change-Id: I1bf2a6093331e7a08179b9f64394c5c49206ef0e --- src/core/roctracer.cpp | 25 ++++++++++++++----------- test/tool/tracer_tool.cpp | 2 ++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 8bc5ea29..272bad26 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -279,7 +279,8 @@ struct record_pair_t { roctracer_api_data_t data; record_pair_t() {}; }; -static thread_local std::stack record_pair_stack; +typedef std::stack record_pair_stack_t; +static thread_local record_pair_stack_t* record_pair_stack = NULL; // Correlation id storage static thread_local activity_correlation_id_t correlation_id_tls = 0; @@ -350,6 +351,7 @@ void* HIP_SyncApiDataCallback( void* arg) { static hsa_rt_utils::Timer timer; + if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t; 
void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); @@ -368,8 +370,8 @@ void* HIP_SyncApiDataCallback( // Allocating a record if NULL passed if (record == NULL) { if (data != NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback enter: record is NULL"); - record_pair_stack.push({}); - auto& top = record_pair_stack.top(); + record_pair_stack->push({}); + auto& top = record_pair_stack->top(); data = &(top.data.hip); data_ptr = const_cast(data); data_ptr->phase = phase; @@ -389,7 +391,7 @@ void* HIP_SyncApiDataCallback( ret = data_ptr; } else { // popping the record entry - if (!record_pair_stack.empty()) record_pair_stack.pop(); + if (!record_pair_stack->empty()) record_pair_stack->pop(); // Clearing correlatin ID correlation_id_tls = 0; @@ -397,7 +399,7 @@ void* HIP_SyncApiDataCallback( const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) time_ns(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0, timer.timestamp_ns()); + name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timer.timestamp_ns()); return ret; } @@ -410,6 +412,7 @@ void* HIP_SyncActivityCallback( { static hsa_rt_utils::Timer timer; const timestamp_t timestamp_ns = timer.timestamp_ns(); + if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t; void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); @@ -428,8 +431,8 @@ void* HIP_SyncActivityCallback( // Allocating a record if NULL passed if (record == NULL) { if (data != NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback enter: record is NULL"); - record_pair_stack.push({}); - auto& top = record_pair_stack.top(); + record_pair_stack->push({}); + auto& top = record_pair_stack->top(); record = &(top.record); data = &(top.data.hip); data_ptr = const_cast(data); @@ -459,8 +462,8 @@ void* HIP_SyncActivityCallback( // Getting record of stacked if (record == NULL) { - if (record_pair_stack.empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty"); - auto& top = record_pair_stack.top(); + if (record_pair_stack->empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty"); + auto& top = record_pair_stack->top(); record = &(top.record); } @@ -482,7 +485,7 @@ void* HIP_SyncActivityCallback( pool->Write(*record); // popping the record entry - if (!record_pair_stack.empty()) record_pair_stack.pop(); + if (!record_pair_stack->empty()) record_pair_stack->pop(); // Clearing correlatin ID correlation_id_tls = 0; @@ -490,7 +493,7 @@ void* HIP_SyncActivityCallback( const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timestamp_ns); + name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? data_ptr->correlation_id : 0, timestamp_ns); return ret; } diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index e9522f0b..c7e063ff 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1026,8 +1026,10 @@ void tool_load() { void exit_handler(int status, void* arg) { ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); +#if 0 tool_unload(); ONLOAD_TRACE_END(); +#endif } // HSA-runtime tool on-load method From 2a64bd062befab2e22fd0416a82c32873b241891 Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Thu, 25 Mar 2021 09:32:28 -0400 Subject: [PATCH 29/38] SWDEV-271503 Fixed core dump Change-Id: Ia582a27482581c3b81c42da0add9f6743898da6c --- script/gen_ostream_ops.py | 6 +++--- test/tool/tracer_tool.cpp | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index ae60be84..180103a5 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -85,7 +85,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a str = '' if "union" not in mtype: indent = "" - str += " if (regex_match (\"" + cppHeader_struct + "::" + name + "\", std::regex(" + apiname.upper() + "_structs_regex))) {\n" + str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ")) {\n" indent = " " str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + "=\");\n" str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, v." 
+ name + ");\n" @@ -132,7 +132,7 @@ def gen_cppheader(infilepath, outfilepath, rank): '#include \n' + \ '\n' + \ '#include "roctracer.h"\n' - header_s += '#include \n#include \n' + header_s += '#include \n' output_filename_h.write(header_s) output_filename_h.write('\n') @@ -140,7 +140,7 @@ def gen_cppheader(infilepath, outfilepath, rank): output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') - output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \".*\";\n') + output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \"\";\n') output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') output_filename_h.write("// basic ostream ops\n") output_filename_h.write(header_basic) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index c7e063ff..cf555747 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -823,7 +823,6 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); - close_file_handles(); ONLOAD_TRACE_END(); } @@ -1026,10 +1025,6 @@ void tool_load() { void exit_handler(int status, void* arg) { ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); -#if 0 - tool_unload(); - ONLOAD_TRACE_END(); -#endif } // HSA-runtime tool on-load method @@ -1178,6 +1173,8 @@ extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); tool_unload(); roctracer_flush_buf(); + close_file_handles(); + if (hip_api_stats) hip_api_stats->dump(); if (hip_kernel_stats) hip_kernel_stats->dump(); From 89ab109f024c51674a309e94ec95866d1a56b97a Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Wed, 14 Apr 2021 11:17:53 -0400 Subject: [PATCH 30/38] SWDEV-281008 replace hcc_detail by amd_detail Change-Id: I180b18f9e1fae40c923d6210901f06cba14e8f13 --- 
inc/roctracer_hip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 553ec1b8..391831bf 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -39,7 +39,7 @@ inline static std::ostream& operator<<(std::ostream& out, const char& v) { #include #include -#include +#include #include From e723f8ca67ce9eae793ba72478a4d1e20b4bf7d9 Mon Sep 17 00:00:00 2001 From: Icarus Sparry Date: Wed, 7 Jul 2021 16:40:24 +0000 Subject: [PATCH 31/38] Add dependency on rocm-core The intention is to make all rocm-packages depend on a tiny rocm-core package so that all of rocm can be removed by removing rocm-core. Obviously it is less than ideal that you install by using some variant of "apt install rocm" and remove everything by "apt remove rocm-core", but this is easy to document. The alternative "apt autoremove rocm" may remove unrelated packages. Signed-off-by: Icarus Sparry Change-Id: I74351c7be3c2d3dfec577d36ae78222b3fd22ef3 Signed-off-by: Icarus Sparry --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8bff20c..7f678dec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,6 +198,7 @@ else() endif() message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core" ) ## Process the Debian install/remove scripts to update the CPACK variables configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) @@ -222,7 +223,14 @@ if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) endif() set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) +set ( CPACK_RPM_PACKAGE_REQUIRES "rocm-core" ) message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") + +if(NOT ROCM_DEP_ROCMCORE) + 
string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) +endif() + ## Process the Rpm install/remove scripts to update the CPACK variables configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) From f7adedb800491b01137c56c2b428e6811aa71665 Mon Sep 17 00:00:00 2001 From: Christophe Paquot Date: Mon, 21 Jun 2021 10:20:25 -0700 Subject: [PATCH 32/38] SWDEV-281658 - Preserve the callback IDs enum ordering Use HIP_API_ID_NONE to detect unsupported API instead of HIP_API_ID_NUMBER which can grow with a new version of the API. This HIP_API_ID_NONE enum has a fixed value of 0 so the HIP_API_IDs really start at FIRST. Change-Id: I760aa50ddf6fa6d46bf20555ad7d429335a53f97 --- src/core/roctracer.cpp | 70 ++++++++++++++++++++++++++------------- test/tool/tracer_tool.cpp | 18 +++++----- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 272bad26..000ba7e3 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -738,7 +738,7 @@ PUBLIC_API roctracer_status_t roctracer_op_code( } case ACTIVITY_DOMAIN_HIP_API: { *op = hipApiIdByName(str); - if (*op == HIP_API_ID_NUMBER) { + if (*op == HIP_API_ID_NONE) { EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); } if (kind != NULL) *kind = 0; @@ -750,13 +750,29 @@ PUBLIC_API roctracer_status_t roctracer_op_code( API_METHOD_SUFFIX } -static inline uint32_t get_op_num(const uint32_t& domain) { +static inline uint32_t get_op_begin(uint32_t domain) { + switch (domain) { + case ACTIVITY_DOMAIN_HSA_OPS: return 0; + case ACTIVITY_DOMAIN_HSA_API: return 0; + case ACTIVITY_DOMAIN_HSA_EVT: return 0; + case ACTIVITY_DOMAIN_HCC_OPS: return 0; + case ACTIVITY_DOMAIN_HIP_API: 
return HIP_API_ID_FIRST; + case ACTIVITY_DOMAIN_KFD_API: return 0; + case ACTIVITY_DOMAIN_EXT_API: return 0; + case ACTIVITY_DOMAIN_ROCTX: return 0; + default: + EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")"); + } + return 0; +} + +static inline uint32_t get_op_end(uint32_t domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: return HSA_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_API: return HSA_API_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_EVT: return HSA_EVT_ID_NUMBER; case ACTIVITY_DOMAIN_HCC_OPS: return HIP_OP_ID_NUMBER; - case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_NUMBER; + case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_LAST + 1;; case ACTIVITY_DOMAIN_KFD_API: return KFD_API_ID_NUMBER; case ACTIVITY_DOMAIN_EXT_API: return 0; case ACTIVITY_DOMAIN_ROCTX: return ROCTX_API_ID_NUMBER; @@ -850,8 +866,9 @@ PUBLIC_API roctracer_status_t roctracer_enable_domain_callback( void* user_data) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_callback_impl(domain, op, callback, user_data); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_callback_impl(domain, op, callback, user_data); API_METHOD_SUFFIX } @@ -860,9 +877,10 @@ PUBLIC_API roctracer_status_t roctracer_enable_callback( void* user_data) { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_callback_impl(domain, op, callback, user_data); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_callback_impl(domain, op, callback, user_data); } API_METHOD_SUFFIX } @@ -943,17 +961,19 @@ PUBLIC_API roctracer_status_t roctracer_disable_domain_callback( 
roctracer_domain_t domain) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_callback_impl(domain, op); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_callback_impl(domain, op); API_METHOD_SUFFIX } PUBLIC_API roctracer_status_t roctracer_disable_callback() { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_callback_impl(domain, op); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_callback_impl(domain, op); } API_METHOD_SUFFIX } @@ -1082,8 +1102,9 @@ PUBLIC_API roctracer_status_t roctracer_enable_domain_activity_expl( roctracer_pool_t* pool) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_activity_impl(domain, op, pool); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_activity_impl(domain, op, pool); API_METHOD_SUFFIX } @@ -1091,9 +1112,10 @@ PUBLIC_API roctracer_status_t roctracer_enable_activity_expl( roctracer_pool_t* pool) { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_activity_impl(domain, op, pool); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_activity_impl(domain, op, pool); } API_METHOD_SUFFIX } @@ -1165,17 +1187,19 @@ PUBLIC_API roctracer_status_t 
roctracer_disable_domain_activity( roctracer_domain_t domain) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_activity_impl(domain, op); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_activity_impl(domain, op); API_METHOD_SUFFIX } PUBLIC_API roctracer_status_t roctracer_disable_activity() { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_activity_impl(domain, op); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_activity_impl(domain, op); } API_METHOD_SUFFIX } diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 42cb6ce4..608b40cf 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1094,7 +1094,7 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); if (hip_api_vec.size() != 0) { for (unsigned i = 0; i < hip_api_vec.size(); ++i) { - uint32_t cid = HIP_API_ID_NUMBER; + uint32_t cid = HIP_API_ID_NONE; const char* api = hip_api_vec[i].c_str(); ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL)); ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); @@ -1105,13 +1105,13 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, } if (is_stats_opt) { - const char* path = NULL; - FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); + const char* path = NULL; + FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); hip_api_stats = new EvtStats(f, 
path); - for (uint32_t id = 0; id < HIP_API_ID_NUMBER; id += 1) { + for (uint32_t id = HIP_API_ID_FIRST; id <= HIP_API_ID_LAST; id += 1) { const char* label = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, id, 0); hip_api_stats->set_label(id, label); - } + } } } @@ -1120,11 +1120,11 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); if (is_stats_opt) { - FILE* f = NULL; - const char* path = NULL; - f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); + FILE* f = NULL; + const char* path = NULL; + f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); hip_kernel_stats = new EvtStatsA(f, path); - f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); + f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); hip_memcpy_stats = new EvtStatsA(f, path); } } From f32619b8f109d48281bc1d684f93951e0589bd2e Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 27 Oct 2021 19:08:41 -0600 Subject: [PATCH 33/38] updating known issues --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index e700ee40..5dce1e8d 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,8 @@ rocTX API: or make package && dpkg -i *.deb ``` + +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- OpenMP applications are not fully supported by the roctracer. 
From 38a1972edde07cc1be5e2cc37ea124287aaa46d3 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 21 Jan 2022 12:10:53 -0600 Subject: [PATCH 34/38] SWDEV-318551: Adding License file for tracer Making the new License file, Adding support in the CMakeLists.txt Change-Id: I43862b8b7f3025ae6200aeb442ea70c7993a7349 --- CMakeLists.txt | 5 +++++ LICENSE | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 128448c8..76be91d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,6 +184,9 @@ set ( CPACK_PACKAGE_CONTACT "ROCm Profiler Support Date: Wed, 9 Feb 2022 14:21:12 -0600 Subject: [PATCH 35/38] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index e700ee40..28838c19 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,10 @@ rocTX API: - To build roctracer library: export CMAKE_BUILD_TYPE= # release by default + cd /roctracer && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. 
&& make -j + or + ./build.sh - To build and run test: make mytest From a19a826cd1c4b7c374cd7d6919835a54de47dd44 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Mon, 14 Feb 2022 12:40:18 -0800 Subject: [PATCH 36/38] SWDEV-291455: Prefer rocm include path to hip include path Change-Id: I1fa96e72169fac689a3a2ed38e988d7f5d18bf04 (cherry picked from commit ebda880c4a11ef986134e46591120b1a695ad254) --- src/CMakeLists.txt | 2 +- test/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5bf02101..82250d87 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,7 +24,7 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${ROCM_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) # Build ROCTX tracing library diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 55a7edea..a14ee2ac 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -85,7 +85,7 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) + target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} 
${HSA_RUNTIME_LIB} c stdc++ dl pthread rt numa ) install ( TARGETS ${TEST_LIB} LIBRARY DESTINATION ${DEST_NAME}/tool ) endif () From 75f74bb3bf0d40db2acc054540da7babddcc4642 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Tue, 29 Mar 2022 22:27:25 +0000 Subject: [PATCH 37/38] SWDEV-307394: Fixing Download HSA-Class Issue Exchanging the git clone of the hsa-class to a local downloaded version pushed to the roctracer repo Change-Id: Id45a38b2d355102c2e0dee1e4bfde50398369047 (cherry picked from commit 7ee4f87b73c13cf8404ac11c7628661a01ef31e6) --- .gitignore | 1 - test/CMakeLists.txt | 3 - test/hsa/LICENSE | 20 + test/hsa/README.md | 4 + test/hsa/script/build_kernel.sh | 80 ++ test/hsa/src/hsa_rsrc_factory.cpp | 761 ++++++++++++++++++ test/hsa/src/hsa_rsrc_factory.h | 516 ++++++++++++ test/hsa/test/CMakeLists.txt | 64 ++ test/hsa/test/app/test.cpp | 86 ++ test/hsa/test/ctrl/run_kernel.h | 90 +++ test/hsa/test/ctrl/test_aql.h | 77 ++ test/hsa/test/ctrl/test_hsa.cpp | 279 +++++++ test/hsa/test/ctrl/test_hsa.h | 129 +++ test/hsa/test/ctrl/test_kernel.h | 138 ++++ test/hsa/test/dummy_kernel/dummy_kernel.cl | 28 + test/hsa/test/dummy_kernel/dummy_kernel.h | 71 ++ test/hsa/test/run.sh | 45 ++ .../simple_convolution/simple_convolution.cl | 76 ++ .../simple_convolution/simple_convolution.cpp | 388 +++++++++ .../simple_convolution/simple_convolution.h | 94 +++ test/hsa/test/util/evt_stats.h | 98 +++ test/hsa/test/util/helper_funcs.h | 86 ++ test/hsa/test/util/hsa_rsrc_factory.cpp | 1 + test/hsa/test/util/hsa_rsrc_factory.h | 1 + test/hsa/test/util/perf_timer.cpp | 179 ++++ test/hsa/test/util/perf_timer.h | 83 ++ test/hsa/test/util/test_assert.h | 35 + test/hsa/test/util/xml.h | 457 +++++++++++ 28 files changed, 3886 insertions(+), 4 deletions(-) create mode 100644 test/hsa/LICENSE create mode 100644 test/hsa/README.md create mode 100755 test/hsa/script/build_kernel.sh create mode 100644 test/hsa/src/hsa_rsrc_factory.cpp create mode 100644 test/hsa/src/hsa_rsrc_factory.h 
create mode 100644 test/hsa/test/CMakeLists.txt create mode 100644 test/hsa/test/app/test.cpp create mode 100644 test/hsa/test/ctrl/run_kernel.h create mode 100644 test/hsa/test/ctrl/test_aql.h create mode 100644 test/hsa/test/ctrl/test_hsa.cpp create mode 100644 test/hsa/test/ctrl/test_hsa.h create mode 100644 test/hsa/test/ctrl/test_kernel.h create mode 100644 test/hsa/test/dummy_kernel/dummy_kernel.cl create mode 100644 test/hsa/test/dummy_kernel/dummy_kernel.h create mode 100755 test/hsa/test/run.sh create mode 100644 test/hsa/test/simple_convolution/simple_convolution.cl create mode 100644 test/hsa/test/simple_convolution/simple_convolution.cpp create mode 100644 test/hsa/test/simple_convolution/simple_convolution.h create mode 100644 test/hsa/test/util/evt_stats.h create mode 100644 test/hsa/test/util/helper_funcs.h create mode 120000 test/hsa/test/util/hsa_rsrc_factory.cpp create mode 120000 test/hsa/test/util/hsa_rsrc_factory.h create mode 100644 test/hsa/test/util/perf_timer.cpp create mode 100644 test/hsa/test/util/perf_timer.h create mode 100644 test/hsa/test/util/test_assert.h create mode 100644 test/hsa/test/util/xml.h diff --git a/.gitignore b/.gitignore index 331d63fd..ef6bb1de 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,5 @@ *.swp *.Po build -test/hsa test/MatrixTranspose/MatrixTranspose test/MatrixTranspose_test/MatrixTranspose diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a14ee2ac..17a54c80 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,6 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "f8b3870" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) @@ -70,8 +69,6 @@ add_custom_target( mytest ) ## Build HSA test -execute_process ( COMMAND sh -xc "if [ ! 
-e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) -execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) set ( TMP ${TEST_DIR} ) set ( TEST_DIR ${HSA_TEST_DIR} ) add_subdirectory ( ${HSA_TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) diff --git a/test/hsa/LICENSE b/test/hsa/LICENSE new file mode 100644 index 00000000..597d1b16 --- /dev/null +++ b/test/hsa/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +[MITx11 license] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/test/hsa/README.md b/test/hsa/README.md new file mode 100644 index 00000000..20e09157 --- /dev/null +++ b/test/hsa/README.md @@ -0,0 +1,4 @@ +# HSA-class +``` +HSA high level C++ API +``` diff --git a/test/hsa/script/build_kernel.sh b/test/hsa/script/build_kernel.sh new file mode 100755 index 00000000..2007e7ab --- /dev/null +++ b/test/hsa/script/build_kernel.sh @@ -0,0 +1,80 @@ +#!/bin/sh -x +SO_EXT="hsaco" + +TEST_NAME=$1 +DST_DIR=$2 +ROCM_DIR=$3 +TGT_LIST=$4 + +if [ -z "$TEST_NAME" ] ; then + echo "Usage: $0 " + echo " Will look for .cl and will build .$SO_EXT dynamic code object library" + exit 1 +fi +OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') + +if [ -z "$DST_DIR" ] ; then + DST_DIR=$(dirname TEST_NAME) +fi + +if [ -z "$ROCM_DIR" ] ; then + ROCM_DIR=/opt/rocm +fi + +if [ -z "$TGT_LIST" ] ; then + TGT_LIST=`$ROCM_DIR/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p"` +fi + +if [ -z "$TGT_LIST" ] ; then + echo "Error: GPU targets not found" + exit 1 +fi + +OCL_VER="2.0" + +if [ -e $ROCM_DIR/llvm ] ; then + LLVM_DIR=$ROCM_DIR/llvm + LIB_DIR=$ROCM_DIR/lib +else + LLVM_DIR=$ROCM_DIR/hcc + LIB_DIR=$LLVM_DIR/lib +fi + +# Determine whether using new or old device-libs layout +if [ -e $LIB_DIR/bitcode/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR/bitcode +elif [ -e $LIB_DIR/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR +elif [ -e $ROCM_DIR/amdgcn/bitcode/opencl.bc ]; then + BC_DIR=$ROCM_DIR/amdgcn/bitcode +else + echo "Error: Cannot find amdgcn bitcode directory" + exit 1 +fi + +CLANG_ROOT=$LLVM_DIR/lib/clang +CLANG_DIR=`ls -d $CLANG_ROOT/* | head -n 1` +if [ "$CLANG_DIR" = "" ] ; then + echo "Error: LLVM clang library was not found" + exit 1 +fi + +BIN_DIR=$LLVM_DIR/bin +INC_DIR=$CLANG_DIR/include +if [ -e $BC_DIR/opencl.amdgcn.bc ]; then + BITCODE_OPTS="-nogpulib \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang 
$BC_DIR/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" +else + BITCODE_OPTS="--hip-device-lib-path=$BC_DIR" +fi + +for GFXIP in $TGT_LIST ; do + OBJ_PREF=$GFXIP + OBJ_FILE="${OBJ_PREF}_${OBJ_NAME}.$SO_EXT" + $BIN_DIR/clang -cl-std=CL$OCL_VER -include $INC_DIR/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE + echo "'$OBJ_FILE' generated" +done + +exit 0 diff --git a/test/hsa/src/hsa_rsrc_factory.cpp b/test/hsa/src/hsa_rsrc_factory.cpp new file mode 100644 index 00000000..d2d8e79e --- /dev/null +++ b/test/hsa/src/hsa_rsrc_factory.cpp @@ -0,0 +1,761 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include "util/hsa_rsrc_factory.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// Callback function to get available in the system agents +hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + hsa_status_t status = HSA_STATUS_ERROR; + HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); + const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); + if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + return status; +} + +// This function checks to see if the provided +// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, +// the function adds an additional requirement that the pool have the +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false, +// pools must NOT have this property. +// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is +// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but +// no pool was found meeting the requirements. If an error is encountered, we +// return that error. 
+static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) { + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + if (nullptr == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + if (HSA_AMD_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + + uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + + if ((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) { + return HSA_STATUS_SUCCESS; + } + + *(reinterpret_cast(data)) = pool; + return HSA_STATUS_INFO_BREAK; +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, false); +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, true); +} + +// Constructor of the class +HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { + hsa_status_t status; + + cpu_pool_ = NULL; + kern_arg_pool_ = NULL; + + InitHsaApiTable(NULL); + + // Initialize the Hsa Runtime + if (initialize_hsa_) { + status = hsa_api_.hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + } + + // Discover the set of Gpu devices available on the platform + status = 
hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); + CHECK_STATUS("Error Calling hsa_iterate_agents", status); + if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); + + // Get AqlProfile API table + aqlprofile_api_ = {0}; +#ifdef ROCP_LD_AQLPROFILE + status = LoadAqlProfileLib(&aqlprofile_api_); +#else + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); +#endif + CHECK_STATUS("aqlprofile API table load failed", status); + + // Get Loader API table + loader_api_ = {0}; + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); + CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = new HsaTimer(&hsa_api_); + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // Time correlation + const uint32_t corr_iters = 1000; + CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters); + CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? 
timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); +} + +// Destructor of the class +HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; + for (auto p : cpu_list_) delete p; + for (auto p : gpu_list_) delete p; + if (initialize_hsa_) { + hsa_status_t status = hsa_api_.hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); + } +} + +void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { + std::lock_guard lck(mutex_); + + if (hsa_api_.hsa_init == NULL) { + if (table != NULL) { + hsa_api_.hsa_init = table->core_->hsa_init_fn; + hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; + hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; + hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; + + hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; + hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; + hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; + hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; + hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; + hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; + + hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; + hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; + hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; + hsa_api_.hsa_executable_freeze = 
table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; + hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; + + hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; + hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; + hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; + hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; + hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; + + hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; + hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; + hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; + } else { + hsa_api_.hsa_init = hsa_init; + hsa_api_.hsa_shut_down = hsa_shut_down; + hsa_api_.hsa_agent_get_info = hsa_agent_get_info; + hsa_api_.hsa_iterate_agents = hsa_iterate_agents; + + hsa_api_.hsa_queue_create = hsa_queue_create; + hsa_api_.hsa_queue_destroy = hsa_queue_destroy; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; + hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + + hsa_api_.hsa_signal_create = hsa_signal_create; + hsa_api_.hsa_signal_destroy = 
hsa_signal_destroy; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; + hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; + hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; + + hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; + hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; + hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; + hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; + hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; + + hsa_api_.hsa_system_get_info = hsa_system_get_info; + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; + + hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; + hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; + hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; + } + } +} + +hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { + void* handle = dlopen(kAqlProfileLib, RTLD_NOW); + if (handle == NULL) { + fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); + return HSA_STATUS_ERROR; + } + dlerror(); /* Clear any existing error */ + + 
api->hsa_ven_amd_aqlprofile_error_string = + (decltype(::hsa_ven_amd_aqlprofile_error_string)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_error_string"); + api->hsa_ven_amd_aqlprofile_validate_event = + (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_validate_event"); + api->hsa_ven_amd_aqlprofile_start = + (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); + api->hsa_ven_amd_aqlprofile_stop = + (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); +#ifdef AQLPROF_NEW_API + api->hsa_ven_amd_aqlprofile_read = + (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); +#endif + api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = + (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + api->hsa_ven_amd_aqlprofile_get_info = (decltype(::hsa_ven_amd_aqlprofile_get_info)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_get_info"); + api->hsa_ven_amd_aqlprofile_iterate_data = + (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_iterate_data"); + + return HSA_STATUS_SUCCESS; +} + +// Add system agent info +const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { + // Determine if device is a Gpu agent + hsa_status_t status; + AgentInfo* agent_info = NULL; + + hsa_device_type_t type; + status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + CHECK_STATUS("Error Calling hsa_agent_get_info", status); + + if (type == HSA_DEVICE_TYPE_CPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + agent_info->dev_index = cpu_list_.size(); + + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; + status = 
hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; + agent_info->gpu_pool = {}; + + cpu_list_.push_back(agent_info); + cpu_agents_.push_back(agent); + } + + if (type == HSA_DEVICE_TYPE_GPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + strncpy(agent_info->gfxip, agent_info->name, 4); + agent_info->gfxip[4] = '\0'; + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false; + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + &agent_info->cu_num); + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + &agent_info->waves_per_cu); + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + &agent_info->simds_per_cu); + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + &agent_info->se_num); + hsa_api_.hsa_agent_get_info(agent, + static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), + &agent_info->shader_arrays_per_se); + + agent_info->cpu_pool = {}; + agent_info->kern_arg_pool = {}; + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + + // GFX8 and GFX9 SGPR/VGPR block sizes + agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 
1 : 2; + agent_info->sgpr_block_size = 8; + agent_info->vgpr_block_size = 4; + + // Set GPU index + agent_info->dev_index = gpu_list_.size(); + gpu_list_.push_back(agent_info); + gpu_agents_.push_back(agent); + } + + if (agent_info) agent_map_[agent.handle] = agent_info; + + return agent_info; +} + +// Return systen agent info +const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { + const AgentInfo* agent_info = NULL; + auto it = agent_map_.find(agent.handle); + if (it != agent_map_.end()) { + agent_info = it->second; + } + return agent_info; +} + +// Get the count of Hsa Gpu Agents available on the platform +// +// @return uint32_t Number of Gpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } + +// Get the count of Hsa Cpu Agents available on the platform +// +// @return uint32_t Number of Cpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } + +// Get the AgentInfo handle of a Gpu device +// +// @param idx Gpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(gpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = gpu_list_[idx]; + + return true; +} + +// Get the AgentInfo handle of a Cpu device +// +// @param idx Cpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(cpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + 
*agent_info = cpu_list_[idx]; + return true; +} + +// Create a Queue object and return its handle. The queue object is expected +// to support user requested number of Aql dispatch packets. +// +// @param agent_info Gpu Agent on which to create a queue object +// +// @param num_Pkts Number of packets to be held by queue +// +// @param queue Output parameter updated with handle of queue object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, + hsa_queue_t** queue) { + hsa_status_t status; + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, queue); + return (status == HSA_STATUS_SUCCESS); +} + +// Create a Signal object and return its handle. +// @param value Initial value of signal object +// @param signal Output parameter updated with handle of signal object +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { + hsa_status_t status; + status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); + return (status == HSA_STATUS_SUCCESS); +} + +// Allocate memory for use by a kernel of specified size in specified +// agent's memory region. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory to pass kernel parameters. 
+// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + if (!cpu_agents_.empty()) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the kernel arguments + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate system memory accessible by both CPU and GPU +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (!cpu_agents_.empty()) { + status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the memory + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory for command buffer. 
+// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) + ? reinterpret_cast( + mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0, 0)) + : AllocateSysMemory(agent_info, size); + return ptr; +} + +// Wait signal +hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + const hsa_signal_value_t exp_value = signal_value - 1; + hsa_signal_value_t ret_value = signal_value; + while (1) { + ret_value = + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); + if (ret_value == exp_value) break; + if (ret_value != signal_value) { + std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value + << "), ret_value(" << ret_value << ")" << std::endl << std::flush; + abort(); + } + } + return ret_value; +} + +// Wait signal with signal value restore +void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + SignalWait(signal, signal_value); + hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); +} + +// Copy data from GPU to host memory +bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + if (!cpu_agents_.empty()) { + hsa_signal_t s = {}; + status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); + CHECK_STATUS("hsa_signal_create()", status); + status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + CHECK_STATUS("hsa_amd_memory_async_copy()", status); + SignalWait(s, 1); + status = 
hsa_api_.hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); + } + return (status == HSA_STATUS_SUCCESS); +} +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { + return Memcpy(agent_info->dev_id, dst, src, size); +} + +// Memory free method +bool HsaRsrcFactory::FreeMemory(void* ptr) { + const hsa_status_t status = hsa_memory_free(ptr); + CHECK_STATUS("hsa_memory_free", status); + return (status == HSA_STATUS_SUCCESS); +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// @param agent_info Gpu device for which to finalize +// @param brig_path File path of the Assembled Brig file +// @param kernel_name Name of the kernel to finalize +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* executable, + hsa_executable_symbol_t* code_desc) { + hsa_status_t status = HSA_STATUS_ERROR; + + // Build the code object filename + std::string filename(brig_path); + std::clog << "Code object filename: " << filename << std::endl; + + // Open the file containing code object + hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); + if (file_handle == -1) { + std::cerr << "Error: failed to load '" << filename << "'" << std::endl; + assert(false); + return false; + } + + // Create code object reader + hsa_code_object_reader_t code_obj_rdr = {0}; + status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + if (status != HSA_STATUS_SUCCESS) { + std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; + return false; + } + + // Create executable. 
+ status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + NULL, executable); + CHECK_STATUS("Error in creating executable object", status); + + // Load code object. + status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + NULL, NULL); + CHECK_STATUS("Error in loading executable object", status); + + // Freeze executable. + status = hsa_api_.hsa_executable_freeze(*executable, ""); + CHECK_STATUS("Error in freezing executable object", status); + + // Get symbol handle. + hsa_executable_symbol_t kernelSymbol; + status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + CHECK_STATUS("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << std::flush; + std::clog << header << " :" << std::endl; + + const AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::clog << "> agent[" << idx << "] :" << std::endl; + std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> APU : " << agent_info->is_apu << std::endl; + std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; + std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::clog << ">> CU number : " << agent_info->cu_num << std::endl; + std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; + std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; + std::clog << ">> SE number : " << agent_info->se_num << std::endl; + std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << 
std::endl; + } + return true; +} + +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; + + // adevance command queue + const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); + hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + sched_yield(); + } + + uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); + const uint32_t* slot_data = reinterpret_cast(packet); + + // Copy buffered commands into the queue slot. + // Overwrite the AQL invalid header (first dword) last. + // This prevents the slot from being read until it's fully written. + memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&queue_slot[0]); + header_atomic_ptr->store(slot_data[0], std::memory_order_release); + + // ringdoor bell + hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + + return write_idx; +} + +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; + if ((size_bytes & (slot_size_b - 1)) != 0) { + fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); + abort(); + } + + const char* begin = reinterpret_cast(packet); + const char* end = begin + size_bytes; + uint64_t write_idx = 0; + for (const char* ptr = begin; ptr < end; ptr += slot_size_b) { + write_idx = Submit(queue, ptr); + } + + return write_idx; +} + +const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { + std::lock_guard lck(mutex_); + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + abort(); + } + return 
strdup(it->second); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; +} + +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { + hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; + hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); + CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { + uint64_t addr = 0; + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); + CHECK_STATUS("Error in getting kernel object", status); + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char *name = new char[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + CHECK_STATUS("Error in getting kernel name", status); + name[len] = 0; + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + CHECK_STATUS("Error in iterating executable symbols", status); + return hsa_api_.hsa_executable_freeze(executable, options);; +} + +void HsaRsrcFactory::DumpHandles(FILE* file) { + auto beg = agent_map_.begin(); + auto end = agent_map_.end(); + for (auto it = beg; it != end; ++it) { + const AgentInfo* 
agent_info = it->second; + fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu"); + if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle); + if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle); + if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle); + } + fflush(file); +} + +std::atomic HsaRsrcFactory::instance_{}; +HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; +hsa_pfn_t HsaRsrcFactory::hsa_api_{}; +bool HsaRsrcFactory::executable_tracking_on_ = false; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; diff --git a/test/hsa/src/hsa_rsrc_factory.h b/test/hsa/src/hsa_rsrc_factory.h new file mode 100644 index 00000000..8383aa66 --- /dev/null +++ b/test/hsa/src/hsa_rsrc_factory.h @@ -0,0 +1,516 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#ifndef _HSA_RSRC_FACTORY_H_ +#define _HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + abort(); \ + } \ +} while (0) + +#define CHECK_ITER_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + abort(); \ + } \ +} while (0) + +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; + +struct hsa_pfn_t { + decltype(hsa_init)* hsa_init; + decltype(hsa_shut_down)* hsa_shut_down; + decltype(hsa_agent_get_info)* hsa_agent_get_info; + decltype(hsa_iterate_agents)* hsa_iterate_agents; + + decltype(hsa_queue_create)* hsa_queue_create; + decltype(hsa_queue_destroy)* hsa_queue_destroy; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + + decltype(hsa_signal_create)* hsa_signal_create; + decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; + + decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt; + decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; + decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; + + decltype(hsa_system_get_info)* hsa_system_get_info; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* 
hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; + + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; + decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; +}; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +struct AgentInfo { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // APU flag + bool is_apu; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; + + // The number of compute unit available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. 
+ uint32_t waves_per_cu; + + // Number of SIMD's per compute unit CU + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays Per Shader Engines in Gpu + uint32_t shader_arrays_per_se; + + // SGPR/VGPR block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; +}; + +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + enum time_id_t { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_MONOTONIC = 1, + TIME_ID_NUMBER + }; + + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methods for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { + return timestamp_t((freq_t)sysclock * sysclock_factor_); + } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { + return timestamp_t((freq_t)time / sysclock_factor_); + } + + // Method for timespec/ns conversion + static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + // Return time in 'ns' + static timestamp_t clocktime_ns(clockid_t clock_id) { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // 
Return pair of correlated values of profiling timestamp and time with
+  // correlation error for a given time ID and number of iterations
+  // @param time_id Clock to correlate against (realtime or monotonic)
+  // @param iters Number of sample pairs to take; zero is treated as one
+  // @param timestamp_v Output, mean HSA timestamp in 'ns' advanced by the error
+  // @param time_v Output, mean clock time in 'ns'
+  // @param error_v Output, half of the mean distance between samples in 'ns'
+  void correlated_pair_ns(time_id_t time_id, uint32_t iters,
+                          timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) const {
+    // The clock is selected by the requested time ID, not by the local being set
+    clockid_t clock_id = 0;
+    switch (time_id) {
+      case TIME_ID_CLOCK_REALTIME:
+        clock_id = CLOCK_REALTIME;
+        break;
+      case TIME_ID_CLOCK_MONOTONIC:
+        clock_id = CLOCK_MONOTONIC;
+        break;
+      default:
+        CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR);
+    }
+
+    // At least one sample is needed for front()/back() and the averages below
+    const uint32_t samples = (iters != 0) ? iters : 1;
+    std::vector<timestamp_t> ts_vec(samples);
+    std::vector<timespec> tm_vec(samples);
+    const uint32_t steps = samples - 1;
+
+    for (uint32_t i = 0; i < samples; ++i) {
+      hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]);
+      clock_gettime(clock_id, &tm_vec[i]);
+    }
+
+    const timestamp_t ts_base = sysclock_to_ns(ts_vec.front());
+    const timestamp_t tm_base = timespec_to_ns(tm_vec.front());
+    // A single sample has no spread, so the error is zero instead of x / 0
+    const timestamp_t error =
+        (steps != 0) ? sysclock_to_ns(ts_vec.back() - ts_vec.front()) / (2 * steps) : 0;
+
+    timestamp_t ts_accum = 0;
+    timestamp_t tm_accum = 0;
+    for (uint32_t i = 0; i < samples; ++i) {
+      // Raw samples are sysclock ticks, convert them before mixing with the 'ns' base
+      ts_accum += (sysclock_to_ns(ts_vec[i]) - ts_base);
+      tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base);
+    }
+
+    *timestamp_v = (ts_accum / samples) + ts_base + error;
+    *time_v = (tm_accum / samples) + tm_base;
+    *error_v = error;
+  }
+
+ private:
+  // Timestamp frequency factor
+  freq_t sysclock_factor_;
+  // HSA API table
+  const hsa_pfn_t* const hsa_api_;
+};
+
+class HsaRsrcFactory {
+ public:
+  static const size_t CMD_SLOT_SIZE_B = 0x40;
+  typedef std::recursive_mutex mutex_t;
+  typedef HsaTimer::timestamp_t timestamp_t;
+
+  // Return the factory instance, creating it under the class mutex on first use
+  // @param initialize_hsa Forwarded to the constructor when the instance is created
+  static HsaRsrcFactory* Create(bool initialize_hsa = true) {
+    std::lock_guard<mutex_t> lck(mutex_);
+    HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed);
+    if (obj == NULL) {
+      obj = new HsaRsrcFactory(initialize_hsa);
+      instance_.store(obj, std::memory_order_release);
+    }
+    return obj;
+  }
+
+  static HsaRsrcFactory& Instance() {
+    HsaRsrcFactory* obj = 
instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(false); + hsa_status_t status = (obj != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); + return *obj; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_.load(); + instance_ = NULL; + } + + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); + + // Get the count of Hsa Gpu Agents available on the platform + // @return uint32_t Number of Gpu agents on platform + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // @return uint32_t Number of Cpu agents on platform + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // @param idx Gpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // @param idx Cpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // @param agent_info Gpu Agent on which to create a queue object + // @param num_Pkts Number of packets to be held by queue + // @param queue Output parameter updated with handle of queue object + // @return bool true if successful, false otherwise + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. 
+ // @param value Initial value of signal object + // @param signal Output parameter updated with handle of signal object + // @return bool true if successful, false otherwise + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate local GPU memory + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters + // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + + // Allocate system memory accessible from both CPU and GPU + // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory for command buffer. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + + // Wait signal + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); + + // Memory free method + static bool FreeMemory(void* ptr); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // @param agent_info Gpu device for which to finalize + // @param brig_path File path of the Assembled Brig file + // @param kernel_name Name of the kernel to finalize + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // @return true if successful, false otherwise + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + // Submit AQL packet to given queue + static uint64_t Submit(hsa_queue_t* queue, const void* packet); + static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + + // Enable executables loading tracking + static bool IsExecutableTracking() { return executable_tracking_on_; } + static void EnableExecutableTracking(HsaApiTable* table); + static const char* GetKernelName(uint64_t addr); + + // Initialize HSA API table + void static InitHsaApiTable(HsaApiTable* table); + static const hsa_pfn_t* HsaApi() { return &hsa_api_; } + + // Return AqlProfile API table + typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; + const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; 
}
+
+  // Return Loader API table
+  const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
+
+  // Methods for system-clock/ns conversion and timestamp in 'ns'
+  timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); }
+  timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); }
+  timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }
+
+  // Timeout accessors: the per-instance value is in sysclock units, the static one in 'ns'
+  timestamp_t GetSysTimeout() const { return timeout_; }
+  static timestamp_t GetTimeoutNs() { return timeout_ns_; }
+  // Store the timeout in 'ns' and, if an instance exists, refresh its sysclock value
+  static void SetTimeoutNs(const timestamp_t& time) {
+    std::lock_guard<mutex_t> lck(mutex_);
+    timeout_ns_ = time;
+    if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time);
+  }
+
+  // Sample the HSA timestamp against the given clock and record the shift
+  // and error that GetTime/GetTimestamp use for conversion
+  // @param time_id Clock to correlate with; out-of-range IDs are ignored
+  // @param iters Number of sample pairs passed to the timer
+  void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) {
+    // Same range check as GetTime/GetTimestamp, the ID indexes fixed-size arrays
+    if (time_id >= HsaTimer::TIME_ID_NUMBER) return;
+    timestamp_t timestamp_v = 0;
+    timestamp_t time_v = 0;
+    timestamp_t error_v = 0;
+    timer_->correlated_pair_ns(time_id, iters, &timestamp_v, &time_v, &error_v);
+    time_shift_[time_id] = time_v - timestamp_v;
+    time_error_[time_id] = error_v;
+  }
+
+  // Convert a profiling timestamp to time of the given clock
+  // @return HSA_STATUS_ERROR if time_id is out of range
+  hsa_status_t GetTime(uint32_t time_id, timestamp_t value, uint64_t* time) {
+    if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
+    *time = value + time_shift_[time_id];
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Convert time of the given clock back to a profiling timestamp
+  // @return HSA_STATUS_ERROR if time_id is out of range
+  hsa_status_t GetTimestamp(uint32_t time_id, uint64_t value, timestamp_t* timestamp) {
+    if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
+    *timestamp = value - time_shift_[time_id];
+    return HSA_STATUS_SUCCESS;
+  }
+
+  void DumpHandles(FILE* output_file);
+
+ private:
+  // System agents iterating callback
+  static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
+
+  // Callback function to find and bind kernarg region of an agent
+  static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
+
+  // Load AQL profile HSA extension library directly
+  static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
+
+  // Constructor 
of the class. Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + explicit HsaRsrcFactory(bool initialize_hsa); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + + // To mmap command buffer memory + static const bool CMD_MEMORY_MMAP = false; + + // HSA was initialized + const bool initialize_hsa_; + + static std::atomic instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + std::vector gpu_agents_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + std::vector cpu_agents_; + + // System agents map + std::map agent_map_; + + // Executables loading tracking + typedef std::map symbols_map_t; + static symbols_map_t* symbols_map_; + static bool executable_tracking_on_; + static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); + + // HSA runtime API table + static hsa_pfn_t hsa_api_; + + // AqlProfile API table + aqlprofile_pfn_t aqlprofile_api_; + + // Loader API table + hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock + timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; + + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + + // CPU/kern-arg memory pools + hsa_amd_memory_pool_t *cpu_pool_; + hsa_amd_memory_pool_t *kern_arg_pool_; +}; + +#endif // _HSA_RSRC_FACTORY_H_ diff --git a/test/hsa/test/CMakeLists.txt b/test/hsa/test/CMakeLists.txt new file mode 100644 index 00000000..77727b23 --- /dev/null +++ b/test/hsa/test/CMakeLists.txt @@ -0,0 +1,64 @@ 
+################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +cmake_minimum_required ( VERSION 2.8.12 ) +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +set ( EXE_NAME "ctrl" ) + +if ( NOT DEFINED TEST_DIR ) + set ( TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) + project ( ${EXE_NAME} ) + ## Set build environment + include ( env ) +endif () + +if ( NOT DEFINED ROCM_ROOT_DIR ) + set ( ROCM_ROOT_DIR "" ) +endif () +if ( NOT DEFINED GPU_TARGETS ) + set ( GPU_TARGETS "" ) +endif () + +## Util sources +file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) + +## Test control sources +set ( CTRL_SRC + ${TEST_DIR}/app/test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + +## Dummy kernel +set ( DUMMY_NAME dummy_kernel ) +execute_process ( COMMAND sh -xc "${TEST_DIR}/../script/build_kernel.sh '${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME}' '${PROJECT_BINARY_DIR}' '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" ) + +## Test kernel +set ( TEST_NAME simple_convolution ) +set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) +execute_process ( COMMAND sh -xc "${TEST_DIR}/../script/build_kernel.sh '${TEST_DIR}/${TEST_NAME}/${TEST_NAME}' '${PROJECT_BINARY_DIR}' '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" ) + +## Building ctrl test executable +add_executable ( ${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) diff --git a/test/hsa/test/app/test.cpp b/test/hsa/test/app/test.cpp new file mode 100644 index 00000000..23d39273 --- /dev/null +++ b/test/hsa/test/app/test.cpp @@ -0,0 +1,86 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+// NOTE(review): the five system header names below were lost from this patch - restore before building
+#include
+#include
+#include
+#include
+#include
+
+#include "ctrl/run_kernel.h"
+#include "ctrl/test_aql.h"
+#include "dummy_kernel/dummy_kernel.h"
+#include "simple_convolution/simple_convolution.h"
+
+// Per-thread test body: creates one queue per requested agent slot, runs the
+// kernels 'kiter' times on each slot and destroys the queues
+// @param kiter Number of outer iterations over all agent slots
+// @param diter Dispatch count passed to every RunKernel call
+// @param agents_number Number of agent slots, mapped round-robin onto the GPU agents
+void thread_fun(const int kiter, const int diter, const uint32_t agents_number) {
+  HsaRsrcFactory* rsrc = &HsaRsrcFactory::Instance();
+
+  // The round-robin mapping below divides by the GPU agent count
+  const uint32_t gpu_count = rsrc->GetCountOfGpuAgents();
+  if (gpu_count == 0) {
+    fprintf(stderr, "No GPU agents found\n");
+    abort();
+  }
+
+  // Runtime-sized storage; variable-length arrays are not standard C++
+  std::vector<const AgentInfo*> agent_info(agents_number, NULL);
+  std::vector<hsa_queue_t*> queue(agents_number, NULL);
+
+  for (uint32_t n = 0; n < agents_number; ++n) {
+    uint32_t agent_id = n % gpu_count;
+    if (rsrc->GetGpuAgentInfo(agent_id, &agent_info[n]) == false) {
+      fprintf(stderr, "AgentInfo failed\n");
+      abort();
+    }
+    if (rsrc->CreateQueue(agent_info[n], 128, &queue[n]) == false) {
+      fprintf(stderr, "CreateQueue failed\n");
+      abort();
+    }
+  }
+
+  for (int i = 0; i < kiter; ++i) {
+    for (uint32_t n = 0; n < agents_number; ++n) {
+      // NOTE(review): RunKernel uses Kernel/Test types in its body, so explicit
+      // template arguments appear to be missing from these two calls - confirm
+      RunKernel(0, NULL, agent_info[n], queue[n], diter);
+      RunKernel(0, NULL, agent_info[n], queue[n], diter);
+    }
+  }
+
+  for (uint32_t n = 0; n < agents_number; ++n) {
+    hsa_queue_destroy(queue[n]);
+  }
+}
+
+int main(int argc, char** argv) {
+  const char* kiter_s = getenv("ROCP_KITER");
+  const char* diter_s = getenv("ROCP_DITER");
+  const char* agents_s = getenv("ROCP_AGENTS");
+  const char* thrs_s = getenv("ROCP_THRS");
+
+  const int kiter = (kiter_s != NULL) ? atol(kiter_s) : 1;
+  const int diter = (diter_s != NULL) ? atol(diter_s) : 1;
+  const uint32_t agents_number = (agents_s != NULL) ? (uint32_t)atol(agents_s) : 1;
+  const int thrs = (thrs_s != NULL) ? 
atol(thrs_s) : 1; + + TestHsa::HsaInstantiate(); + + std::vector t(thrs); + for (int n = 0; n < thrs; ++n) { + t[n] = std::thread(thread_fun, kiter, diter, agents_number); + } + for (int n = 0; n < thrs; ++n) { + t[n].join(); + } + + TestHsa::HsaShutdown(); + return 0; +} diff --git a/test/hsa/test/ctrl/run_kernel.h b/test/hsa/test/ctrl/run_kernel.h new file mode 100644 index 00000000..846e0b68 --- /dev/null +++ b/test/hsa/test/ctrl/run_kernel.h @@ -0,0 +1,90 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#ifndef TEST_CTRL_RUN_KERNEL_H_
+#define TEST_CTRL_RUN_KERNEL_H_
+
+#include "ctrl/test_hsa.h"
+#include "util/test_assert.h"
+
+// Build a test around a kernel of type 'Kernel', wrap it in a 'Test' object
+// and run it 'count' times on the given agent and queue.
+// @param argc, argv Forwarded to the test's Initialize
+// @param agent_info Agent to run on, passed to TestHsa::SetAgentInfo
+// @param queue Queue to submit to, passed to TestHsa::SetQueue
+// @param count Number of Run/VerifyResults iterations
+// @return bool result of the last VerifyResults call (the Setup result when
+//         count is less than one), false if any earlier step fails
+template <class Kernel, class Test>
+bool RunKernel(int argc = 0, char* argv[] = NULL, const AgentInfo* agent_info = NULL,
+               hsa_queue_t* queue = NULL, int count = 1) {
+  bool ret_val = false;
+
+  if (getenv("ROC_TEST_TRACE") == NULL) std::clog.rdbuf(NULL);
+
+  // Create test kernel object
+  Kernel test_kernel;
+
+  TestHsa* test_hsa = new TestHsa(&test_kernel);
+  test_hsa->SetAgentInfo(agent_info);
+  test_hsa->SetQueue(queue);
+
+  // The wrapper is deleted on every exit path below
+  TestAql* test_aql = new Test(test_hsa);
+  TEST_ASSERT(test_aql != NULL);
+  if (test_aql == NULL) return false;
+
+  // Initialization of Hsa Runtime
+  ret_val = test_aql->Initialize(argc, argv);
+  if (ret_val == false) {
+    std::cerr << "Error in the test initialization" << std::endl;
+    // TEST_ASSERT(ret_val);
+    delete test_aql;
+    return false;
+  }
+
+  // Setup Hsa resources needed for execution
+  ret_val = test_aql->Setup();
+  if (ret_val == false) {
+    std::cerr << "Error in creating hsa resources" << std::endl;
+    TEST_ASSERT(ret_val);
+    delete test_aql;
+    return false;
+  }
+
+  // Kernel dispatch iterations
+  for (int i = 0; i < count; ++i) {
+    // Run test kernel
+    ret_val = test_aql->Run();
+    if (ret_val == false) {
+      std::cerr << "Error in running the test kernel" << std::endl;
+      TEST_ASSERT(ret_val);
+      delete test_aql;
+      return false;
+    }
+
+    // Verify the results of the execution
+    ret_val = test_aql->VerifyResults();
+    if (ret_val) {
+      std::clog << "Test : Passed" << std::endl;
+    } else {
+      std::clog << "Test : Failed" << std::endl;
+    }
+  }
+
+  // Print time taken by sample
+  test_aql->PrintTime();
+
+  test_aql->Cleanup();
+  delete test_aql;
+
+  return ret_val;
+}
+
+#endif  // TEST_CTRL_RUN_KERNEL_H_
diff --git a/test/hsa/test/ctrl/test_aql.h b/test/hsa/test/ctrl/test_aql.h
new file mode 100644
index 00000000..d77363ee
--- /dev/null
+++ b/test/hsa/test/ctrl/test_aql.h
@@ -0,0 +1,77 @@
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_AQL_H_ +#define TEST_CTRL_TEST_AQL_H_ + +#include +#include + +#include "util/hsa_rsrc_factory.h" + +// Test AQL interface +class TestAql { + public: + explicit TestAql(TestAql* t = 0) : test_(t) {} + virtual ~TestAql() { + if (test_) delete test_; + } + + TestAql* Test() { return test_; } + virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; } + virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; } + virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? 
test_->GetRsrcFactory() : 0; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + virtual bool Initialize(int argc, char** argv) { + return (test_) ? test_->Initialize(argc, argv) : true; + } + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + virtual bool Setup() { return (test_) ? test_->Setup() : true; } + + // Run the kernel + // @return bool true on success and false on failure + virtual bool Run() { return (test_) ? test_->Run() : true; } + + // Verify results + // @return bool true on success and false on failure + virtual bool VerifyResults() { return (test_) ? test_->VerifyResults() : true; } + + // Print to console the time taken to execute kernel + virtual void PrintTime() { + if (test_) test_->PrintTime(); + } + + // Release resources e.g. memory allocations + // @return bool true on success and false on failure + virtual bool Cleanup() { return (test_) ? test_->Cleanup() : true; } + + private: + TestAql* const test_; +}; + +#endif // TEST_CTRL_TEST_AQL_H_ diff --git a/test/hsa/test/ctrl/test_hsa.cpp b/test/hsa/test/ctrl/test_hsa.cpp new file mode 100644 index 00000000..638f7b1a --- /dev/null +++ b/test/hsa/test/ctrl/test_hsa.cpp @@ -0,0 +1,279 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#include "ctrl/test_hsa.h"
+
+#include
+
+#include "util/test_assert.h"
+#include "util/helper_funcs.h"
+#include "util/hsa_rsrc_factory.h"
+
+HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL;
+
+// Create the shared resource factory on first call and print the GPU agents
+// @return HsaRsrcFactory* the shared factory
+HsaRsrcFactory* TestHsa::HsaInstantiate() {
+  // Instantiate an instance of Hsa Resources Factory
+  if (hsa_rsrc_ == NULL) {
+    hsa_rsrc_ = HsaRsrcFactory::Create();
+    // Print properties of the agents
+    hsa_rsrc_->PrintGpuAgents("> GPU agents");
+  }
+  return hsa_rsrc_;
+}
+
+// Destroy the shared resource factory
+void TestHsa::HsaShutdown() {
+  if (hsa_rsrc_) hsa_rsrc_->Destroy();
+  // Destroy() deletes the factory, drop the pointer so that it is not reused
+  hsa_rsrc_ = NULL;
+}
+
+// Prepare timers, agent, queue, completion signal and the code object file name
+// @return bool true on success and false on failure
+bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) {
+  std::clog << "TestHsa::Initialize :" << std::endl;
+
+  // Instantiate a Timer object
+  setup_timer_idx_ = hsa_timer_.CreateTimer();
+  dispatch_timer_idx_ = hsa_timer_.CreateTimer();
+
+  if (hsa_rsrc_ == NULL) {
+    TEST_ASSERT(false);
+    return false;
+  }
+
+  // Create an instance of Gpu agent
+  if (agent_info_ == NULL) {
+    const uint32_t agent_id = 0;
+    if (!hsa_rsrc_->GetGpuAgentInfo(agent_id, &agent_info_)) {
+      agent_info_ = NULL;
+      std::cerr << "> error: agent[" << agent_id << "] is not found" << std::endl;
+      return false;
+    }
+  }
+  std::clog << "> Using agent[" << agent_info_->dev_index << "] : " << agent_info_->name << std::endl;
+
+  // Create an instance of Aql Queue
+  if (hsa_queue_ == NULL) {
+    const uint32_t num_pkts = 128;
+    if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) {
+      hsa_queue_ = NULL;
+      TEST_ASSERT(false);
+      // No queue was created, so it must not be marked as owned
+      return false;
+    }
+    my_queue_ = true;
+  }
+
+  // Obtain handle of signal
+  if (hsa_rsrc_->CreateSignal(1, &hsa_signal_) == false) {
+    TEST_ASSERT(false);
+    return false;
+  }
+
+  // Obtain the code object file name
+  std::string agentName(agent_info_->name);
+  brig_path_obj_.append(agentName);
+  brig_path_obj_.append("_" + name_ + ".hsaco");
+
+  return true;
+}
+
+bool TestHsa::Setup() {
+  std::clog << "TestHsa::setup :" << std::endl;
+
+  // Start the timer object
+  hsa_timer_.StartTimer(setup_timer_idx_);
+
+  // 
Load and Finalize Kernel Code Descriptor + const char* brig_path = brig_path_obj_.c_str(); + bool suc = hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, symb_.c_str(), &hsa_exec_, + &kernel_code_desc_); + if (suc == false) { + std::cerr << "Error in loading and finalizing Kernel" << std::endl; + return false; + } + + mem_map_t& mem_map = test_->GetMemMap(); + for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { + mem_descr_t& des = it->second; + if (des.size == 0) continue; + + switch (des.id) { + case TestKernel::LOCAL_DES_ID: + des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); + break; + case TestKernel::KERNARG_DES_ID: { + // Check the kernel args size + const size_t kernarg_size = des.size; + size_t size_info = 0; + const hsa_status_t status = hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info); + TEST_ASSERT(status == HSA_STATUS_SUCCESS); + size_info = kernarg_size; + const bool kernarg_missmatch = (kernarg_size > size_info); + if (kernarg_missmatch) { + std::cout << "kernarg_size = " << kernarg_size << ", size_info = " << size_info + << std::flush << std::endl; + TEST_ASSERT(!kernarg_missmatch); + break; + } + // ALlocate kernarg memory + des.size = size_info; + des.ptr = hsa_rsrc_->AllocateKernArgMemory(agent_info_, size_info); + if (des.ptr) memset(des.ptr, 0, size_info); + break; + } + case TestKernel::SYS_DES_ID: + des.ptr = hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + if (des.ptr) memset(des.ptr, 0, des.size); + break; + case TestKernel::NULL_DES_ID: + des.ptr = NULL; + break; + default: + break; + } + TEST_ASSERT(des.ptr != NULL); + if (des.ptr == NULL) return false; + } + test_->Init(); + + // Stop the timer object + hsa_timer_.StopTimer(setup_timer_idx_); + setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_); + total_time_taken_ = setup_time_taken_; + + return true; +} + +bool TestHsa::Run() { + std::clog << "TestHsa::run :" << std::endl; 
+ + const uint32_t work_group_size = 64; + const uint32_t work_grid_size = test_->GetGridSize(); + uint32_t group_segment_size = 0; + uint32_t private_segment_size = 0; + uint64_t code_handle = 0; + + // Retrieve the amount of group memory needed + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + + // Retrieve the amount of private memory needed + hsa_executable_symbol_get_info(kernel_code_desc_, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size); + + + // Retrieve handle of the code block + hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &code_handle); + + // Initialize the dispatch packet. + hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + // Set the packet's type, barrier bit, acquire and release fences + aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; + // Populate Aql packet with default values + aql.setup = 1; + aql.grid_size_x = work_grid_size; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.workgroup_size_x = work_group_size; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + // Bind the kernel code descriptor and arguments + aql.kernel_object = code_handle; + aql.kernarg_address = test_->GetKernargPtr(); + aql.group_segment_size = group_segment_size; + aql.private_segment_size = private_segment_size; + // Initialize Aql packet with handle of signal + hsa_signal_store_relaxed(hsa_signal_, 1); + aql.completion_signal = hsa_signal_; + + std::clog << "> Executing kernel: \"" << name_ << "\"" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(dispatch_timer_idx_); + + // Submit AQL packet to the queue + const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql); + + std::clog << "> Waiting on 
kernel dispatch signal, que_idx=" << que_idx << std::endl << std::flush;
+
+  // Wait on the dispatch signal until the kernel is finished.
+  // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling
+  if (hsa_signal_wait_scacquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+                                HSA_WAIT_STATE_BLOCKED) != 0) {
+    // A bare string literal is always true and could never trip the assertion
+    TEST_ASSERT(false && "signal_wait failed");
+    return false;
+  }
+
+  std::clog << "> DONE, que_idx=" << que_idx << std::endl;
+
+  // Stop the timer object
+  hsa_timer_.StopTimer(dispatch_timer_idx_);
+  dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_);
+  total_time_taken_ += dispatch_time_taken_;
+
+  return true;
+}
+
+// Compare the kernel output with the reference output of the test
+// @return bool true if the output matches or the test has no output,
+//         false on mismatch or if the output could not be fetched
+bool TestHsa::VerifyResults() {
+  bool cmp = false;
+  void* output = NULL;
+  const uint32_t size = test_->GetOutputSize();
+  bool suc = false;
+
+  if (size == 0) return true;
+
+  // Copy local kernel output buffers from local memory into host memory
+  if (test_->IsOutputLocal()) {
+    output = hsa_rsrc_->AllocateSysMemory(agent_info_, size);
+    if (output != NULL) {
+      suc = hsa_rsrc_->Memcpy(agent_info_, output, test_->GetOutputPtr(), size);
+      if (!suc) std::clog << "> VerifyResults: Memcpy failed" << std::endl << std::flush;
+    } else {
+      // Do not copy into a buffer that was never allocated
+      std::clog << "> VerifyResults: AllocateSysMemory failed" << std::endl << std::flush;
+    }
+  } else {
+    output = test_->GetOutputPtr();
+    suc = true;
+  }
+
+  if ((output != NULL) && suc) {
+    // Print the test output
+    test_->PrintOutput(output);
+    // Compare the results and see if they match
+    cmp = (memcmp(output, test_->GetRefOut(), size) == 0);
+  }
+
+  // Only the staging buffer allocated above is owned here
+  if (test_->IsOutputLocal() && (output != NULL)) hsa_rsrc_->FreeMemory(output);
+
+  return cmp;
+}
+
+// Print the setup, dispatch and total times recorded for this test
+void TestHsa::PrintTime() {
+  std::clog << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_
+            << std::endl;
+  std::clog << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_
+            << std::endl;
+  std::clog << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_
+            << std::endl;
+}
+
+bool TestHsa::Cleanup() {
+  hsa_executable_destroy(hsa_exec_);
+  hsa_signal_destroy(hsa_signal_);
+  if (my_queue_) 
hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + agent_info_ = NULL; + return true; +} diff --git a/test/hsa/test/ctrl/test_hsa.h b/test/hsa/test/ctrl/test_hsa.h new file mode 100644 index 00000000..bb54c600 --- /dev/null +++ b/test/hsa/test/ctrl/test_hsa.h @@ -0,0 +1,129 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_HSA_H_ +#define TEST_CTRL_TEST_HSA_H_ + +#include "ctrl/test_aql.h" +#include "ctrl/test_kernel.h" +#include "util/hsa_rsrc_factory.h" +#include "util/perf_timer.h" + +// Class implements HSA test +class TestHsa : public TestAql { + public: + // Instantiate HSA resources + static HsaRsrcFactory* HsaInstantiate(); + static void HsaShutdown(); + + // Constructor + explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()), symb_(test->SymbName()) { + total_time_taken_ = 0; + setup_time_taken_ = 0; + dispatch_time_taken_ = 0; + agent_info_ = NULL; + hsa_queue_ = NULL; + my_queue_ = false; + hsa_exec_ = {}; + } + + // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + hsa_agent_t HsaAgent() { return agent_info_->dev_id; } + const AgentInfo* GetAgentInfo() { return agent_info_; } + void SetAgentInfo(const AgentInfo* agent_info) { agent_info_ = agent_info; } + hsa_queue_t* GetQueue() { return hsa_queue_; } + void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + bool Initialize(int argc, char** argv); + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + bool Setup(); + + // Run the BinarySearch kernel + // @return bool true on success and false on failure + bool Run(); + + // Verify against reference implementation + // @return bool true on success and false on failure + bool VerifyResults(); + + // Print to console the time taken to execute kernel + void PrintTime(); + + // Release resources e.g. 
memory allocations + // @return bool true on success and false on failure + bool Cleanup(); + + private: + typedef TestKernel::mem_descr_t mem_descr_t; + typedef TestKernel::mem_map_t mem_map_t; + typedef TestKernel::mem_it_t mem_it_t; + + // Test object + TestKernel* test_; + + // Path of Brig file + std::string brig_path_obj_; + + // Used to track time taken to run the sample + double total_time_taken_; + double setup_time_taken_; + double dispatch_time_taken_; + + // Handle of signal + hsa_signal_t hsa_signal_; + + // Handle of Kernel Code Descriptor + hsa_executable_symbol_t kernel_code_desc_; + + // Instance of timer object + uint32_t setup_timer_idx_; + uint32_t dispatch_timer_idx_; + PerfTimer hsa_timer_; + + // Instance of Hsa Resources Factory + static HsaRsrcFactory* hsa_rsrc_; + + // Handle to an Hsa Gpu Agent + const AgentInfo* agent_info_; + + // Handle to an Hsa Queue + hsa_queue_t* hsa_queue_; + bool my_queue_; + + // Test kernel name + std::string name_; + + // Test kernel name + std::string symb_; + + // Kernel executable + hsa_executable_t hsa_exec_; +}; + +#endif // TEST_CTRL_TEST_HSA_H_ diff --git a/test/hsa/test/ctrl/test_kernel.h b/test/hsa/test/ctrl/test_kernel.h new file mode 100644 index 00000000..0ca89200 --- /dev/null +++ b/test/hsa/test/ctrl/test_kernel.h @@ -0,0 +1,138 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
// Class implements kernel test: it keeps the memory descriptors of a kernel
// and exports the kernarg, output and reference-output buffers by fixed IDs
class TestKernel {
 public:
  // Exported buffers IDs
  enum buf_id_t { KERNARG_EXP_ID, OUTPUT_EXP_ID, REFOUT_EXP_ID };
  // Memory descriptors IDs
  enum des_id_t { NULL_DES_ID, LOCAL_DES_ID, KERNARG_DES_ID, SYS_DES_ID, REFOUT_DES_ID };

  // Memory descriptor declaration
  struct mem_descr_t {
    des_id_t id;
    void* ptr;
    uint32_t size;
  };

  // Memory map declaration, keyed by buffer ID
  typedef std::map<uint32_t, mem_descr_t> mem_map_t;
  typedef mem_map_t::iterator mem_it_t;
  typedef mem_map_t::const_iterator mem_const_it_t;

  virtual ~TestKernel() {}

  // Initialize method
  virtual void Init() = 0;

  // Return kernel memory map
  mem_map_t& GetMemMap() { return mem_map_; }

  // Return NULL descriptor
  static mem_descr_t NullDescriptor() { return {NULL_DES_ID, NULL, 0}; }

  // Check if descriptor is local
  bool IsLocal(const mem_descr_t& descr) const { return (descr.id == LOCAL_DES_ID); }

  // Methods to get the kernel attributes
  // If the corresponding buffer was never registered, a NULL descriptor
  // (NULL_DES_ID, NULL pointer, zero size) is reported
  const mem_descr_t& GetKernargDescr() { return ExportedDescr(KERNARG_EXP_ID); }
  const mem_descr_t& GetOutputDescr() { return ExportedDescr(OUTPUT_EXP_ID); }
  void* GetKernargPtr() { return GetKernargDescr().ptr; }
  uint32_t GetKernargSize() { return GetKernargDescr().size; }
  void* GetOutputPtr() { return GetOutputDescr().ptr; }
  uint32_t GetOutputSize() { return GetOutputDescr().size; }
  bool IsOutputLocal() { return IsLocal(GetOutputDescr()); }
  virtual uint32_t GetGridSize() const = 0;

  // Return reference output, NULL if no reference buffer was registered
  void* GetRefOut() { return ExportedDescr(REFOUT_EXP_ID).ptr; }

  // Print output
  virtual void PrintOutput(const void* ptr) const = 0;

  // Return name
  virtual std::string Name() const = 0;

  // Return symbol name, the kernel name with the ".kd" suffix
  virtual std::string SymbName() { return Name() + ".kd"; }

 protected:
  // Set buffer descriptor; a KERNARG_DES_ID descriptor is also exported
  // as the kernarg buffer
  // @return bool false if 'buf_id' was already registered
  bool SetInDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    bool suc = SetMemDescr(buf_id, des_id, size);
    if (des_id == KERNARG_DES_ID) {
      test_map_[KERNARG_EXP_ID] = &mem_map_[buf_id];
    }
    return suc;
  }

  // Set results descriptor and export it as the output buffer
  // @return bool false if 'buf_id' was already registered
  bool SetOutDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    bool suc = SetMemDescr(buf_id, des_id, size);
    test_map_[OUTPUT_EXP_ID] = &mem_map_[buf_id];
    return suc;
  }

  // Set host descriptor backed by malloc'ed memory; a REFOUT_DES_ID
  // descriptor is also exported as the reference output
  // @return bool false if 'buf_id' was already registered or malloc failed
  // NOTE(review): this class never frees the malloc'ed buffer - confirm that
  // the owner of the descriptor releases it
  bool SetHostDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    bool suc = SetMemDescr(buf_id, des_id, size);
    if (suc) {
      mem_descr_t& descr = mem_map_[buf_id];
      descr.ptr = malloc(size);
      if (des_id == REFOUT_DES_ID) {
        test_map_[REFOUT_EXP_ID] = &descr;
      }
      if (descr.ptr == NULL) suc = false;
    }
    return suc;
  }

  // Get memory descriptor, the NULL descriptor if 'buf_id' is unknown
  mem_descr_t GetDescr(const uint32_t& buf_id) const {
    mem_const_it_t it = mem_map_.find(buf_id);
    return (it != mem_map_.end()) ? it->second : NullDescriptor();
  }

 private:
  // Exported descriptors map declaration
  typedef std::map<uint32_t, mem_descr_t*> test_map_t;

  // Look up an exported descriptor without inserting into the map; falls
  // back to a shared NULL descriptor so that callers never dereference a
  // null pointer for a buffer which was not registered
  const mem_descr_t& ExportedDescr(const uint32_t& exp_id) const {
    static const mem_descr_t null_descr = {NULL_DES_ID, NULL, 0};
    test_map_t::const_iterator it = test_map_.find(exp_id);
    if ((it == test_map_.end()) || (it->second == NULL)) return null_descr;
    return *(it->second);
  }

  // Set memory descriptor
  // @return bool false if 'buf_id' is already present, the existing
  // descriptor is left untouched in that case
  bool SetMemDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    const mem_descr_t des = {des_id, NULL, size};
    auto ret = mem_map_.insert(mem_map_t::value_type(buf_id, des));
    return ret.second;
  }

  // Kernel memory map object
  mem_map_t mem_map_;
  // Test memory map object, exported buffer ID to descriptor in 'mem_map_'
  test_map_t test_map_;
};
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +/** + dummy kernel + */ +__kernel void DummyKernel() { + uint tid = get_global_id(0); +} diff --git a/test/hsa/test/dummy_kernel/dummy_kernel.h b/test/hsa/test/dummy_kernel/dummy_kernel.h new file mode 100644 index 00000000..1b8ce430 --- /dev/null +++ b/test/hsa/test/dummy_kernel/dummy_kernel.h @@ -0,0 +1,71 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ +#define TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements DummyKernel kernel parameters +class DummyKernel : public TestKernel { + public: + // Kernel buffers IDs + enum { KERNARG_BUF_ID, LOCAL_BUF_ID }; + + // Constructor + DummyKernel() : + width_(64), + height_(64) + { + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, 0); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, 0); + } + + // Initialize method + void Init() {} + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output + void PrintOutput(const void* ptr) const {} + + // Return name + std::string Name() const { return std::string("DummyKernel"); } + + private: + // Reference CPU implementation + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight) { return true; } + + // Width of the Input array + const uint32_t width_; + + // Height of the Input array + const uint32_t height_; +}; + +#endif // TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ diff --git a/test/hsa/test/run.sh b/test/hsa/test/run.sh new file mode 100755 index 00000000..32848317 --- /dev/null +++ b/test/hsa/test/run.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
# test trace
export ROC_TEST_TRACE=1

# kernels loading iterations
export ROCP_KITER=50
# kernels dispatching iterations per kernel load
# dispatching to the same queue
export ROCP_DITER=50
# GPU agents number
export ROCP_AGENTS=2
# host threads number
# each thread creates a queue per GPU agent
export ROCP_THRS=3

# Run the test binary and keep its exit status, so that a failing test is
# reported to the caller instead of being masked by an unconditional success
./test/ctrl
status=$?

#valgrind --leak-check=full $tbin
#valgrind --tool=massif $tbin
#ms_print massif.out.

exit $status
+ * @param output Output matrix after performing convolution + * @param input Input matrix on which convolution is to be performed + * @param mask mask matrix using which convolution was to be performed + * @param inputDimensions dimensions of the input matrix + * @param maskDimensions dimensions of the mask matrix + */ +__kernel void SimpleConvolution(__global uint * output, + __global uint * input, + __global float * mask, + const uint2 inputDimensions, + const uint2 maskDimensions) { + + uint tid = get_global_id(0); + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint x = tid%width; + uint y = tid/width; + + uint maskWidth = maskDimensions.x; + uint maskHeight = maskDimensions.y; + + uint vstep = (maskWidth -1)/2; + uint hstep = (maskHeight -1)/2; + + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + uint left = (x < vstep) ? 0 : (x - vstep); + uint right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + uint top = (y < hstep) ? 0 : (y - hstep); + uint bottom = ((y + hstep) >= height)? height - 1: (y + hstep); + + // initializing wighted sum value + float sumFX = 0; + + for(uint i = left; i <= right; ++i) { + for(uint j = top; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep)); + uint index = j * width + i; + sumFX += ((float)input[index] * mask[maskIndex]); + } + } + + // To round to the nearest integer + sumFX += 0.5f; + output[tid] = (uint)sumFX; +} diff --git a/test/hsa/test/simple_convolution/simple_convolution.cpp b/test/hsa/test/simple_convolution/simple_convolution.cpp new file mode 100644 index 00000000..546f9a6a --- /dev/null +++ b/test/hsa/test/simple_convolution/simple_convolution.cpp @@ -0,0 +1,388 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "simple_convolution/simple_convolution.h" + +#include +#include +#include + +#include "util/helper_funcs.h" +#include "util/test_assert.h" + +const uint32_t SimpleConvolution::input_data_[]{ + 15, 201, 51, 89, 92, 34, 96, 66, 11, 225, 161, 96, 81, 211, 108, 124, 202, 244, 182, + 90, 215, 92, 98, 20, 44, 225, 55, 247, 202, 0, 45, 218, 202, 97, 51, 39, 131, 147, + 105, 143, 116, 11, 239, 198, 222, 92, 67, 169, 81, 250, 3, 40, 86, 101, 60, 131, 70, + 116, 123, 17, 117, 168, 236, 64, 10, 31, 103, 142, 179, 209, 29, 40, 220, 13, 239, 187, + 105, 50, 100, 186, 44, 104, 227, 131, 205, 32, 6, 20, 149, 130, 38, 10, 43, 18, 75, + 53, 50, 178, 195, 230, 132, 225, 14, 96, 238, 253, 27, 88, 48, 128, 18, 92, 232, 246, + 224, 182, 23, 231, 203, 172, 105, 241, 183, 148, 4, 2, 202, 55, 181, 142, 29, 57, 111, + 43, 153, 93, 41, 181, 181, 89, 54, 200, 182, 31, 190, 150, 213, 213, 126, 160, 130, 232, + 146, 57, 125, 151, 59, 71, 206, 240, 213, 236, 42, 68, 24, 195, 162, 65, 121, 87, 155, + 175, 31, 81, 207, 222, 232, 164, 180, 102, 69, 55, 79, 216, 112, 204, 112, 171, 19, 63, + 156, 233, 43, 198, 46, 67, 138, 208, 132, 4, 39, 32, 180, 71, 113, 131, 38, 90, 40, + 219, 193, 109, 18, 16, 70, 131, 220, 182, 46, 240, 245, 203, 217, 32, 146, 7, 100, 28, + 216, 233, 32, 255, 9, 213, 71, 123, 88, 110, 213, 128, 74, 150, 238, 93, 166, 52, 224, + 131, 234, 15, 115, 224, 218, 76, 1, 108, 84, 101, 137, 44, 79, 170, 44, 88, 127, 116, + 211, 216, 226, 168, 88, 45, 63, 70, 138, 230, 123, 107, 105, 101, 122, 220, 70, 84, 41, + 71, 193, 125, 173, 75, 169, 252, 245, 213, 84, 117, 73, 40, 77, 44, 209, 166, 90, 16, + 237, 229, 246, 104, 80, 95, 206, 202, 60, 20, 31, 101, 92, 225, 226, 9, 44, 140, 5, + 34, 97, 89, 151, 171, 129, 229, 216, 82, 139, 51, 99, 120, 24, 89, 225, 104, 185, 175, + 50, 246, 196, 82, 91, 32, 51, 62, 42, 96, 202, 47, 130, 44, 137, 26, 215, 10, 255, + 176, 93, 138, 227, 193, 3, 251, 27, 229, 
100, 212, 149, 151, 202, 89, 233, 38, 122, 29, + 100, 164, 125, 46, 212, 0, 90, 93, 26, 50, 103, 25, 226, 197, 164, 198, 135, 168, 194, + 162, 141, 38, 119, 34, 190, 66, 124, 167, 104, 247, 197, 204, 156, 67, 251, 112, 67, 85, + 205, 93, 135, 53, 119, 106, 251, 28, 49, 130, 196, 243, 36, 82, 26, 155, 117, 216, 221, + 241, 128, 70, 233, 70, 18, 133, 137, 14, 245, 204, 99, 195, 42, 235, 248, 161, 86, 243, + 190, 135, 118, 130, 123, 154, 213, 150, 54, 74, 111, 20, 60, 240, 90, 37, 54, 109, 171, + 191, 123, 161, 140, 222, 100, 182, 202, 93, 88, 32, 80, 23, 168, 198, 153, 36, 97, 111, + 187, 151, 185, 43, 172, 245, 27, 6, 27, 82, 115, 199, 18, 239, 104, 158, 206, 205, 85, + 152, 42, 174, 185, 123, 197, 98, 65, 95, 135, 163, 206, 66, 59, 136, 109, 231, 125, 137, + 237, 153, 219, 97, 96, 237, 81, 201, 140, 31, 150, 226, 183, 192, 144, 113, 59, 86, 212, + 125, 182, 91, 33, 132, 158, 92, 12, 12, 68, 138, 149, 50, 36, 113, 147, 133, 95, 229, + 78, 235, 4, 228, 206, 188, 165, 95, 45, 225, 181, 1, 94, 107, 93, 128, 240, 251, 220, + 252, 7, 32, 135, 156, 83, 171, 14, 230, 48, 109, 203, 126, 89, 208, 99, 39, 140, 9, + 134, 185, 234, 60, 187, 73, 167, 24, 201, 152, 20, 166, 148, 27, 199, 28, 184, 26, 199, + 198, 0, 248, 52, 204, 119, 141, 157, 218, 181, 41, 227, 59, 227, 206, 119, 159, 23, 31, + 184, 224, 183, 204, 134, 76, 231, 77, 105, 160, 103, 48, 103, 104, 41, 155, 53, 160, 41, + 210, 123, 222, 252, 95, 26, 223, 45, 146, 126, 68, 177, 54, 37, 105, 3, 171, 182, 235, + 249, 31, 139, 97, 80, 243, 202, 121, 143, 0, 26, 184, 210, 149, 151, 207, 244, 177, 174, + 34, 67, 45, 102, 245, 100, 140, 95, 104, 55, 21, 83, 49, 53, 223, 147, 134, 210, 93, + 0, 97, 93, 26, 26, 48, 175, 178, 255, 164, 99, 174, 198, 167, 220, 45, 156, 64, 185, + 252, 168, 241, 18, 252, 35, 71, 219, 182, 205, 173, 19, 206, 15, 113, 232, 42, 161, 152, + 220, 160, 60, 64, 79, 3, 231, 43, 49, 132, 108, 235, 128, 21, 220, 146, 17, 255, 218, + 236, 182, 168, 154, 201, 118, 170, 58, 94, 212, 220, 246, 177, 125, 51, 
241, 204, 55, 216, + 248, 104, 92, 100, 83, 221, 121, 48, 111, 138, 47, 73, 119, 230, 241, 17, 175, 103, 187, + 234, 198, 144, 199, 188, 65, 68, 240, 51, 17, 39, 11, 9, 143, 104, 109, 227, 70, 231, + 19, 181, 113, 66, 255, 233, 41, 241, 250, 217, 89, 182, 196, 31, 71, 139, 220, 137, 208, + 204, 188, 225, 243, 200, 234, 131, 48, 88, 102, 119, 63, 121, 44, 177, 188, 44, 154, 229, + 29, 149, 190, 118, 76, 130, 150, 147, 14, 114, 28, 222, 62, 217, 191, 50, 161, 170, 181, + 210, 2, 28, 73, 66, 149, 117, 243, 81, 162, 141, 55, 191, 35, 245, 54, 111, 120, 204, + 2, 134, 62, 31, 100, 125, 248, 36, 175, 153, 206, 101, 107, 209, 129, 181, 19, 22, 43, + 7, 104, 205, 149, 159, 140, 184, 149, 195, 39, 14, 143, 42, 148, 205, 73, 249, 74, 66, + 30, 250, 219, 237, 96, 71, 190, 225, 253, 210, 248, 40, 218, 96, 245, 111, 0, 130, 39, + 150, 69, 79, 165, 212, 122, 57, 162, 195, 51, 237, 6, 82, 231, 225, 63, 71, 41, 253, + 41, 38, 208, 33, 78, 170, 130, 68, 26, 131, 198, 66, 26, 12, 145, 191, 224, 11, 249, + 130, 207, 44, 112, 213, 126, 88, 183, 190, 160, 225, 187, 201, 8, 140, 235, 87, 55, 109, + 155, 81, 241, 98, 147, 11, 110, 37, 202, 79, 49, 195, 210, 0, 240, 66, 214, 110, 154, + 142, 44, 58, 111, 232, 4, 119, 117, 239, 207, 172, 93, 106, 254, 78, 205, 145, 89, 59, + 183, 35, 138, 232, 230, 92, 233, 214, 159, 191, 69, 58, 78, 114, 116, 189, 91, 121, 53, + 208, 104, 4, 125, 198, 111, 123, 20, 60, 13, 109, 120, 196, 145, 3, 172, 119, 95, 150, + 78, 255, 85, 147, 57, 163, 6, 174, 97, 97, 39, 151, 50, 144, 155, 175, 86, 11, 43, + 107, 71, 56, 216, 191, 253, 105, 194, 170, 225, 34, 64, 47, 34, 150, 195, 91, 58, 201, + 10, 155, 43, 49, 50, 93, 194, 206, 13, 25, 217, 56, 132, 33, 112, 92, 225, 109, 198, + 164, 23, 167, 199, 88, 215, 234, 238, 155, 69, 40, 100, 80, 196, 144, 129, 246, 237, 68, + 197, 250, 93, 159, 51, 225, 193, 163, 62, 163, 17, 4, 71, 41, 172, 15, 130, 132, 249, + 112, 31, 63, 152, 132, 143, 92, 20, 17, 83, 1, 86, 25, 252, 179, 185, 47, 149, 122, + 211, 211, 29, 229, 216, 
101, 15, 133, 117, 145, 9, 111, 1, 40, 175, 154, 173, 62, 247, + 193, 80, 75, 194, 166, 100, 191, 90, 29, 239, 239, 152, 194, 195, 182, 168, 156, 27, 183, + 33, 145, 73, 43, 0, 75, 83, 175, 229, 0, 238, 221, 194, 63, 40, 133, 230, 140, 68, + 64, 170, 51, 48, 66, 246, 243, 248, 159, 144, 20, 87, 177, 165, 160, 220, 166, 235, 48, + 86, 209, 49, 68, 174, 243, 132, 214, 120, 106, 99, 189, 170, 13, 241, 219, 80, 232, 207, + 72, 135, 95, 92, 223, 16, 2, 127, 237, 169, 107, 29, 255, 61, 79, 68, 236, 67, 200, + 194, 188, 50, 38, 121, 221, 52, 107, 184, 132, 84, 136, 204, 219, 231, 41, 186, 248, 44, + 58, 229, 213, 166, 3, 212, 227, 82, 25, 207, 150, 225, 146, 82, 20, 185, 204, 242, 237, + 55, 170, 113, 139, 50, 62, 103, 26, 103, 34, 18, 148, 93, 247, 105, 3, 251, 62, 231, + 77, 87, 182, 227, 57, 73, 54, 77, 2, 2, 63, 239, 57, 234, 97, 197, 29, 159, 44, + 55, 7, 79, 74, 155, 172, 66, 5, 175, 61, 67, 150, 139, 155, 77, 111, 212, 151, 165, + 34, 153, 167, 98, 137, 225, 77, 234, 166, 107, 138, 211, 163, 145, 34, 237, 45, 206, 47, + 50, 126, 108, 117, 21, 248, 17, 98, 103, 230, 249, 12, 9, 147, 179, 107, 29, 149, 185, + 7, 59, 37, 146, 14, 200, 35, 49, 182, 80, 0, 230, 130, 126, 83, 248, 148, 75, 9, + 247, 178, 240, 240, 190, 249, 132, 114, 101, 161, 7, 30, 169, 67, 68, 59, 82, 12, 95, + 131, 195, 176, 131, 169, 51, 2, 252, 44, 150, 72, 54, 141, 250, 38, 126, 185, 31, 3, + 44, 132, 165, 52, 163, 78, 120, 231, 138, 202, 244, 234, 77, 183, 155, 209, 97, 207, 212, + 94, 251, 107, 166, 49, 249, 161, 88, 120, 91, 120, 123, 135, 253, 33, 188, 160, 112, 52, + 136, 250, 254, 125, 229, 76, 53, 128, 30, 150, 79, 243, 244, 75, 95, 155, 125, 88, 60, + 213, 209, 152, 78, 77, 32, 75, 110, 220, 236, 222, 17, 117, 217, 15, 242, 190, 92, 39, + 63, 123, 190, 143, 111, 178, 219, 206, 78, 88, 38, 138, 46, 247, 34, 124, 69, 66, 199, + 179, 31, 179, 145, 48, 41, 106, 64, 27, 41, 157, 67, 105, 24, 1, 249, 135, 179, 212, + 86, 1, 44, 124, 140, 91, 116, 175, 215, 185, 242, 159, 108, 17, 83, 254, 66, 
124, 105, + 131, 151, 146, 32, 218, 252, 57, 219, 245, 193, 143, 201, 23, 145, 246, 148, 30, 82, 8, + 206, 41, 194, 192, 201, 47, 210, 28, 46, 20, 152, 151, 151, 48, 42, 184, 11, 38, 241, + 231, 28, 179, 119, 230, 202, 8, 220, 94, 39, 46, 103, 245, 88, 42, 181, 33, 90, 136, + 62, 136, 156, 214, 31, 52, 7, 74, 237, 19, 113, 223, 250, 141, 146, 113, 115, 92, 122, + 80, 187, 161, 126, 35, 150, 215, 78, 76, 249, 168, 212, 55, 48, 113, 14, 80, 166, 21, + 154, 147, 40, 12, 114, 35, 153, 5, 148, 12, 98, 15, 92, 29, 176, 219, 65, 71, 179, + 143, 147, 172, 56, 104, 227, 104, 218, 241, 185, 128, 7, 84, 20, 47, 96, 135, 82, 249, + 140, 231, 6, 238, 246, 99, 12, 167, 63, 77, 238, 242, 221, 130, 158, 21, 235, 129, 126, + 197, 114, 56, 69, 121, 140, 90, 169, 237, 225, 252, 231, 109, 228, 237, 91, 219, 81, 104, + 130, 144, 181, 113, 130, 147, 244, 32, 169, 223, 162, 39, 164, 21, 95, 234, 143, 236, 68, + 57, 217, 37, 53, 192, 147, 25, 174, 239, 245, 0, 87, 119, 144, 13, 232, 19, 160, 220, + 51, 73, 188, 214, 113, 96, 235, 209, 75, 122, 190, 144, 179, 151, 181, 233, 88, 73, 3, + 7, 56, 248, 7, 143, 112, 152, 156, 89, 171, 61, 53, 223, 135, 242, 181, 248, 83, 161, + 202, 158, 28, 136, 46, 208, 32, 228, 186, 121, 45, 189, 128, 102, 182, 136, 246, 38, 32, + 147, 127, 204, 208, 181, 171, 87, 167, 97, 80, 250, 2, 26, 153, 31, 163, 200, 239, 195, + 172, 169, 60, 218, 103, 188, 65, 30, 69, 55, 68, 102, 202, 196, 50, 154, 121, 221, 242, + 33, 63, 67, 28, 66, 93, 181, 97, 0, 126, 81, 196, 43, 251, 0, 5, 98, 189, 70, + 128, 3, 126, 197, 105, 72, 137, 155, 227, 3, 121, 214, 36, 184, 25, 65, 250, 118, 247, + 91, 119, 117, 173, 60, 160, 168, 60, 166, 10, 250, 237, 139, 253, 107, 80, 102, 180, 217, + 2, 151, 221, 123, 109, 1, 52, 134, 66, 46, 253, 57, 138, 117, 175, 55, 178, 79, 223, + 239, 245, 234, 233, 226, 117, 231, 78, 198, 78, 2, 159, 80, 154, 124, 204, 7, 126, 0, + 142, 193, 47, 140, 251, 185, 2, 170, 241, 180, 249, 208, 163, 239, 186, 141, 210, 48, 116, + 32, 246, 195, 34, 150, 19, 188, 
19, 224, 196, 146, 224, 83, 83, 15, 224, 78, 201, 226, + 249, 186, 151, 243, 139, 58, 226, 70, 199, 181, 118, 60, 213, 109, 255, 248, 3, 19, 181, + 23, 243, 122, 169, 212, 205, 252, 228, 173, 75, 173, 144, 68, 104, 39, 55, 243, 98, 26, + 57, 41, 207, 175, 102, 165, 29, 102, 158, 32, 121, 83, 56, 109, 205, 225, 66, 155, 222, + 38, 73, 42, 212, 218, 110, 60, 1, 166, 48, 99, 193, 105, 141, 145, 25, 244, 54, 54, + 90, 213, 87, 212, 40, 143, 66, 246, 112, 132, 146, 79, 171, 220, 121, 128, 182, 232, 189, + 184, 143, 237, 27, 80, 86, 169, 226, 112, 158, 25, 166, 248, 238, 253, 204, 23, 141, 15, + 13, 254, 147, 160, 77, 63, 124, 199, 191, 50, 175, 124, 234, 62, 105, 6, 143, 192, 176, + 113, 48, 78, 139, 215, 71, 121, 213, 20, 144, 98, 35, 158, 96, 183, 62, 174, 246, 187, + 117, 182, 237, 37, 50, 216, 99, 156, 223, 243, 93, 143, 101, 142, 222, 240, 101, 37, 106, + 58, 57, 250, 157, 93, 153, 254, 20, 216, 172, 10, 147, 34, 192, 129, 71, 243, 90, 171, + 144, 57, 159, 238, 201, 4, 124, 167, 244, 225, 205, 95, 28, 7, 89, 185, 100, 243, 184, + 121, 203, 100, 131, 95, 135, 68, 224, 207, 56, 58, 122, 201, 115, 25, 183, 61, 30, 51, + 229, 18, 21, 178, 113, 49, 186, 203, 235, 31, 191, 163, 152, 138, 8, 28, 233, 143, 97, + 202, 95, 153, 4, 217, 98, 120, 243, 26, 182, 17, 77, 155, 36, 99, 78, 150, 149, 8, + 98, 128, 39, 33, 36, 192, 172, 45, 220, 149, 189, 61, 96, 28, 215, 100, 246, 58, 221, + 233, 84, 147, 251, 162, 47, 31, 5, 125, 181, 154, 134, 23, 27, 174, 57, 64, 110, 229, + 109, 75, 123, 43, 136, 219, 71, 95, 64, 61, 154, 29, 39, 238, 177, 34, 145, 225, 65, + 150, 94, 247, 49, 229, 15, 77, 147, 72, 141, 2, 45, 251, 77, 169, 38, 213, 132, 110, + 53, 196, 172, 207, 226, 212, 190, 148, 246, 79, 117, 56, 230, 212, 48, 23, 185, 63, 100, + 76, 136, 242, 78, 181, 237, 156, 95, 20, 113, 227, 131, 167, 168, 47, 119, 139, 3, 53, + 31, 250, 133, 149, 50, 107, 105, 99, 130, 34, 162, 231, 111, 42, 217, 190, 224, 199, 90, + 63, 220, 204, 35, 95, 115, 203, 143, 234, 86, 147, 32, 118, 141, 165, 
11, 192, 16, 117, + 35, 147, 152, 198, 123, 7, 240, 84, 198, 209, 28, 33, 17, 248, 237, 52, 88, 97, 255, + 231, 76, 86, 122, 109, 204, 8, 18, 216, 201, 35, 77, 237, 183, 229, 179, 50, 237, 164, + 135, 179, 118, 164, 213, 135, 157, 195, 187, 245, 36, 187, 220, 113, 18, 87, 222, 222, 96, + 241, 183, 42, 21, 4, 23, 205, 233, 203, 0, 214, 112, 136, 138, 230, 44, 95, 110, 201, + 34, 41, 191, 71, 229, 155, 185, 247, 243, 151, 214, 84, 137, 141, 126, 159, 146, 149, 108, + 124, 97, 109, 82, 209, 245, 221, 183, 34, 60, 37, 236, 95, 79, 171, 167, 53, 71, 96, + 45, 58, 248, 3, 142, 129, 145, 12, 33, 36, 162, 142, 160, 3, 251, 243, 213, 240, 208, + 141, 19, 13, 178, 255, 109, 2, 170, 20, 55, 241, 116, 101, 44, 108, 105, 186, 238, 251, + 199, 15, 31, 106, 157, 191, 110, 152, 178, 67, 137, 131, 208, 156, 144, 131, 155, 253, 134, + 70, 18, 190, 55, 134, 35, 99, 243, 140, 30, 225, 135, 230, 240, 166, 81, 142, 102, 191, + 39, 25, 3, 177, 156, 211, 77, 45, 87, 233, 43, 221, 48, 61, 155, 103, 195, 191, 203, + 182, 75, 233, 152, 211, 208, 136, 121, 33, 23, 224, 224, 62, 249, 227, 239, 149, 183, 61, + 195, 15, 39, 238, 236, 87, 43, 136, 191, 239, 71, 138, 166, 147, 116, 62, 102, 68, 199, + 224, 101, 223, 193, 70, 29, 186, 42, 13, 80, 225, 75, 19, 241, 115, 1, 221, 202, 45, + 102, 137, 29, 174, 20, 195, 66, 136, 2, 168, 205, 201, 137, 50, 168, 74, 121, 198, 4, + 163, 212, 85, 133, 31, 105, 118, 146, 106, 84, 93, 152, 187, 231, 181, 105, 251, 121, 171, + 132, 123, 84, 81, 69, 221, 132, 238, 40, 253, 181, 45, 161, 137, 130, 39, 169, 235, 158, + 59, 86, 242, 153, 239, 173, 128, 165, 23, 123, 30, 195, 0, 154, 23, 81, 224, 245, 214, + 206, 30, 212, 131, 75, 117, 12, 206, 157, 181, 186, 59, 241, 17, 45, 138, 0, 219, 11, + 165, 243, 135, 196, 182, 135, 95, 205, 217, 63, 195, 175, 14, 225, 131, 145, 45, 249, 158, + 251, 150, 84, 182, 209, 70, 199, 255, 209, 199, 219, 220, 109, 206, 99, 50, 132, 234, 146, + 82, 195, 209, 22, 114, 223, 247, 246, 113, 37, 239, 16, 33, 134, 100, 215, 88, 170, 158, 
+ 87, 123, 102, 50, 88, 211, 1, 187, 6, 134, 165, 152, 216, 105, 106, 239, 220, 74, 231, + 210, 187, 12, 194, 204, 45, 72, 49, 4, 160, 219, 162, 248, 87, 8, 43, 176, 220, 44, + 107, 227, 178, 17, 124, 139, 122, 230, 122, 87, 48, 97, 42, 236, 110, 236, 185, 155, 53, + 234, 159, 214, 198, 66, 206, 30, 75, 249, 206, 40, 38, 57, 11, 217, 74, 136, 100, 197, + 110, 223, 29, 159, 65, 71, 140, 175, 51, 69, 74, 105, 48, 234, 63, 246, 45, 13, 20, + 121, 7, 226, 161, 46, 28, 173, 7, 103, 53, 108, 45, 164, 76, 74, 68, 141, 145, 208, + 61, 197, 22, 136, 46, 70, 115, 110, 60, 161, 124, 81, 26, 132, 51, 188, 178, 79, 106, + 186, 183, 160, 39, 228, 68, 115, 46, 136, 1, 192, 89, 62, 133, 112, 198, 180, 182, 58, + 34, 243, 219, 158, 69, 245, 34, 120, 178, 213, 200, 28, 143, 128, 188, 182, 100, 1, 41, + 146, 137, 43, 82, 227, 105, 216, 83, 48, 140, 10, 106, 175, 254, 70, 77, 67, 59, 112, + 188, 237, 69, 133, 10, 212, 5, 198, 138, 105, 199, 180, 252, 81, 223, 79, 53, 73, 39, + 137, 121, 180, 148, 228, 99, 146, 42, 177, 214, 102, 33, 147, 84, 102, 25, 94, 59, 31, + 37, 197, 137, 237, 122, 133, 63, 90, 213, 116, 163, 253, 253, 29, 177, 145, 2, 21, 36, + 45, 198, 251, 147, 231, 143, 232, 78, 168, 71, 137, 199, 108, 79, 80, 90, 201, 214, 153, + 35, 172, 13, 199, 169, 11, 228, 91, 157, 231, 112, 193, 20, 54, 189, 167, 30, 77, 144, + 108, 245, 215, 246, 189, 68, 69, 14, 158, 14, 228, 55, 50, 145, 69, 249, 58, 80, 222, + 149, 237, 198, 5, 175, 218, 60, 109, 130, 91, 186, 18, 200, 175, 234, 190, 109, 46, 3, + 123, 204, 18, 96, 4, 68, 241, 73, 62, 44, 154, 29, 193, 136, 227, 199, 55, 189, 4, + 164, 64, 95, 95, 82, 39, 15, 60, 230, 124, 107, 233, 248, 55, 251, 89, 60, 63, 75, + 134, 126, 119, 32, 156, 57, 168, 127, 0, 224, 61, 5, 133, 125, 100, 228, 208, 140, 243, + 12, 114, 111, 119, 92, 104, 175, 87, 193, 236, 151, 13, 114, 21, 132, 146, 177, 189, 59, + 49, 190, 27, 110, 195, 160, 236, 40, 132, 188, 181, 120, 201, 40, 232, 65, 132, 80, 241, + 220, 18, 221, 115, 31, 79, 137, 164, 226, 58, 98, 
29, 108, 32, 57, 219, 228, 218, 199, + 13, 95, 132, 195, 215, 77, 235, 191, 143, 112, 16, 128, 76, 35, 93, 191, 66, 173, 73, + 231, 143, 132, 73, 173, 240, 106, 231, 203, 78, 193, 147, 92, 33, 23, 31, 248, 100, 11, + 184, 243, 123, 201, 115, 200, 236, 209, 135, 47, 126, 209, 22, 14, 85, 95, 188, 69, 202, + 163, 17, 24, 101, 164, 117, 134, 187, 148, 127, 31, 159, 55, 19, 27, 1, 135, 227, 237, + 89, 107, 28, 216, 60, 51, 230, 145, 147, 163, 215, 93, 70, 232, 118, 172, 140, 235, 50, + 71, 128, 177, 103, 32, 233, 123, 60, 234, 2, 31, 216, 91, 139, 244, 52, 200, 40, 26, + 90, 188, 189, 49, 25, 4, 25, 144, 176, 166, 124, 227, 237, 252, 148, 85, 29, 125, 208, + 89, 104, 210, 121, 64, 46, 4, 53, 99, 204, 93, 125, 38, 25, 59, 88, 51, 64, 113, + 195, 241, 23, 64, 212, 5, 60, 104, 90, 90, 230, 42, 179, 78, 253, 44, 143, 44, 49, + 196, 143, 254, 34, 13, 36, 60, 73, 125, 112, 137, 239, 52, 122, 7, 116, 79, 12, 177, + 183, 103, 11, 158, 146, 190, 237, 143, 235, 124, 188, 28, 65, 76, 26, 100, 89, 63, 160, + 163, 188, 17, 44, 172, 69, 167, 179, 185, 246, 191, 107, 174, 38, 118, 76, 184, 53, 58, + 72, 32, 182, 5, 61, 248, 81, 88, 92, 170, 152, 253, 77, 84, 14, 122, 1, 83, 34, + 180, 13, 25, 115, 120, 199, 154, 238, 20, 83, 36, 79, 155, 68, 5, 160, 130, 254, 242, + 218, 90, 156, 114, 87, 234, 199, 101, 101, 200, 185, 135, 124, 198, 160, 240, 62, 104, 138, + 45, 125, 222, 81, 204, 122, 150, 210, 26, 24, 208, 12, 242, 42, 169, 101, 130, 148, 44, + 232, 249, 245, 161, 128, 113, 103, 33, 98, 166, 137, 236, 212, 7, 202, 38, 211, 69, 188, + 165, 95, 212, 118, 108, 199, 161, 22, 45, 35, 170, 90, 11, 163, 79, 173, 36, 193, 20, + 69, 35, 187, 207, 16, 144, 214, 219, 182, 170, 32, 114, 79, 128, 71, 198, 237, 15, 103, + 4, 60, 139, 175, 150, 151, 82, 230, 68, 119, 168, 89, 188, 204, 20, 140, 220, 165, 98, + 184, 91, 12, 217, 205, 92, 90, 20, 35, 71, 36, 138, 76, 96, 22, 251, 247, 173, 78, + 222, 241, 197, 134, 75, 130, 83, 96, 14, 47, 5, 113, 232, 96, 126, 193, 45, 218, 28, + 66, 253, 99, 103, 
136, 176, 200, 158, 171, 191, 76, 249, 158, 62, 190, 37, 137, 65, 120, + 233, 80, 168, 238, 193, 145, 79, 63, 82, 125, 26, 111, 191, 24, 210, 39, 161, 131, 239, + 64, 46, 175, 140, 39, 77, 202, 230, 115, 84, 40, 235, 62, 120, 148, 45, 57, 37, 124, + 121, 120, 249, 148, 231, 185, 172, 186, 224, 77, 61, 207, 141, 107, 126, 26, 147, 204, 229, + 121, 63, 58, 161, 43, 120, 25, 191, 165, 83, 228, 34, 205, 92, 27, 97, 67, 213, 13, + 253, 182, 91, 59, 133, 233, 166, 4, 4, 57, 209, 233, 179, 16, 35, 85, 59, 155, 111, + 250, 65, 194, 223, 99, 144, 59, 127, 241, 127, 85, 255, 125, 11, 90, 184, 145, 68, 95, + 150, 72, 153, 103, 49, 76, 120, 85, 161, 179, 241, 16, 174, 51, 211, 142, 150, 99, 201, + 22, 85, 73, 108, 84, 199, 120, 175, 128, 9, 243, 223, 160, 59, 120, 8, 109, 197, 128, + 194, 103, 52, 180, 119, 227, 231, 75, 113, 126, 175, 59, 148, 4, 132, 1, 89, 75, 121, + 8, 204, 131, 251, 171, 36, 55, 36, 44, 165, 233, 172, 103, 80, 224, 28, 200, 195, 3, + 20, 53, 129, 195, 112, 22, 200, 244, 23, 34, 64, 145, 42, 12, 20, 38, 184, 56, 94, + 220, 101, 3, 198, 17, 107, 22, 242, 135, 222, 182, 138, 243, 235, 11, 182, 91, 34, 127, + 80, 58, 161, 145, 203, 204, 158, 224, 242, 86, 24, 81, 51, 126, 84, 249, 143, 191, 15, + 130, 70, 238, 57, 209, 225, 36, 221, 152, 128, 255, 24, 208, 57, 186, 97, 4, 134, 255, + 229, 121, 86, 254, 202, 137, 124, 31, 130, 12, 222, 146, 142, 37, 129, 199, 247, 98, 236, + 212, 251, 108, 211, 20, 60, 13, 206, 158, 18, 84}; + +SimpleConvolution::SimpleConvolution() { + width_ = 64; + height_ = 64; + mask_width_ = 3; + mask_height_ = mask_width_; + randomize_seed_ = 0; + + if (!IsPowerOf2(width_)) { + width_ = RoundToPowerOf2(width_); + } + + if (!IsPowerOf2(height_)) { + height_ = RoundToPowerOf2(height_); + } + + if (!(mask_width_ % 2)) { + mask_width_++; + } + + if (!(mask_height_ % 2)) { + mask_height_++; + } + + if (width_ * height_ < 256) { + width_ = 64; + height_ = 64; + } + + const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t); + 
const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float); + + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, sizeof(kernel_args_t)); + SetInDescr(INPUT_BUF_ID, SYS_DES_ID, input_size_bytes); + SetInDescr(MASK_BUF_ID, SYS_DES_ID, mask_size_bytes); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, input_size_bytes); + SetHostDescr(REFOUT_BUF_ID, REFOUT_DES_ID, input_size_bytes); + + if (!randomize_seed_) TEST_ASSERT(sizeof(input_data_) <= input_size_bytes); +} + +void SimpleConvolution::Init() { + std::clog << "SimpleConvolution::init :" << std::endl; + + mem_descr_t kernarg_des = GetDescr(KERNARG_BUF_ID); + mem_descr_t input_des = GetDescr(INPUT_BUF_ID); + mem_descr_t mask_des = GetDescr(MASK_BUF_ID); + mem_descr_t output_des = GetDescr(LOCAL_BUF_ID); +#if 0 + printf("kernarg_des %p 0x%x\n", kernarg_des.ptr, kernarg_des.size); + printf("input_des %p 0x%x\n", input_des.ptr, input_des.size); + printf("mask_des %p 0x%x\n", mask_des.ptr, mask_des.size); + printf("output_des %p 0x%x\n", output_des.ptr, output_des.size); +#endif + uint32_t* input = reinterpret_cast(input_des.ptr); + uint32_t* output_local = reinterpret_cast(output_des.ptr); + float* mask = reinterpret_cast(mask_des.ptr); + kernel_args_t* kernel_args = reinterpret_cast(kernarg_des.ptr); + + if (randomize_seed_) { + // random initialisation of input + FillRandom(input, width_, height_, 0, 255, randomize_seed_); + } else { + // initialization with preset values + memcpy(input, input_data_, width_ * height_ * sizeof(uint32_t)); + } + + // Fill a blurr filter or some other filter of your choice + const float val = 1.0f / (mask_width_ * 2.0f - 1.0f); + for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) { + mask[i] = 0; + } + for (uint32_t i = 0; i < mask_width_; i++) { + uint32_t y = mask_height_ / 2; + mask[y * mask_width_ + i] = val; + } + for (uint32_t i = 0; i < mask_height_; i++) { + uint32_t x = mask_width_ / 2; + mask[i * mask_width_ + x] = val; + } + + // Print the INPUT array. 
+ std::clog << std::dec; + PrintArray("> Input[0]", input, width_, 1); + PrintArray("> Mask", mask, mask_width_, mask_height_); + + // Fill the kernel args + kernel_args->arg1 = output_local; + kernel_args->arg2 = input; + kernel_args->arg3 = mask; + kernel_args->arg4 = width_; + kernel_args->arg41 = height_; + kernel_args->arg5 = mask_width_; + kernel_args->arg51 = mask_height_; + + // Calculate the reference output + ReferenceImplementation(reinterpret_cast(GetRefOut()), input, mask, width_, height_, + mask_width_, mask_height_); +} + +void SimpleConvolution::PrintOutput(const void* ptr) const { + PrintArray("> Output[0]", reinterpret_cast(ptr), width_, 1); +} + +bool SimpleConvolution::ReferenceImplementation(uint32_t* output, const uint32_t* input, + const float* mask, const uint32_t width, + const uint32_t height, const uint32_t mask_width, + const uint32_t mask_height) { + const uint32_t vstep = (mask_width - 1) / 2; + const uint32_t hstep = (mask_height - 1) / 2; + + // for each pixel in the input + for (uint32_t x = 0; x < width; x++) { + for (uint32_t y = 0; y < height; y++) { + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + const uint32_t left = (x < vstep) ? 0 : (x - vstep); + const uint32_t right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + const uint32_t top = (y < hstep) ? 0 : (y - hstep); + const uint32_t bottom = ((y + hstep) >= height) ? 
height - 1 : (y + hstep); + + // initializing wighted sum value + float sum_fx = 0; + for (uint32_t i = left; i <= right; ++i) { + for (uint32_t j = top; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint32_t mask_idx = (j - (y - hstep)) * mask_width + (i - (x - vstep)); + uint32_t index = j * width + i; + + // to round to the nearest integer + sum_fx += ((float)input[index] * mask[mask_idx]); + } + } + sum_fx += 0.5f; + output[y * width + x] = uint32_t(sum_fx); + } + } + + return true; +} diff --git a/test/hsa/test/simple_convolution/simple_convolution.h b/test/hsa/test/simple_convolution/simple_convolution.h new file mode 100644 index 00000000..550d1320 --- /dev/null +++ b/test/hsa/test/simple_convolution/simple_convolution.h @@ -0,0 +1,94 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ +#define TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements SimpleConvolution kernel parameters +class SimpleConvolution : public TestKernel { + public: + // Kernel buffers IDs + enum { INPUT_BUF_ID, LOCAL_BUF_ID, MASK_BUF_ID, KERNARG_BUF_ID, REFOUT_BUF_ID }; + + // Constructor + SimpleConvolution(); + + // Initialize method + void Init(); + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output + void PrintOutput(const void* ptr) const; + + // Return name + std::string Name() const { return std::string("SimpleConvolution"); } + + private: + // Local kernel arguments declaration + struct kernel_args_t { + void* arg1; + void* arg2; + void* arg3; + uint32_t arg4; + uint32_t arg41; + uint32_t arg5; + uint32_t arg51; + }; + + // Reference CPU implementation of Simple Convolution + // @param output Output matrix after performing convolution + // @param input Input matrix on which convolution is to be performed + // @param mask mask matrix using which convolution was to be performed + // @param input_dimensions dimensions of the input matrix + // @param mask_dimensions dimensions of the mask matrix + // @return bool true on success and false on failure + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight); + + // Width of the Input array + uint32_t width_; + + // Height of the Input array + uint32_t height_; + + // Mask dimensions + uint32_t mask_width_; + + // Mask dimensions + uint32_t mask_height_; + + // Randomize input data + unsigned randomize_seed_; + + // Input data + static const uint32_t input_data_[]; +}; + +#endif // 
TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ diff --git a/test/hsa/test/util/evt_stats.h b/test/hsa/test/util/evt_stats.h new file mode 100644 index 00000000..01bc1317 --- /dev/null +++ b/test/hsa/test/util/evt_stats.h @@ -0,0 +1,98 @@ +#ifndef EVT_STATS_H_ +#define EVT_STATS_H_ + +#include + +#include +#include +#include +#include + +template +class EvtStatsT { + public: + typedef std::mutex mutex_t; + typedef uint64_t evt_count_t; + typedef double evt_avr_t; + struct evt_record_t { + uint64_t count; + evt_avr_t avr; + evt_record_t() : count(0), avr(0) {} + }; + typedef typename std::map map_t; + typedef typename std::map labels_t; + + // Comparison function + struct cmpfun { + template bool operator()(const T& a, const T& b) const { + return (a.second.avr != b.second.avr) ? a.second.avr < b.second.avr : a.first < b.first; + } + }; + + inline void add_event(evt_id_t id, evt_weight_t weight) { + std::lock_guard lck(mutex_); + //printf("EvtStats %p ::add_event %u %lu\n", this, id, weight); fflush(stdout); + + evt_record_t& rec = map_[id]; + const evt_count_t prev_count = rec.count; + const evt_count_t new_count = prev_count + 1; + const evt_avr_t prev_avr = rec.avr; + const evt_avr_t new_avr = ((prev_avr * prev_count) + weight) / new_count; + + rec.count = new_count; + rec.avr = new_avr; + } + + void dump() { + std::lock_guard lck(mutex_); + fprintf(stdout, "Dumping %s\n", path_); fflush(stdout); + + typedef typename std::set, cmpfun> set_t; + set_t s_(map_.begin(), map_.end()); + + uint64_t index = 0; + for (auto& e : s_) { + const evt_id_t id = e.first; + const char* label = get_label(id); + std::ostringstream oss; + oss << index << ",\"" << label << "\"," << e.second.count << "," << (uint64_t)(e.second.avr) << "," << (uint64_t)(e.second.count * e.second.avr); + fprintf(fdes_, "%s\n", oss.str().c_str()); + index += 1; + } + + fclose(fdes_); + } + + const char* get_label(const uint32_t& id) { + auto ret = labels_.insert({id, NULL}); + const char* label = 
ret.first->second; + return label; + } + const char* get_label(const char* id) { + return id; + } + const char* get_label(const std::string& id) { + return id.c_str(); + } + + void set_label(evt_id_t id, const char* label) { + //printf("EvtStats %p ::set_label %u %s\n", this, id, label); fflush(stdout); + labels_[id] = label; + } + + EvtStatsT(FILE* f, const char* path) : fdes_(f), path_(path) { + //printf("EvtStats %p ::EvtStatsT()\n", this); fflush(stdout); + fprintf(fdes_, "Index,Name,Count,Avr,Total\n"); + } + + private: + mutex_t mutex_; + map_t map_; + labels_t labels_; + FILE* fdes_; + const char* path_; +}; + +typedef EvtStatsT EvtStats; + +#endif // EVT_STATS_H_ diff --git a/test/hsa/test/util/helper_funcs.h b/test/hsa/test/util/helper_funcs.h new file mode 100644 index 00000000..c76854ba --- /dev/null +++ b/test/hsa/test/util/helper_funcs.h @@ -0,0 +1,86 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_UTIL_HELPER_FUNCS_H_ +#define TEST_UTIL_HELPER_FUNCS_H_ + +#include +#include +#include +#include +#include + +static inline void Error(std::string error_msg) { + std::cerr << "Error: " << error_msg << std::endl; +} + +template +void PrintArray(const std::string header, const T* data, const int width, const int height) { + std::clog << header << " :\n"; + for (int i = 0; i < height; i++) { + std::clog << "> "; + for (int j = 0; j < width; j++) { + std::clog << data[i * width + j] << " "; + } + std::clog << "\n"; + } +} + +template +bool FillRandom(T* array_ptr, const int width, const int height, const T range_min, + const T range_max, unsigned int seed = 123) { + if (!array_ptr) { + Error("Cannot fill array. 
NULL pointer."); + return false; + } + + if (!seed) seed = (unsigned int)time(NULL); + + srand(seed); + double range = double(range_max - range_min) + 1.0; + + /* random initialisation of input */ + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int index = i * width + j; + array_ptr[index] = range_min + T(range * rand() / (RAND_MAX + 1.0)); + } + + return true; +} + +template T RoundToPowerOf2(T val) { + int bytes = sizeof(T); + + val--; + for (int i = 0; i < bytes; i++) val |= val >> (1 << i); + val++; + + return val; +} + +template bool IsPowerOf2(T val) { + long long long_val = val; + return (((long_val & (-long_val)) - long_val == 0) && (long_val != 0)); +} + +#endif // TEST_UTIL_HELPER_FUNCS_H_ diff --git a/test/hsa/test/util/hsa_rsrc_factory.cpp b/test/hsa/test/util/hsa_rsrc_factory.cpp new file mode 120000 index 00000000..f3726ccf --- /dev/null +++ b/test/hsa/test/util/hsa_rsrc_factory.cpp @@ -0,0 +1 @@ +../../src/hsa_rsrc_factory.cpp \ No newline at end of file diff --git a/test/hsa/test/util/hsa_rsrc_factory.h b/test/hsa/test/util/hsa_rsrc_factory.h new file mode 120000 index 00000000..64af96f1 --- /dev/null +++ b/test/hsa/test/util/hsa_rsrc_factory.h @@ -0,0 +1 @@ +../../src/hsa_rsrc_factory.h \ No newline at end of file diff --git a/test/hsa/test/util/perf_timer.cpp b/test/hsa/test/util/perf_timer.cpp new file mode 100644 index 00000000..85c490b6 --- /dev/null +++ b/test/hsa/test/util/perf_timer.cpp @@ -0,0 +1,179 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "util/perf_timer.h" + +PerfTimer::PerfTimer() { freq_in_100mhz_ = MeasureTSCFreqHz(); } + +PerfTimer::~PerfTimer() { + while (!timers_.empty()) { + Timer* temp = timers_.back(); + timers_.pop_back(); + delete temp; + } +} + +// New cretaed timer instantance index will be returned +int PerfTimer::CreateTimer() { + Timer* newTimer = new Timer; + newTimer->start = 0; + newTimer->clocks = 0; + +#ifdef _WIN32 + QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->freq); +#else + newTimer->freq = (long long)1.0E3; +#endif + + /* Push back the address of new Timer instance created */ + timers_.push_back(newTimer); + return (int)(timers_.size() - 1); +} + +int PerfTimer::StartTimer(int index) { + if (index >= (int)timers_.size()) { + Error("Cannot reset timer. 
Invalid handle."); + return FAILURE; + } + +#ifdef _WIN32 +// General Windows timing method +#ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart)); + timers_[index]->start = (double)tmpStart; +#else +// AMD Windows timing method +#endif +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + timers_[index]->start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3)); +#else + // AMD timing method + unsigned int unused; + timers_[index]->start = __rdtscp(&unused); +#endif +#endif + + return SUCCESS; +} + + +int PerfTimer::StopTimer(int index) { + double n = 0; + if (index >= (int)timers_.size()) { + Error("Cannot reset timer. Invalid handle."); + return FAILURE; + } +#ifdef _WIN32 +#ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER*)&(n1)); + n = (double)n1; +#else +// AMD Window Timing +#endif + +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3); +#else + // AMD Linux timing + unsigned int unused; + n = __rdtscp(&unused); +#endif +#endif + + n -= timers_[index]->start; + timers_[index]->start = 0; + +#ifndef _AMD + timers_[index]->clocks += n; +#else + // timers_[index]->clocks += 10 * n / freq_in_100mhz_; // unit is ns + timers_[index]->clocks += 1.0E-6 * 10 * n / freq_in_100mhz_; // convert to ms +#endif + + return SUCCESS; +} + +void PerfTimer::Error(std::string str) { std::cout << str << std::endl; } + + +double PerfTimer::ReadTimer(int index) { + if (index >= (int)timers_.size()) { + Error("Cannot read timer. 
Invalid handle."); + return FAILURE; + } + + double reading = double(timers_[index]->clocks); + + reading = double(reading / timers_[index]->freq); + + return reading; +} + + +uint64_t PerfTimer::CoarseTimestampUs() { +#ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER*)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; +#endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() { + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do { + tscTicksEnd = __rdtscp(&unused); + } while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. + uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} diff --git a/test/hsa/test/util/perf_timer.h b/test/hsa/test/util/perf_timer.h new file mode 100644 index 00000000..bfd55324 --- /dev/null +++ b/test/hsa/test/util/perf_timer.h @@ -0,0 +1,83 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_UTIL_PERF_TIMER_H_ +#define TEST_UTIL_PERF_TIMER_H_ + +// Will use AMD timer or general Linux timer based on compilation flag +// Need to consider platform is Windows or Linux + +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#include +#else +#if defined(__GNUC__) +#include +#include +#endif // __GNUC__ +#endif // _MSC_VER + +#include +#include +#include + +class PerfTimer { + public: + enum { SUCCESS = 0, FAILURE = 1 }; + + PerfTimer(); + ~PerfTimer(); + + // General Linux timing method + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + + // retrieve time + double ReadTimer(int index); + // write into a file + double WriteTimer(int index); + + private: + struct Timer { + std::string name; /* name of time object */ + long long freq; /* frequency */ + double clocks; /* number of ticks at end */ + double start; /* start point ticks */ + }; + + std::vector timers_; /* vector to Timer objects */ + double freq_in_100mhz_; + + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + void Error(std::string str); +}; + +#endif // TEST_UTIL_PERF_TIMER_H_ diff --git a/test/hsa/test/util/test_assert.h b/test/hsa/test/util/test_assert.h new file mode 100644 index 00000000..7803865d --- /dev/null +++ b/test/hsa/test/util/test_assert.h @@ -0,0 +1,35 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_UTIL_TEST_ASSERT_H_ +#define TEST_UTIL_TEST_ASSERT_H_ + +#define TEST_ASSERT(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Assert failed(" << #cond << ") at " << __FILE__ << ", line " << __LINE__ \ + << std::endl; \ + exit(-1); \ + } \ + } + +#endif // TEST_UTIL_TEST_ASSERT_H_ diff --git a/test/hsa/test/util/xml.h b/test/hsa/test/util/xml.h new file mode 100644 index 00000000..eb2f5074 --- /dev/null +++ b/test/hsa/test/util/xml.h @@ -0,0 +1,457 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/

#ifndef TEST_UTIL_XML_H_
#define TEST_UTIL_XML_H_

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

namespace xml {

// Minimal parser for the XML-like configuration format used by the tests.
//
// Format, as implemented below:
//   * tokens are separated by spaces, tabs and newlines;
//   * '#' starts a comment that runs to the end of the line;
//   * `<tag` opens a node, followed by `key=value` option tokens and a token
//     starting with '>' that ends the declaration; `<tag>` opens a node that
//     has no options; `</tag>` closes the current node;
//   * double quotes group characters (including whitespace) into one token,
//     a backslash escapes a quote or another backslash;
//   * leading `#include "file"` lines pull in other files, resolved relative
//     to the directory of the including file.
//
// Every node is registered in a map keyed by its dot-separated path starting
// at the implicit root "top" (for example "top.group.item").
class Xml {
 public:
  typedef std::vector<char> token_t;

  struct level_t;
  typedef std::vector<level_t*> nodes_t;
  typedef std::map<std::string, std::string> opts_t;
  // One parsed node: its tag, its direct children and its key/value options.
  struct level_t {
    std::string tag;
    nodes_t nodes;
    opts_t opts;
  };
  typedef std::vector<level_t*> nodes_vec_t;
  typedef std::map<std::string, nodes_vec_t> map_t;

  // Parser states: inside a `<tag key=value ...` declaration, or between tags.
  enum { DECL_STATE, BODY_STATE };

  // Parses `file_name` and every file it includes.
  //
  // `obj` is used internally for included files: when it is non-NULL the new
  // object shares the node map of `obj` and does not own it.
  //
  // Returns NULL when the file, or any file it includes, cannot be opened.
  // Malformed input terminates the process through exit(1). The returned
  // object must be released with Destroy().
  static Xml* Create(const std::string& file_name, const Xml* obj = NULL) {
    Xml* xml = new Xml(file_name, obj);
    if (!xml->Init()) {
      delete xml;
      return NULL;
    }

    // Included files are looked up next to the including file.
    const std::size_t pos = file_name.rfind('/');
    const std::string path = (pos != std::string::npos) ? file_name.substr(0, pos + 1) : "";

    xml->PreProcess();

    // The map is shared with the including objects, so "top.include" also
    // lists their directives; the "touch" option marks the handled ones.
    nodes_t incl_nodes;
    for (auto* node : xml->GetNodes("top.include")) {
      if (node->opts.find("touch") == node->opts.end()) {
        node->opts["touch"] = "";
        incl_nodes.push_back(node);
      }
    }

    for (auto* incl : incl_nodes) {
      const std::string incl_name = path + incl->opts["file"];
      Xml* ixml = Create(incl_name, xml);
      if (ixml == NULL) {
        delete xml;
        return NULL;
      }
      // The included parser only fed nodes into the shared map.
      delete ixml;
    }

    xml->Process();
    return xml;
  }

  // Releases an object returned by Create(); accepts NULL.
  static void Destroy(Xml* xml) { delete xml; }

  // Returns the file name that was passed to Create().
  std::string GetName() const { return file_name_; }

  // Registers a synthetic node under `full_tag` with the options
  // name=`name` and expr=`expr`. The node tag is the last dot-separated
  // component of `full_tag`.
  void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) {
    const std::size_t dot = full_tag.rfind('.');
    const std::size_t tag_begin = (dot == std::string::npos) ? 0 : dot + 1;

    level_t* node = new level_t;
    node->tag = full_tag.substr(tag_begin);
    node->opts["name"] = name;
    node->opts["expr"] = expr;
    (*map_)[full_tag].push_back(node);
  }

  // Same as AddExpr() with the decimal representation of `val` as expression.
  void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) {
    std::ostringstream oss;
    oss << val;
    AddExpr(full_tag, name, oss.str());
  }

  // Returns the nodes registered under the dot-separated path `global_tag`.
  // An unknown path yields an empty vector (and creates an empty map entry).
  nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; }

  // Calls `f.fun(path, node)` on a copy of `f_i` for every registered node
  // and returns that copy. A `false` result stops the iteration over the
  // nodes of the current path only; the next path is still visited.
  template <class F> F ForEach(const F& f_i) {
    F f = f_i;
    if (map_) {
      for (auto& entry : *map_) {
        for (auto node : entry.second) {
          if (f.fun(entry.first, node) == false) break;
        }
      }
    }
    return f;
  }

  // Const overload of ForEach(); same iteration contract.
  template <class F> F ForEach(const F& f_i) const {
    F f = f_i;
    if (map_) {
      for (auto& entry : *map_) {
        for (auto node : entry.second) {
          if (f.fun(entry.first, node) == false) break;
        }
      }
    }
    return f;
  }

  // ForEach functor that prints every option as "<path>.<key> = <value>".
  struct print_func {
    bool fun(const std::string& global_tag, level_t* node) {
      for (auto& opt : node->opts) {
        std::cout << global_tag << "." << opt.first << " = " << opt.second << std::endl;
      }
      return true;
    }
  };

  // Dumps all options of all nodes to std::cout.
  void Print() const {
    std::cout << "XML file '" << file_name_ << "':" << std::endl;
    ForEach(print_func());
  }

 private:
  // `obj` != NULL makes this an "included" parser that borrows the node map
  // and the current level of `obj`.
  Xml(const std::string& file_name, const Xml* obj)
      : file_name_(file_name),
        file_line_(0),
        fd_(-1),
        data_size_(0),
        index_(0),
        state_(BODY_STATE),
        comment_(false),
        included_(false),
        level_(NULL),
        map_(NULL) {
    if (obj != NULL) {
      map_ = obj->map_;
      level_ = obj->level_;
      included_ = true;
    }
  }

  // The object owns a file descriptor and (unless included) the node map.
  Xml(const Xml&) = delete;
  Xml& operator=(const Xml&) = delete;

  // ForEach functor that frees every node.
  struct delete_func {
    bool fun(const std::string&, level_t* node) {
      delete node;
      return true;
    }
  };

  ~Xml() {
    // The descriptor opened by Init() belongs to this object.
    if (fd_ != -1) close(fd_);
    // Only the top-level parser owns the shared map and its nodes.
    if (included_ == false) {
      ForEach(delete_func());
      delete map_;
    }
  }

  // Opens the file and, for a top-level parser, creates the node map with
  // the implicit root node "top". Returns false when the file cannot be
  // opened; the failure is reported through the return value only.
  bool Init() {
    fd_ = open(file_name_.c_str(), O_RDONLY);
    if (fd_ == -1) return false;

    if (map_ == NULL) {
      map_ = new map_t;
      AddLevel("top");
    }

    return true;
  }

  // Records `#include "file"` directives as "top.include" nodes carrying a
  // "file" option, then rewinds the file for Process().
  //
  // The file is examined in chunks of kBufSize bytes. After a directive the
  // next chunk starts on the following line, so consecutive directives at
  // the start of the file are all seen; a chunk that does not begin with a
  // directive is skipped as a whole.
  void PreProcess() {
    static const char kDirective[] = "#include \"";
    const size_t kDirectiveLen = sizeof(kDirective) - 1;

    // One spare byte so the terminator never overwrites data that was read.
    char buf[kBufSize + 1];
    buf[0] = '\0';
    bool error = false;

    while (true) {
      const off_t pos = lseek(fd_, 0, SEEK_CUR);
      // read() reports errors as -1: keep the result signed for the check.
      const ssize_t got = read(fd_, buf, kBufSize);
      if (got <= 0) break;
      size_t size = static_cast<size_t>(got);
      buf[size] = '\0';

      if (strncmp(buf, kDirective, kDirectiveLen) != 0) continue;

      // Locate the end of the directive line. Running out of data in a
      // partially filled chunk means end of file, which also ends the line;
      // only a completely filled chunk means the line is too long.
      size_t ind = 0;
      while ((ind < size) && (buf[ind] != '\n')) ++ind;
      if ((ind == size) && (size == kBufSize)) {
        fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize);
        error = true;
        break;
      }
      buf[ind] = '\0';
      size = ind;
      // Continue right after the directive line.
      lseek(fd_, pos + static_cast<off_t>(ind) + 1, SEEK_SET);

      // Locate the closing quote of the file name.
      ind = kDirectiveLen;
      while ((ind < size) && (buf[ind] != '"')) ++ind;
      if (ind == size) {
        error = true;
        break;
      }
      buf[ind] = '\0';

      AddLevel("include");
      AddOption("file", &buf[kDirectiveLen]);
      UpLevel();
    }

    if (error) {
      fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf);
      exit(1);
    }

    lseek(fd_, 0, SEEK_SET);
  }

  // Consumes all tokens of the file and builds the node tree.
  // Malformed input is reported through BadFormat(), which exits.
  void Process() {
    // Characters that followed a '>' inside the previous token.
    token_t remainder;

    while (true) {
      token_t token = remainder.empty() ? NextToken() : remainder;
      remainder.clear();

      // End of file
      if (token.empty()) break;

      switch (state_) {
        case BODY_STATE: {
          if (token[0] != '<') BadFormat(token);

          // "</tag>" closes the current node, "<tag" opens a new one.
          // The size check keeps a lone "<" from reading past the token.
          const bool node_begin = !((token.size() > 1) && (token[1] == '/'));
          const unsigned ind = node_begin ? 1 : 2;

          unsigned i = ind;
          while ((i < token.size()) && (token[i] != '>')) ++i;
          for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]);

          if (i == token.size()) {
            // No '>' in this token: options follow for an opening tag,
            // a closing tag has to be complete.
            if (node_begin)
              state_ = DECL_STATE;
            else
              BadFormat(token);
            token.push_back('\0');
          } else {
            token[i] = '\0';
          }

          const char* tag = &token[ind];
          if (tag[0] == '\0') BadFormat(token);

          if (node_begin) {
            AddLevel(tag);
          } else {
            // The closing tag must name the node being closed exactly, and
            // there must be an enclosing node to return to.
            if (stack_.empty() || (CurrentLevel() != tag)) {
              token[i] = '>';
              token.resize(i + 1);
              BadFormat(token);
            }
            UpLevel();
          }
          break;
        }
        case DECL_STATE: {
          if (token[0] == '>') {
            // End of the declaration; whatever follows the '>' is parsed as
            // the next token.
            state_ = BODY_STATE;
            remainder.assign(token.begin() + 1, token.end());
            continue;
          }

          // "key=value" option of the node being declared.
          token.push_back('\0');
          unsigned j = 0;
          while ((j < token.size()) && (token[j] != '=')) ++j;
          if (j == token.size()) BadFormat(token);
          token[j] = '\0';
          const char* key = &token[0];
          const char* value = &token[j + 1];
          AddOption(key, value);
          break;
        }
        default:
          std::cout << "XML parser error: wrong state: " << state_ << std::endl;
          exit(1);
      }
    }
  }

  // True when the current buffer character is a space or a tab.
  bool SpaceCheck() const {
    return (buffer_[index_] == ' ') || (buffer_[index_] == '\t');
  }

  // True when the current buffer character ends a line or belongs to a '#'
  // comment. Has side effects: a newline is replaced by a space, bumps the
  // line counter and ends the comment; a '#' starts a comment.
  bool LineEndCheck() {
    bool found = false;
    if (buffer_[index_] == '\n') {
      buffer_[index_] = ' ';
      ++file_line_;
      found = true;
      comment_ = false;
    } else if (comment_ || (buffer_[index_] == '#')) {
      found = true;
      comment_ = true;
    }
    return found;
  }

  // Returns the next whitespace-delimited token, or an empty token at end of
  // file (or on a read error). Quotes and escaping backslashes are removed
  // from the token.
  token_t NextToken() {
    token_t token;
    bool in_string = false;
    bool special_symb = false;

    while (true) {
      if (data_size_ == 0) {
        // read() reports errors as -1: check the signed result before it is
        // stored in the unsigned member.
        const ssize_t got = read(fd_, buffer_, kBufSize);
        if (got <= 0) break;
        data_size_ = static_cast<unsigned>(got);
      }

      // Leading separators are skipped only while the token is still empty;
      // a token that spans two buffers must not swallow a separator.
      if (token.empty()) {
        while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) {
          ++index_;
        }
      }

      while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) {
        const char symb = buffer_[index_];
        bool skip_symb = false;

        switch (symb) {
          case '\\':
            if (special_symb) {
              special_symb = false;
            } else {
              special_symb = true;
              skip_symb = true;
            }
            break;
          case '"':
            if (special_symb) {
              special_symb = false;
            } else {
              in_string = !in_string;
              if (!in_string) {
                // A closing quote ends the token: turn it into a space and
                // look at the same position again, which stops this loop.
                buffer_[index_] = ' ';
                continue;
              }
              skip_symb = true;
            }
            break;
        }

        if (!skip_symb) token.push_back(symb);
        ++index_;
      }

      if (index_ == data_size_) {
        // Buffer exhausted: refill and keep extending the same token.
        index_ = 0;
        data_size_ = 0;
      } else {
        if (special_symb || in_string) BadFormat(token);
        break;
      }
    }

    return token;
  }

  // Reports a malformed token with the file name and line, then exits.
  void BadFormat(token_t token) {
    token.push_back('\0');
    std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '"
              << &token[0] << "'" << std::endl;
    exit(1);
  }

  // Creates a child of the current node, makes it current and registers it
  // under its dot-separated path.
  void AddLevel(const std::string& tag) {
    level_t* node = new level_t;
    node->tag = tag;
    if (level_) {
      level_->nodes.push_back(node);
      stack_.push_back(level_);
    }
    level_ = node;

    std::string global_tag;
    for (level_t* parent : stack_) {
      global_tag += parent->tag + ".";
    }
    global_tag += tag;
    (*map_)[global_tag].push_back(level_);
  }

  // Makes the parent of the current node current again.
  // Precondition: stack_ is not empty (callers check or guarantee this).
  void UpLevel() {
    level_ = stack_.back();
    stack_.pop_back();
  }

  // Tag of the node currently being parsed.
  std::string CurrentLevel() const { return level_->tag; }

  // Sets option `key` of the current node.
  void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; }

  const std::string file_name_;
  unsigned file_line_;
  int fd_;

  static const size_t kBufSize = 256;
  char buffer_[kBufSize];

  unsigned data_size_;
  unsigned index_;
  unsigned state_;
  bool comment_;
  std::vector<level_t*> stack_;
  bool included_;
  level_t* level_;
  map_t* map_;
};

}  // namespace xml

#endif  // TEST_UTIL_XML_H_