Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Amdgpu] Add amdgpu backend profiler #7330

Merged
merged 16 commits into from
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "taichi/rhi/cuda/cuda_profiler.h"
#include "taichi/system/timeline.h"

#include "taichi/rhi/amdgpu/amdgpu_profiler.h"

namespace taichi::lang {

void KernelProfileStatisticalResult::insert_record(double t) {
Expand Down Expand Up @@ -124,6 +126,12 @@ std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch, bool enable) {
return std::make_unique<KernelProfilerCUDA>(enable);
#else
TI_NOT_IMPLEMENTED;
#endif
} else if (arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
return std::make_unique<KernelProfilerAMDGPU>();
#else
TI_NOT_IMPLEMENTED
#endif
} else {
return std::make_unique<DefaultProfiler>();
Expand Down
1 change: 1 addition & 0 deletions taichi/rhi/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ target_sources(${AMDGPU_RHI}
amdgpu_caching_allocator.cpp
amdgpu_context.cpp
amdgpu_driver.cpp
amdgpu_profiler.cpp
)

target_include_directories(${AMDGPU_RHI}
Expand Down
17 changes: 17 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include "taichi/program/program.h"
#include "taichi/system/threading.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_profiler.h"
#include "taichi/analysis/offline_cache_util.h"
#include "taichi/util/offline_cache.h"

namespace taichi {
namespace lang {
Expand Down Expand Up @@ -120,6 +122,17 @@ void AMDGPUContext::launch(void *func,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes) {
KernelProfilerBase::TaskHandle task_handle;
// Kernel launch
if (profiler_) {
KernelProfilerAMDGPU *profiler_amdgpu =
dynamic_cast<KernelProfilerAMDGPU *>(profiler_);
std::string primal_task_name, key;
bool valid =
offline_cache::try_demangle_name(task_name, primal_task_name, key);
profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name,
func, grid_dim, block_dim, 0);
}
auto pack_size = get_args_byte(arg_sizes);
char *packed_arg = (char *)std::malloc(pack_size);
pack_args(arg_pointers, arg_sizes, packed_arg);
Expand All @@ -132,6 +145,10 @@ void AMDGPUContext::launch(void *func,
reinterpret_cast<void **>(&config));
}
std::free(packed_arg);

if (profiler_)
profiler_->stop(task_handle);

if (debug_) {
driver_.stream_synchronize(nullptr);
}
Expand Down
5 changes: 5 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class AMDGPUContext {
int compute_capability_;
std::string mcpu_;
std::mutex lock_;
KernelProfilerBase *profiler_;
AMDGPUDriver &driver_;
bool debug_;

Expand All @@ -40,6 +41,10 @@ class AMDGPUContext {

int get_args_byte(std::vector<int> arg_sizes);

void set_profiler(KernelProfilerBase *profiler) {
profiler_ = profiler;
}

void launch(void *func,
const std::string &task_name,
const std::vector<void *> &arg_pointers,
Expand Down
252 changes: 252 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
#include "taichi/rhi/amdgpu/amdgpu_profiler.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#include "taichi/rhi/amdgpu/amdgpu_types.h"

namespace taichi::lang {
#if defined(TI_WITH_AMDGPU)

// Reports the name of the active AMDGPU device, as known to the context.
std::string KernelProfilerAMDGPU::get_device_name() {
  auto &context = AMDGPUContext::get_instance();
  return context.get_device_name();
}

// Metric-based profiling (hardware counters selected by name) is not
// supported on the AMDGPU backend; only event timing is available.
bool KernelProfilerAMDGPU::reinit_with_metrics(
    const std::vector<std::string> metrics) {
  TI_NOT_IMPLEMENTED
}

// Selects the profiling toolkit by name.
//
// The AMDGPU backend only supports the default event-based toolkit; any
// other request is rejected with a warning.
//
// @param toolkit_name  Requested toolkit; only "default" is accepted.
// @return true if the requested toolkit can be used, false otherwise.
bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) {
  // Idiomatic string equality instead of compare() == 0.
  if (toolkit_name == "default") {
    return true;
  }
  TI_WARN("Only default(event) profiler is allowed on AMDGPU");
  return false;
}

// Kernels are registered through trace() (which also captures launch
// metadata); the bare start-with-handle entry point is intentionally
// unsupported on this backend.
KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  TI_NOT_IMPLEMENTED;
}

// Begins timing one kernel launch and appends a traced record carrying the
// launch configuration. The elapsed time fields of the record are filled in
// later (stop() / update_record()).
//
// @param task_handle  Out: handle passed back to stop() when the launch ends.
// @param kernel_name  Demangled kernel name used for reporting.
// @param kernel       Backend kernel function pointer, queried for attributes.
// @param grid_size    Launch grid size.
// @param block_size   Launch block size.
// @param dynamic_smem_size  Dynamically allocated shared memory in bytes.
void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
                                 const std::string &kernel_name,
                                 void *kernel,
                                 uint32_t grid_size,
                                 uint32_t block_size,
                                 uint32_t dynamic_smem_size) {
  // Start the timing events before the attribute queries below.
  task_handle = event_toolkit_->start_with_handle(kernel_name);

  auto &driver = AMDGPUDriver::get_instance();
  int regs_per_thread = 0;
  int static_smem_bytes = 0;
  driver.kernel_get_attribute(
      &regs_per_thread, HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS,
      kernel);
  driver.kernel_get_attribute(
      &static_smem_bytes,
      HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
  // kernel_get_occupancy doesn't work well, so
  // active_blocks_per_multiprocessor is deliberately left unset.

  KernelProfileTracedRecord record;
  record.name = kernel_name;
  record.register_per_thread = regs_per_thread;
  record.shared_mem_per_block = static_smem_bytes + dynamic_smem_size;
  record.grid_size = grid_size;
  record.block_size = block_size;

  traced_records_.push_back(record);
}

// Finishes timing a kernel previously registered via trace().
// `handle` is the stop event created by the event toolkit: it is recorded on
// stream 0, the stream is drained so both events have completed, the elapsed
// times are read back into the current event record, and finally both events
// are released.
void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
  AMDGPUDriver::get_instance().event_record(handle, 0);
  AMDGPUDriver::get_instance().stream_synchronize(nullptr);

  // get elapsed time and destroy events
  auto record = event_toolkit_->get_current_event_record();
  // Kernel duration: start event -> stop event (`handle`).
  AMDGPUDriver::get_instance().event_elapsed_time(
      &record->kernel_elapsed_time_in_ms, record->start_event, handle);
  // Offset of this kernel's start relative to the toolkit's base event;
  // used later when placing the record on the timeline.
  AMDGPUDriver::get_instance().event_elapsed_time(
      &record->time_since_base, event_toolkit_->get_base_event(),
      record->start_event);

  AMDGPUDriver::get_instance().event_destroy(record->start_event);
  AMDGPUDriver::get_instance().event_destroy(record->stop_event);
}

// Folds every traced record into the per-kernel statistical results and the
// running total time. Records sharing a kernel name accumulate into one
// result entry, created on first sight.
//
// NOTE(review): this walks *all* of traced_records_, so if update() runs
// more than once without an intervening clear(), earlier records would be
// counted again — confirm callers always clear between reporting cycles.
bool KernelProfilerAMDGPU::statistics_on_traced_records() {
  for (auto &record : traced_records_) {
    // Find the statistics bucket for this kernel name, if one exists.
    auto it =
        std::find_if(statistical_results_.begin(), statistical_results_.end(),
                     [&](KernelProfileStatisticalResult &result) {
                       return result.name == record.name;
                     });
    if (it == statistical_results_.end()) {
      statistical_results_.emplace_back(record.name);
      it = std::prev(statistical_results_.end());
    }
    it->insert_record(record.kernel_elapsed_time_in_ms);
    total_time_ms_ += record.kernel_elapsed_time_in_ms;
  }

  return true;
}

// Blocks until all work queued on stream 0 (null stream) has completed.
void KernelProfilerAMDGPU::sync() {
  auto &driver = AMDGPUDriver::get_instance();
  driver.stream_synchronize(nullptr);
}

// Pulls finished timings from the event toolkit into traced_records_,
// refreshes the timeline and per-kernel statistics, then snapshots how many
// records have been processed so far. The order below is significant:
// records must be copied out before the toolkit's bookkeeping is cleared.
void KernelProfilerAMDGPU::update() {
  // Fill elapsed-time fields of records traced since the last sync.
  event_toolkit_->update_record(records_size_after_sync_, traced_records_);
  event_toolkit_->update_timeline(traced_records_);
  statistics_on_traced_records();
  event_toolkit_->clear();
  records_size_after_sync_ = traced_records_.size();
}

// Resets all profiling state. update() runs first so any events still held
// by the toolkit are flushed before the record containers are emptied.
void KernelProfilerAMDGPU::clear() {
  update();
  total_time_ms_ = 0;
  records_size_after_sync_ = 0;
  traced_records_.clear();
  statistical_results_.clear();
}

#else
// Stub implementations used when Taichi is built without AMDGPU support:
// every entry point aborts with TI_NOT_IMPLEMENTED.
//
// NOTE(review): there is no stub for set_profiler_toolkit() here; if the
// header declares it unconditionally, a non-AMDGPU build referencing it
// would fail to link — confirm against the class declaration.
std::string KernelProfilerAMDGPU::get_device_name() {
  TI_NOT_IMPLEMENTED
}

bool KernelProfilerAMDGPU::reinit_with_metrics(
    const std::vector<std::string> metrics) {
  TI_NOT_IMPLEMENTED
}

KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
                                 const std::string &kernel_name,
                                 void *kernel,
                                 uint32_t grid_size,
                                 uint32_t block_size,
                                 uint32_t dynamic_smem_size) {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
  TI_NOT_IMPLEMENTED
}

bool KernelProfilerAMDGPU::statistics_on_traced_records() {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::sync() {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::update() {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::clear() {
  TI_NOT_IMPLEMENTED
}

#endif

#if defined(TI_WITH_AMDGPU)

// Creates a start/stop event pair for a kernel about to launch, records the
// start event on stream 0, and returns the stop event as the task handle
// (the stop event is recorded later by KernelProfilerAMDGPU::stop()).
KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  EventRecord record;
  record.name = kernel_name;

  AMDGPUDriver::get_instance().event_create(&(record.start_event),
                                            HIP_EVENT_DEFAULT);
  AMDGPUDriver::get_instance().event_create(&(record.stop_event),
                                            HIP_EVENT_DEFAULT);
  AMDGPUDriver::get_instance().event_record((record.start_event), 0);
  event_records_.push_back(record);

  // Lazily establish the base event that anchors time_since_base, plus the
  // matching host-side timestamp (base_time_) used by the timeline.
  if (!base_event_) {
    int n_iters = 100;
    // Warm up
    for (int i = 0; i < n_iters; i++) {
      void *e;
      AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT);
      AMDGPUDriver::get_instance().event_record(e, 0);
      AMDGPUDriver::get_instance().event_synchronize(e);
      auto final_t = Time::get_time();
      if (i == n_iters - 1) {
        // Keep only the last warmed-up event as the base event.
        base_event_ = e;
        // ignore the overhead of sync, event_create and systematic time offset.
        base_time_ = final_t;
      } else {
        AMDGPUDriver::get_instance().event_destroy(e);
      }
    }
  }
  return record.stop_event;
}

// Copies the elapsed-time results collected by this toolkit into the
// profiler's traced records. The first `records_size_after_sync` entries of
// `traced_records` were filled by earlier syncs; the remaining entries
// correspond one-to-one, in order, with event_records_.
//
// @param records_size_after_sync  Number of records already processed.
// @param traced_records           Destination records to receive timings.
void EventToolkitAMDGPU::update_record(
    uint32_t records_size_after_sync,
    std::vector<KernelProfileTracedRecord> &traced_records) {
  uint32_t events_num = event_records_.size();
  uint32_t records_num = traced_records.size();
  // Report each quantity separately: the previous message labelled the sum
  // `records_size_after_sync + events_num` as event_records_.size(), which
  // made the diagnostic misleading.
  TI_ERROR_IF(
      records_size_after_sync + events_num != records_num,
      "KernelProfilerAMDGPU::EventToolkitAMDGPU: records_size_after_sync({}) "
      "+ event_records_.size({}) != traced_records_.size({})",
      records_size_after_sync, events_num, records_num);

  uint32_t idx = 0;
  for (auto &record : event_records_) {
    // copy to traced_records_ then clear event_records_
    traced_records[records_size_after_sync + idx].kernel_elapsed_time_in_ms =
        record.kernel_elapsed_time_in_ms;
    traced_records[records_size_after_sync + idx].time_since_base =
        record.time_since_base;
    idx++;
  }
}

// Inserts a begin/end timeline event pair for each traced record, when the
// timeline feature is enabled. Event timestamps are reconstructed as
// base_time_ plus the device-measured offsets, converted from milliseconds
// to seconds (* 1e-3).
//
// NOTE(review): this walks all of traced_records, so repeated update()
// calls without a clear() would insert duplicate timeline events — confirm
// callers' usage.
void EventToolkitAMDGPU::update_timeline(
    std::vector<KernelProfileTracedRecord> &traced_records) {
  if (Timelines::get_instance().get_enabled()) {
    auto &timeline = Timeline::get_this_thread_instance();
    for (auto &record : traced_records) {
      // Begin marker at the kernel's start time.
      timeline.insert_event({record.name, /*param_name=begin*/ true,
                             base_time_ + record.time_since_base * 1e-3,
                             "amdgpu"});
      // End marker at start time + kernel duration.
      timeline.insert_event({record.name, /*param_name=begin*/ false,
                             base_time_ + (record.time_since_base +
                                           record.kernel_elapsed_time_in_ms) *
                                              1e-3,
                             "amdgpu"});
    }
  }
}

#else

// Stub implementations of the event toolkit for builds without AMDGPU
// support: every entry point aborts with TI_NOT_IMPLEMENTED.
KernelProfilerBase::TaskHandle
EventToolkitAMDGPU::start_with_handle(const std::string &kernel_name) {
  TI_NOT_IMPLEMENTED;
}
void EventToolkitAMDGPU::update_record(
    uint32_t records_size_after_sync,
    std::vector<KernelProfileTracedRecord> &traced_records) {
  TI_NOT_IMPLEMENTED;
}
void EventToolkitAMDGPU::update_timeline(
    std::vector<KernelProfileTracedRecord> &traced_records) {
  TI_NOT_IMPLEMENTED;
}

#endif

} // namespace taichi::lang
Loading