Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Amdgpu] Add amdgpu backend profiler #7330

Merged
merged 16 commits into from
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "taichi/rhi/cuda/cuda_profiler.h"
#include "taichi/system/timeline.h"

#include "taichi/rhi/amdgpu/amdgpu_profiler.h"

namespace taichi::lang {

void KernelProfileStatisticalResult::insert_record(double t) {
Expand Down Expand Up @@ -124,6 +126,12 @@ std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch, bool enable) {
return std::make_unique<KernelProfilerCUDA>(enable);
#else
TI_NOT_IMPLEMENTED;
#endif
} else if (arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
return std::make_unique<KernelProfilerAMDGPU>();
#else
TI_NOT_IMPLEMENTED
#endif
} else {
return std::make_unique<DefaultProfiler>();
Expand Down
44 changes: 44 additions & 0 deletions taichi/program/kernel_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,50 @@ class KernelProfilerBase {
}
};

// Backend-agnostic interface for GPU event-based profiling toolkits.
// Concrete toolkits (e.g. the CUDA and AMDGPU event toolkits) override the
// virtual hooks below; the defaults abort via TI_NOT_IMPLEMENTED so a backend
// without event profiling fails loudly instead of returning garbage.
class EventToolkitBase {
 public:
  // Copy the per-event timings gathered since the last sync into
  // `traced_records`, starting at index `records_size_after_sync`.
  virtual void update_record(
      uint32_t records_size_after_sync,
      std::vector<KernelProfileTracedRecord> &traced_records) {
    TI_NOT_IMPLEMENTED;
  }
  // Begin timing a kernel; returns a handle the profiler later passes to
  // its stop() entry point.
  virtual KernelProfilerBase::TaskHandle start_with_handle(
      const std::string &kernel_name) {
    TI_NOT_IMPLEMENTED;
  }
  // Push begin/end events for the traced records onto the host timeline.
  virtual void update_timeline(
      std::vector<KernelProfileTracedRecord> &traced_records) {
    TI_NOT_IMPLEMENTED;
  }

 protected:
  // One entry per kernel launched between two syncs.
  struct EventRecord {
    std::string name;
    float kernel_elapsed_time_in_ms{0.0};
    float time_since_base{0.0};
    void *start_event{nullptr};
    void *stop_event{nullptr};
  };
  float64 base_time_{0.0};     // host time corresponding to base_event_
  void *base_event_{nullptr};  // reference event all offsets are measured from
  // for cuEvent profiling, clear after sync()
  std::vector<EventRecord> event_records_;

 public:
  void clear() {
    event_records_.clear();
  }
  // NOTE(review): back() on an empty vector is UB — callers must pair this
  // with a preceding start_with_handle(); confirm all call sites do.
  EventRecord *get_current_event_record() {
    return &(event_records_.back());
  }
  void *get_base_event() const {
    return base_event_;
  }
  virtual ~EventToolkitBase() = default;
};

std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch, bool enable);

} // namespace taichi::lang
1 change: 1 addition & 0 deletions taichi/rhi/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ target_sources(${AMDGPU_RHI}
amdgpu_caching_allocator.cpp
amdgpu_context.cpp
amdgpu_driver.cpp
amdgpu_profiler.cpp
)

target_include_directories(${AMDGPU_RHI}
Expand Down
17 changes: 17 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include "taichi/program/program.h"
#include "taichi/system/threading.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_profiler.h"
#include "taichi/analysis/offline_cache_util.h"
#include "taichi/util/offline_cache.h"

namespace taichi {
namespace lang {
Expand Down Expand Up @@ -120,6 +122,17 @@ void AMDGPUContext::launch(void *func,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes) {
KernelProfilerBase::TaskHandle task_handle;
// Kernel launch
if (profiler_) {
KernelProfilerAMDGPU *profiler_amdgpu =
dynamic_cast<KernelProfilerAMDGPU *>(profiler_);
std::string primal_task_name, key;
bool valid =
offline_cache::try_demangle_name(task_name, primal_task_name, key);
profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name,
func, grid_dim, block_dim, 0);
}
auto pack_size = get_args_byte(arg_sizes);
char *packed_arg = (char *)std::malloc(pack_size);
pack_args(arg_pointers, arg_sizes, packed_arg);
Expand All @@ -132,6 +145,10 @@ void AMDGPUContext::launch(void *func,
reinterpret_cast<void **>(&config));
}
std::free(packed_arg);

if (profiler_)
profiler_->stop(task_handle);

if (debug_) {
driver_.stream_synchronize(nullptr);
}
Expand Down
5 changes: 5 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class AMDGPUContext {
int compute_capability_;
std::string mcpu_;
std::mutex lock_;
KernelProfilerBase *profiler_;
AMDGPUDriver &driver_;
bool debug_;

Expand All @@ -40,6 +41,10 @@ class AMDGPUContext {

int get_args_byte(std::vector<int> arg_sizes);

  // Attach a kernel profiler (owned by the caller); pass nullptr to disable
  // profiling. launch() checks this pointer before every kernel dispatch.
  void set_profiler(KernelProfilerBase *profiler) {
    profiler_ = profiler;
  }

void launch(void *func,
const std::string &task_name,
const std::vector<void *> &arg_pointers,
Expand Down
252 changes: 252 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
#include "taichi/rhi/amdgpu/amdgpu_profiler.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#include "taichi/rhi/amdgpu/amdgpu_types.h"

namespace taichi::lang {
#if defined(TI_WITH_AMDGPU)

// Device name as reported by the active AMDGPU context.
std::string KernelProfilerAMDGPU::get_device_name() {
  auto &context = AMDGPUContext::get_instance();
  return context.get_device_name();
}

// Metric-based (hardware counter) profiling is not supported on the AMDGPU
// backend; only the default event timer is available (see
// set_profiler_toolkit below).
bool KernelProfilerAMDGPU::reinit_with_metrics(
    const std::vector<std::string> metrics) {
  TI_NOT_IMPLEMENTED
}

// Select the profiling toolkit by name. The AMDGPU backend supports only the
// default event-based profiler, so any other name is rejected with a warning.
// Returns true iff `toolkit_name` names a supported toolkit.
bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) {
  // operator== is the idiomatic equality test; std::string::compare() == 0
  // obscures intent.
  if (toolkit_name == "default") {
    return true;
  }
  TI_WARN("Only default(event) profiler is allowed on AMDGPU");
  return false;
}

// Unused on this backend: kernels are registered through trace(), which
// obtains its handle from the event toolkit instead of this entry point.
KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  TI_NOT_IMPLEMENTED;
}

// Register a kernel launch with the profiler: begin event timing through the
// event toolkit (which sets `task_handle`), then capture the kernel's static
// launch attributes (register count, shared memory, launch shape) into a new
// traced record.
void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
                                 const std::string &kernel_name,
                                 void *kernel,
                                 uint32_t grid_size,
                                 uint32_t block_size,
                                 uint32_t dynamic_smem_size) {
  int num_registers = 0;
  int static_smem_bytes = 0;
  // int max_active_blocks_per_multiprocessor = 0;
  task_handle = event_toolkit_->start_with_handle(kernel_name);

  auto &driver = AMDGPUDriver::get_instance();
  driver.kernel_get_attribute(
      &num_registers, HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS,
      kernel);
  driver.kernel_get_attribute(
      &static_smem_bytes,
      HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
  // kernel_get_occupancy doesn't work well
  // AMDGPUDriver::get_instance().kernel_get_occupancy(
  //     &max_active_blocks_per_multiprocessor, kernel, block_size,
  //     dynamic_smem_size);

  KernelProfileTracedRecord record;
  record.name = kernel_name;
  record.register_per_thread = num_registers;
  record.shared_mem_per_block = static_smem_bytes + dynamic_smem_size;
  record.grid_size = grid_size;
  record.block_size = block_size;
  // record.active_blocks_per_multiprocessor =
  //     max_active_blocks_per_multiprocessor;
  traced_records_.push_back(record);
}

// Finish timing the current kernel. `handle` is the stop event returned by
// EventToolkitAMDGPU::start_with_handle().
void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
  AMDGPUDriver::get_instance().event_record(handle, 0);
  // Block until the stop event has completed so the elapsed-time queries
  // below see finished events.
  AMDGPUDriver::get_instance().stream_synchronize(nullptr);

  // get elapsed time and destroy events
  auto record = event_toolkit_->get_current_event_record();
  AMDGPUDriver::get_instance().event_elapsed_time(
      &record->kernel_elapsed_time_in_ms, record->start_event, handle);
  // Offset of this kernel relative to the toolkit's base event; used later
  // to position the kernel on the host timeline.
  AMDGPUDriver::get_instance().event_elapsed_time(
      &record->time_since_base, event_toolkit_->get_base_event(),
      record->start_event);

  AMDGPUDriver::get_instance().event_destroy(record->start_event);
  AMDGPUDriver::get_instance().event_destroy(record->stop_event);
}

// Fold the traced records into per-kernel statistical results (one entry per
// distinct kernel name) and accumulate the running total time.
// NOTE(review): this walks ALL of traced_records_, not only the records added
// since the last sync — if update() runs more than once before clear(),
// earlier records would be counted again. Confirm against the CUDA
// profiler's intended semantics.
bool KernelProfilerAMDGPU::statistics_on_traced_records() {
  for (auto &record : traced_records_) {
    // Find the statistical bucket for this kernel name, creating it on
    // first sight.
    auto it =
        std::find_if(statistical_results_.begin(), statistical_results_.end(),
                     [&](KernelProfileStatisticalResult &result) {
                       return result.name == record.name;
                     });
    if (it == statistical_results_.end()) {
      statistical_results_.emplace_back(record.name);
      it = std::prev(statistical_results_.end());
    }
    it->insert_record(record.kernel_elapsed_time_in_ms);
    total_time_ms_ += record.kernel_elapsed_time_in_ms;
  }

  return true;
}

// Wait for all outstanding work on the default stream to complete.
void KernelProfilerAMDGPU::sync() {
  auto &driver = AMDGPUDriver::get_instance();
  driver.stream_synchronize(nullptr);
}

// Fold the event timings gathered since the last sync into traced_records_,
// refresh the timeline and aggregate statistics, then reset the event
// toolkit. Order matters: update_record() must run before the statistics
// pass so the new records carry their measured elapsed times.
void KernelProfilerAMDGPU::update() {
  event_toolkit_->update_record(records_size_after_sync_, traced_records_);
  event_toolkit_->update_timeline(traced_records_);
  statistics_on_traced_records();
  event_toolkit_->clear();
  records_size_after_sync_ = traced_records_.size();
}

// Reset all profiling state. update() runs first so that any event records
// not yet folded into traced_records_ are consumed before everything is
// discarded.
void KernelProfilerAMDGPU::clear() {
  update();
  total_time_ms_ = 0;
  records_size_after_sync_ = 0;
  traced_records_.clear();
  statistical_results_.clear();
}

#else
// Fallback stubs for builds without TI_WITH_AMDGPU: every entry point aborts
// via TI_NOT_IMPLEMENTED.
std::string KernelProfilerAMDGPU::get_device_name() {
  TI_NOT_IMPLEMENTED
}

bool KernelProfilerAMDGPU::reinit_with_metrics(
    const std::vector<std::string> metrics) {
  TI_NOT_IMPLEMENTED
}

// NOTE(review): stub added for symmetry with the TI_WITH_AMDGPU branch —
// the enabled branch defines this member but the fallback previously did
// not, which risks an undefined symbol in non-AMDGPU builds.
bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) {
  TI_NOT_IMPLEMENTED
}

KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  TI_NOT_IMPLEMENTED;
}

void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
                                 const std::string &kernel_name,
                                 void *kernel,
                                 uint32_t grid_size,
                                 uint32_t block_size,
                                 uint32_t dynamic_smem_size) {
  TI_NOT_IMPLEMENTED;
}

void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
  TI_NOT_IMPLEMENTED
}

bool KernelProfilerAMDGPU::statistics_on_traced_records() {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::sync() {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::update() {
  TI_NOT_IMPLEMENTED
}

void KernelProfilerAMDGPU::clear() {
  TI_NOT_IMPLEMENTED
}

#endif

#if defined(TI_WITH_AMDGPU)

// Create start/stop events for `kernel_name`, record the start event on the
// default stream, and return the stop event — it serves as the task handle
// that KernelProfilerAMDGPU::stop() later records and measures against.
KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  EventRecord record;
  record.name = kernel_name;

  AMDGPUDriver::get_instance().event_create(&(record.start_event),
                                            HIP_EVENT_DEFAULT);
  AMDGPUDriver::get_instance().event_create(&(record.stop_event),
                                            HIP_EVENT_DEFAULT);
  AMDGPUDriver::get_instance().event_record((record.start_event), 0);
  event_records_.push_back(record);

  // Lazily calibrate the (base_event_, base_time_) pair used to map device
  // event offsets onto host wall-clock time.
  if (!base_event_) {
    int n_iters = 100;
    // Warm up
    for (int i = 0; i < n_iters; i++) {
      void *e;
      AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT);
      AMDGPUDriver::get_instance().event_record(e, 0);
      AMDGPUDriver::get_instance().event_synchronize(e);
      auto final_t = Time::get_time();
      if (i == n_iters - 1) {
        // Keep the last warmed-up event as the base reference.
        // NOTE(review): base_event_ is never event_destroy()ed in this file —
        // confirm it is released elsewhere or it leaks one event.
        base_event_ = e;
        // ignore the overhead of sync, event_create and systematic time offset.
        base_time_ = final_t;
      } else {
        AMDGPUDriver::get_instance().event_destroy(e);
      }
    }
  }
  return record.stop_event;
}

// Copy the timings measured for this batch of events into the profiler's
// traced records. `records_size_after_sync` is the number of records already
// finalized by earlier syncs; the pending event records map one-to-one onto
// traced_records[records_size_after_sync ..].
void EventToolkitAMDGPU::update_record(
    uint32_t records_size_after_sync,
    std::vector<KernelProfileTracedRecord> &traced_records) {
  uint32_t events_num = event_records_.size();
  uint32_t records_num = traced_records.size();
  // Every pending event record must correspond to exactly one newly traced
  // record. The previous message labeled the printed sum as
  // event_records_.size(), which made the diagnostic misleading.
  TI_ERROR_IF(
      records_size_after_sync + events_num != records_num,
      "KernelProfilerAMDGPU::EventToolkitAMDGPU: records_size_after_sync({}) "
      "+ event_records_.size({}) != traced_records_.size({})",
      records_size_after_sync, events_num, records_num);

  uint32_t idx = 0;
  for (auto &record : event_records_) {
    // copy to traced_records_ then clear event_records_
    auto &dst = traced_records[records_size_after_sync + idx];
    dst.kernel_elapsed_time_in_ms = record.kernel_elapsed_time_in_ms;
    dst.time_since_base = record.time_since_base;
    idx++;
  }
}

// Mirror the traced records onto the host-side timeline (when enabled),
// emitting one begin and one end event per kernel. Timestamps are
// reconstructed as base_time_ plus the device-measured millisecond offsets,
// converted to seconds (* 1e-3).
// NOTE(review): iterates ALL traced records, so kernels already emitted by a
// previous update() would be inserted again — confirm update() only reaches
// each record batch once.
void EventToolkitAMDGPU::update_timeline(
    std::vector<KernelProfileTracedRecord> &traced_records) {
  if (Timelines::get_instance().get_enabled()) {
    auto &timeline = Timeline::get_this_thread_instance();
    for (auto &record : traced_records) {
      timeline.insert_event({record.name, /*param_name=begin*/ true,
                             base_time_ + record.time_since_base * 1e-3,
                             "amdgpu"});
      timeline.insert_event({record.name, /*param_name=begin*/ false,
                             base_time_ + (record.time_since_base +
                                           record.kernel_elapsed_time_in_ms) *
                                              1e-3,
                             "amdgpu"});
    }
  }
}

#else

// Fallback stubs for builds without TI_WITH_AMDGPU: the event toolkit has no
// implementation and every entry point aborts via TI_NOT_IMPLEMENTED.
KernelProfilerBase::TaskHandle
EventToolkitAMDGPU::start_with_handle(const std::string &kernel_name) {
  TI_NOT_IMPLEMENTED;
}
void EventToolkitAMDGPU::update_record(
    uint32_t records_size_after_sync,
    std::vector<KernelProfileTracedRecord> &traced_records) {
  TI_NOT_IMPLEMENTED;
}
void EventToolkitAMDGPU::update_timeline(
    std::vector<KernelProfileTracedRecord> &traced_records) {
  TI_NOT_IMPLEMENTED;
}

#endif

} // namespace taichi::lang
Loading