Skip to content

Commit

Permalink
[refactor] [CUDA] Add traced_records_ for KernelProfilerBase, refacto…
Browse files Browse the repository at this point in the history
…ring KernelProfilerCUDA::sync() (#2909)

* [refactor] [CUDA] Add traced_records_ for KernelProfilerBase, refactoring KernelProfilerCUDA::sync()

* Auto Format

* add /*param_name=*/, init total_time_ms_

* Auto Format

Co-authored-by: Taichi Gardener <[email protected]>
  • Loading branch information
yolo2themoon and taichi-gardener authored Sep 11, 2021
1 parent 619f94a commit 36d9590
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 72 deletions.
117 changes: 71 additions & 46 deletions taichi/backends/cuda/cuda_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@ std::string KernelProfilerCUDA::title() const {

KernelProfilerBase::TaskHandle KernelProfilerCUDA::start_with_handle(
const std::string &kernel_name) {
void *start, *stop;
CUDADriver::get_instance().event_create(&start, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_create(&stop, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_record(start, 0);
outstanding_events_[kernel_name].push_back(std::make_pair(start, stop));
CUDAEventRecord record;
record.name = kernel_name;

CUDADriver::get_instance().event_create(&(record.start_event),
CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_create(&(record.stop_event),
CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_record((record.start_event), 0);
event_records_.push_back(record);

if (!base_event_) {
// Note that CUDA driver API only allows querying relative time difference
Expand Down Expand Up @@ -47,7 +51,7 @@ KernelProfilerBase::TaskHandle KernelProfilerCUDA::start_with_handle(
}
}

return stop;
return record.stop_event;
}

void KernelProfilerCUDA::stop(KernelProfilerBase::TaskHandle handle) {
Expand All @@ -60,44 +64,64 @@ void KernelProfilerCUDA::record(KernelProfilerBase::TaskHandle &task_handle,
}

void KernelProfilerCUDA::sync() {
// sync
CUDADriver::get_instance().stream_synchronize(nullptr);
auto &timeline = Timeline::get_this_thread_instance();
for (auto &map_elem : outstanding_events_) {
auto &list = map_elem.second;
for (auto &item : list) {
auto start = item.first, stop = item.second;
float kernel_time;
CUDADriver::get_instance().event_elapsed_time(&kernel_time, start, stop);

if (Timelines::get_instance().get_enabled()) {
float time_since_base;
CUDADriver::get_instance().event_elapsed_time(&time_since_base,
base_event_, start);
timeline.insert_event({map_elem.first, true,
base_time_ + time_since_base * 1e-3, "cuda"});
timeline.insert_event(
{map_elem.first, false,
base_time_ + (time_since_base + kernel_time) * 1e-3, "cuda"});
}

auto it = std::find_if(
records.begin(), records.end(),
[&](KernelProfileRecord &r) { return r.name == map_elem.first; });
if (it == records.end()) {
records.emplace_back(map_elem.first);
it = std::prev(records.end());
}
it->insert_sample(kernel_time);
total_time_ms += kernel_time;

// TODO: the following two lines seem to increases profiler overhead a
// little bit. Is there a way to avoid the overhead while not creating
// too many events?
CUDADriver::get_instance().event_destroy(start);
CUDADriver::get_instance().event_destroy(stop);
// cuEvent : get kernel_elapsed_time
for (auto &record : event_records_) {
CUDADriver::get_instance().event_elapsed_time(
&record.kernel_elapsed_time_in_ms, record.start_event,
record.stop_event);
CUDADriver::get_instance().event_elapsed_time(
&record.time_since_base, base_event_, record.start_event);

// TODO: the following two lines seem to increases profiler overhead a
// little bit. Is there a way to avoid the overhead while not creating
// too many events?
CUDADriver::get_instance().event_destroy(record.start_event);
CUDADriver::get_instance().event_destroy(record.stop_event);

// copy to traced_records_ then clear event_records_
KernelProfileTracedRecord traced_record;
traced_record.name = record.name;
traced_record.kernel_elapsed_time_in_ms = record.kernel_elapsed_time_in_ms;
traced_record.time_since_base = record.time_since_base;
traced_records_.push_back(traced_record);
}

// statistics on traced_records_
for (auto &record : traced_records_) {
auto it =
std::find_if(statistical_results_.begin(), statistical_results_.end(),
[&](KernelProfileStatisticalResult &result) {
return result.name == record.name;
});
if (it == statistical_results_.end()) {
statistical_results_.emplace_back(record.name);
it = std::prev(statistical_results_.end());
}
it->insert_record(record.kernel_elapsed_time_in_ms);
total_time_ms_ += record.kernel_elapsed_time_in_ms;
}
outstanding_events_.clear();

// timeline
if (Timelines::get_instance().get_enabled()) {
auto &timeline = Timeline::get_this_thread_instance();
for (auto &record : traced_records_) {
// param of insert_event() :
// struct TimelineEvent @ taichi/taichi/system/timeline.h
timeline.insert_event({record.name, /*param_name=begin*/ true,
base_time_ + record.time_since_base * 1e-3,
"cuda"});
timeline.insert_event({record.name, /*param_name=begin*/ false,
base_time_ + (record.time_since_base +
record.kernel_elapsed_time_in_ms) *
1e-3,
"cuda"});
}
}

event_records_.clear();
}

void KernelProfilerCUDA::print() {
Expand All @@ -109,9 +133,9 @@ void KernelProfilerCUDA::print() {
fmt::print(
"[ % total count | min avg max ] Kernel "
"name\n");
std::sort(records.begin(), records.end());
for (auto &rec : records) {
auto fraction = rec.total / total_time_ms * 100.0f;
std::sort(statistical_results_.begin(), statistical_results_.end());
for (auto &rec : statistical_results_) {
auto fraction = rec.total / total_time_ms_ * 100.0f;
fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
fraction, rec.total / 1000.0f, rec.counter, rec.min,
rec.total / rec.counter, rec.max, rec.name);
Expand All @@ -122,7 +146,7 @@ void KernelProfilerCUDA::print() {
fmt::print(
"[100.00%] Total kernel execution time: {:7.3f} s number of records: "
"{}\n",
get_total_time(), records.size());
get_total_time(), statistical_results_.size());

fmt::print(
"========================================================================"
Expand All @@ -131,8 +155,9 @@ void KernelProfilerCUDA::print() {

void KernelProfilerCUDA::clear() {
sync();
total_time_ms = 0;
records.clear();
total_time_ms_ = 0;
traced_records_.clear();
statistical_results_.clear();
}

#else
Expand Down
12 changes: 10 additions & 2 deletions taichi/backends/cuda/cuda_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@

TLANG_NAMESPACE_BEGIN

struct CUDAEventRecord {
std::string name;
float kernel_elapsed_time_in_ms{0.0};
float time_since_base{0.0};
void *start_event{nullptr};
void *stop_event{nullptr};
};

// A CUDA kernel profiler that uses CUDA timing events
class KernelProfilerCUDA : public KernelProfilerBase {
public:
Expand All @@ -25,8 +33,8 @@ class KernelProfilerCUDA : public KernelProfilerBase {
private:
void *base_event_{nullptr};
float64 base_time_{0.0};
std::map<std::string, std::vector<std::pair<void *, void *>>>
outstanding_events_;
// for cuEvent profiling, clear after sync()
std::vector<CUDAEventRecord> event_records_;
};

TLANG_NAMESPACE_END
40 changes: 22 additions & 18 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

TLANG_NAMESPACE_BEGIN

void KernelProfileRecord::insert_sample(double t) {
void KernelProfileStatisticalResult::insert_record(double t) {
if (counter == 0) {
min = t;
max = t;
Expand All @@ -18,7 +18,8 @@ void KernelProfileRecord::insert_sample(double t) {
total += t;
}

bool KernelProfileRecord::operator<(const KernelProfileRecord &o) const {
bool KernelProfileStatisticalResult::operator<(
const KernelProfileStatisticalResult &o) const {
return total > o.total;
}

Expand All @@ -42,9 +43,9 @@ void KernelProfilerBase::print() {
fmt::print(
"[ % total count | min avg max ] Kernel "
"name\n");
std::sort(records.begin(), records.end());
for (auto &rec : records) {
auto fraction = rec.total / total_time_ms * 100.0f;
std::sort(statistical_results_.begin(), statistical_results_.end());
for (auto &rec : statistical_results_) {
auto fraction = rec.total / total_time_ms_ * 100.0f;
fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
fraction, rec.total / 1000.0f, rec.counter, rec.min,
rec.total / rec.counter, rec.max, rec.name);
Expand All @@ -55,7 +56,7 @@ void KernelProfilerBase::print() {
fmt::print(
"[100.00%] Total kernel execution time: {:7.3f} s number of records: "
"{}\n",
get_total_time(), records.size());
get_total_time(), statistical_results_.size());

fmt::print(
"========================================================================"
Expand All @@ -69,7 +70,7 @@ void KernelProfilerBase::query(const std::string &kernel_name,
double &avg) {
sync();
std::regex name_regex(kernel_name + "(.*)");
for (auto &rec : records) {
for (auto &rec : statistical_results_) {
if (std::regex_match(rec.name, name_regex)) {
if (counter == 0) {
counter = rec.counter;
Expand All @@ -89,7 +90,7 @@ void KernelProfilerBase::query(const std::string &kernel_name,
}

double KernelProfilerBase::get_total_time() const {
return total_time_ms / 1000.0;
return total_time_ms_ / 1000.0;
}

namespace {
Expand All @@ -105,8 +106,9 @@ class DefaultProfiler : public KernelProfilerBase {

void clear() override {
sync();
total_time_ms = 0;
records.clear();
total_time_ms_ = 0;
traced_records_.clear();
statistical_results_.clear();
}

std::string title() const override {
Expand All @@ -121,15 +123,17 @@ class DefaultProfiler : public KernelProfilerBase {
void stop() override {
auto t = Time::get_time() - start_t_;
auto ms = t * 1000.0;
auto it = std::find_if(
records.begin(), records.end(),
[&](KernelProfileRecord &r) { return r.name == event_name_; });
if (it == records.end()) {
records.emplace_back(event_name_);
it = std::prev(records.end());
auto it =
std::find_if(statistical_results_.begin(), statistical_results_.end(),
[&](KernelProfileStatisticalResult &r) {
return r.name == event_name_;
});
if (it == statistical_results_.end()) {
statistical_results_.emplace_back(event_name_);
it = std::prev(statistical_results_.end());
}
it->insert_sample(ms);
total_time_ms += ms;
it->insert_record(ms);
total_time_ms_ += ms;
}

private:
Expand Down
20 changes: 14 additions & 6 deletions taichi/program/kernel_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,34 @@

TLANG_NAMESPACE_BEGIN

struct KernelProfileRecord {
struct KernelProfileTracedRecord {
std::string name;
float kernel_elapsed_time_in_ms{0.0};
float time_since_base{0.0}; // for Timeline
};

struct KernelProfileStatisticalResult {
std::string name;
int counter;
double min;
double max;
double total;

KernelProfileRecord(const std::string &name)
KernelProfileStatisticalResult(const std::string &name)
: name(name), counter(0), min(0), max(0), total(0) {
}

void insert_sample(double t);
void insert_record(double t); // TODO replace `double time` with
// `KernelProfileTracedRecord record`

bool operator<(const KernelProfileRecord &o) const;
bool operator<(const KernelProfileStatisticalResult &o) const;
};

class KernelProfilerBase {
protected:
std::vector<KernelProfileRecord> records;
double total_time_ms;
std::vector<KernelProfileTracedRecord> traced_records_;
std::vector<KernelProfileStatisticalResult> statistical_results_;
double total_time_ms_{0};

public:
// Needed for the CUDA backend since we need to know which task to "stop"
Expand Down

0 comments on commit 36d9590

Please sign in to comment.