Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[refactor] [CUDA] Add traced_records_ for KernelProfilerBase, refactoring KernelProfilerCUDA::sync() #2909

Merged
merged 4 commits into from
Sep 11, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 68 additions & 46 deletions taichi/backends/cuda/cuda_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@ std::string KernelProfilerCUDA::title() const {

KernelProfilerBase::TaskHandle KernelProfilerCUDA::start_with_handle(
const std::string &kernel_name) {
void *start, *stop;
CUDADriver::get_instance().event_create(&start, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_create(&stop, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_record(start, 0);
outstanding_events_[kernel_name].push_back(std::make_pair(start, stop));
CUDAEventRecord record;
record.name = kernel_name;

CUDADriver::get_instance().event_create(&(record.start_event),
CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_create(&(record.stop_event),
CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_record((record.start_event), 0);
event_records_.push_back(record);

if (!base_event_) {
// Note that CUDA driver API only allows querying relative time difference
Expand Down Expand Up @@ -47,7 +51,7 @@ KernelProfilerBase::TaskHandle KernelProfilerCUDA::start_with_handle(
}
}

return stop;
return record.stop_event;
}

void KernelProfilerCUDA::stop(KernelProfilerBase::TaskHandle handle) {
Expand All @@ -60,44 +64,61 @@ void KernelProfilerCUDA::record(KernelProfilerBase::TaskHandle &task_handle,
}

void KernelProfilerCUDA::sync() {
// sync
CUDADriver::get_instance().stream_synchronize(nullptr);
auto &timeline = Timeline::get_this_thread_instance();
for (auto &map_elem : outstanding_events_) {
auto &list = map_elem.second;
for (auto &item : list) {
auto start = item.first, stop = item.second;
float kernel_time;
CUDADriver::get_instance().event_elapsed_time(&kernel_time, start, stop);

if (Timelines::get_instance().get_enabled()) {
float time_since_base;
CUDADriver::get_instance().event_elapsed_time(&time_since_base,
base_event_, start);
timeline.insert_event({map_elem.first, true,
base_time_ + time_since_base * 1e-3, "cuda"});
timeline.insert_event(
{map_elem.first, false,
base_time_ + (time_since_base + kernel_time) * 1e-3, "cuda"});
}

auto it = std::find_if(
records.begin(), records.end(),
[&](KernelProfileRecord &r) { return r.name == map_elem.first; });
if (it == records.end()) {
records.emplace_back(map_elem.first);
it = std::prev(records.end());
}
it->insert_sample(kernel_time);
total_time_ms += kernel_time;

// TODO: the following two lines seem to increases profiler overhead a
// little bit. Is there a way to avoid the overhead while not creating
// too many events?
CUDADriver::get_instance().event_destroy(start);
CUDADriver::get_instance().event_destroy(stop);
// cuEvent : get traced kernel_elapsed_time
for (auto &record : event_records_) {
CUDADriver::get_instance().event_elapsed_time(
&record.kernel_elapsed_time_in_ms, record.start_event,
record.stop_event);
CUDADriver::get_instance().event_elapsed_time(
&record.time_since_base, base_event_, record.start_event);

// TODO: the following two lines seem to increases profiler overhead a
// little bit. Is there a way to avoid the overhead while not creating
// too many events?
CUDADriver::get_instance().event_destroy(record.start_event);
CUDADriver::get_instance().event_destroy(record.stop_event);

KernelProfileTracedRecord traced_record;
traced_record.name = record.name;
traced_record.kernel_elapsed_time_in_ms = record.kernel_elapsed_time_in_ms;
traced_record.time_since_base = record.time_since_base;
traced_records_.push_back(traced_record);
}

// statistics on traced_records_
for (auto &record : traced_records_) {
auto it =
std::find_if(statistical_results_.begin(), statistical_results_.end(),
[&](KernelProfileStatisticalResult &result) {
return result.name == record.name;
});
if (it == statistical_results_.end()) {
statistical_results_.emplace_back(record.name);
it = std::prev(statistical_results_.end());
}
it->insert_record(record.kernel_elapsed_time_in_ms);
total_time_ms_ += record.kernel_elapsed_time_in_ms;
}
outstanding_events_.clear();

// timeline
if (Timelines::get_instance().get_enabled()) {
auto &timeline = Timeline::get_this_thread_instance();
for (auto &record : traced_records_) {
timeline.insert_event({record.name, true,
yolo2themoon marked this conversation as resolved.
Show resolved Hide resolved
base_time_ + record.time_since_base * 1e-3,
"cuda"});
timeline.insert_event({record.name, false,
base_time_ + (record.time_since_base +
record.kernel_elapsed_time_in_ms) *
1e-3,
"cuda"});
}
}

event_records_.clear();
}

void KernelProfilerCUDA::print() {
Expand All @@ -109,9 +130,9 @@ void KernelProfilerCUDA::print() {
fmt::print(
"[ % total count | min avg max ] Kernel "
"name\n");
std::sort(records.begin(), records.end());
for (auto &rec : records) {
auto fraction = rec.total / total_time_ms * 100.0f;
std::sort(statistical_results_.begin(), statistical_results_.end());
for (auto &rec : statistical_results_) {
auto fraction = rec.total / total_time_ms_ * 100.0f;
fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
fraction, rec.total / 1000.0f, rec.counter, rec.min,
rec.total / rec.counter, rec.max, rec.name);
Expand All @@ -122,7 +143,7 @@ void KernelProfilerCUDA::print() {
fmt::print(
"[100.00%] Total kernel execution time: {:7.3f} s number of records: "
"{}\n",
get_total_time(), records.size());
get_total_time(), statistical_results_.size());

fmt::print(
"========================================================================"
Expand All @@ -131,8 +152,9 @@ void KernelProfilerCUDA::print() {

void KernelProfilerCUDA::clear() {
sync();
total_time_ms = 0;
records.clear();
total_time_ms_ = 0;
traced_records_.clear();
statistical_results_.clear();
}

#else
Expand Down
12 changes: 10 additions & 2 deletions taichi/backends/cuda/cuda_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@

TLANG_NAMESPACE_BEGIN

struct CUDAEventRecord {
std::string name;
float kernel_elapsed_time_in_ms{0.0};
float time_since_base{0.0};
void *start_event{nullptr};
void *stop_event{nullptr};
};

// A CUDA kernel profiler that uses CUDA timing events
class KernelProfilerCUDA : public KernelProfilerBase {
public:
Expand All @@ -25,8 +33,8 @@ class KernelProfilerCUDA : public KernelProfilerBase {
private:
void *base_event_{nullptr};
float64 base_time_{0.0};
std::map<std::string, std::vector<std::pair<void *, void *>>>
outstanding_events_;
// for cuEvent profiling, clear after sync()
std::vector<CUDAEventRecord> event_records_;
};

TLANG_NAMESPACE_END
40 changes: 22 additions & 18 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

TLANG_NAMESPACE_BEGIN

void KernelProfileRecord::insert_sample(double t) {
void KernelProfileStatisticalResult::insert_record(double t) {
if (counter == 0) {
min = t;
max = t;
Expand All @@ -18,7 +18,8 @@ void KernelProfileRecord::insert_sample(double t) {
total += t;
}

bool KernelProfileRecord::operator<(const KernelProfileRecord &o) const {
bool KernelProfileStatisticalResult::operator<(
const KernelProfileStatisticalResult &o) const {
return total > o.total;
}

Expand All @@ -42,9 +43,9 @@ void KernelProfilerBase::print() {
fmt::print(
"[ % total count | min avg max ] Kernel "
"name\n");
std::sort(records.begin(), records.end());
for (auto &rec : records) {
auto fraction = rec.total / total_time_ms * 100.0f;
std::sort(statistical_results_.begin(), statistical_results_.end());
for (auto &rec : statistical_results_) {
auto fraction = rec.total / total_time_ms_ * 100.0f;
fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
fraction, rec.total / 1000.0f, rec.counter, rec.min,
rec.total / rec.counter, rec.max, rec.name);
Expand All @@ -55,7 +56,7 @@ void KernelProfilerBase::print() {
fmt::print(
"[100.00%] Total kernel execution time: {:7.3f} s number of records: "
"{}\n",
get_total_time(), records.size());
get_total_time(), statistical_results_.size());

fmt::print(
"========================================================================"
Expand All @@ -69,7 +70,7 @@ void KernelProfilerBase::query(const std::string &kernel_name,
double &avg) {
sync();
std::regex name_regex(kernel_name + "(.*)");
for (auto &rec : records) {
for (auto &rec : statistical_results_) {
if (std::regex_match(rec.name, name_regex)) {
if (counter == 0) {
counter = rec.counter;
Expand All @@ -89,7 +90,7 @@ void KernelProfilerBase::query(const std::string &kernel_name,
}

double KernelProfilerBase::get_total_time() const {
return total_time_ms / 1000.0;
return total_time_ms_ / 1000.0;
}

namespace {
Expand All @@ -105,8 +106,9 @@ class DefaultProfiler : public KernelProfilerBase {

void clear() override {
sync();
total_time_ms = 0;
records.clear();
total_time_ms_ = 0;
traced_records_.clear();
statistical_results_.clear();
}

std::string title() const override {
Expand All @@ -121,15 +123,17 @@ class DefaultProfiler : public KernelProfilerBase {
void stop() override {
auto t = Time::get_time() - start_t_;
auto ms = t * 1000.0;
auto it = std::find_if(
records.begin(), records.end(),
[&](KernelProfileRecord &r) { return r.name == event_name_; });
if (it == records.end()) {
records.emplace_back(event_name_);
it = std::prev(records.end());
auto it =
std::find_if(statistical_results_.begin(), statistical_results_.end(),
[&](KernelProfileStatisticalResult &r) {
return r.name == event_name_;
});
if (it == statistical_results_.end()) {
statistical_results_.emplace_back(event_name_);
it = std::prev(statistical_results_.end());
}
it->insert_sample(ms);
total_time_ms += ms;
it->insert_record(ms);
total_time_ms_ += ms;
}

private:
Expand Down
20 changes: 14 additions & 6 deletions taichi/program/kernel_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,34 @@

TLANG_NAMESPACE_BEGIN

struct KernelProfileRecord {
struct KernelProfileTracedRecord {
std::string name;
float kernel_elapsed_time_in_ms{0.0};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: In the future we can use a more accurate type to denote time and duration, e.g. std::chrono::duration

float time_since_base{0.0}; // for Timeline
};

struct KernelProfileStatisticalResult {
std::string name;
int counter;
double min;
double max;
double total;

KernelProfileRecord(const std::string &name)
KernelProfileStatisticalResult(const std::string &name)
: name(name), counter(0), min(0), max(0), total(0) {
}

void insert_sample(double t);
void insert_record(double t); // TODO replace `double time` with
// `KernelProfileTracedRecord record`

bool operator<(const KernelProfileRecord &o) const;
bool operator<(const KernelProfileStatisticalResult &o) const;
};

class KernelProfilerBase {
protected:
std::vector<KernelProfileRecord> records;
double total_time_ms;
std::vector<KernelProfileTracedRecord> traced_records_;
std::vector<KernelProfileStatisticalResult> statistical_results_;
double total_time_ms_;
yolo2themoon marked this conversation as resolved.
Show resolved Hide resolved

public:
// Needed for the CUDA backend since we need to know which task to "stop"
Expand Down