Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/memory profiler #7983

Closed
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

cc_library(profiler SRCS profiler.cc DEPS device_context)
cc_library(profiler SRCS profiler.cc DEPS device_context paddle_memory)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

if(NOT WITH_C_API AND WITH_FLUID)
Expand Down
85 changes: 73 additions & 12 deletions paddle/platform/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
}
#endif
cpu_ns_ = GetTimeInNsec();

if (has_cuda_) {
memory_used_chars_ = memory::memory_usage(dev_ctx->GetPlace());
} else {
memory_used_chars_ = memory::memory_usage(platform::CPUPlace());
}
}

std::string Event::kind() const {
Expand Down Expand Up @@ -92,6 +98,10 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif
}

// Returns the change in memory usage between this event and `e`, in MiB.
// `memory_used_chars_` is a byte count sampled at event construction; the
// difference may be negative if memory was freed between the two events.
double Event::MemoryUsed(const Event& e) const {
  // Cast before dividing: the original integer division truncated any
  // sub-MiB delta to zero before the implicit conversion to double.
  return static_cast<double>(e.memory_used_chars_ - memory_used_chars_) /
         (1024.0 * 1024.0);
}

#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
Expand Down Expand Up @@ -257,14 +267,19 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
double event_time = (g_profiler_place == "CUDA")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);

double event_memory_used = rit->MemoryUsed(events[i][j]);
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size());

if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
EventItem event_item = {event_name, 1,
event_time, event_time,
event_time, event_time,
event_memory_used, event_memory_used,
event_memory_used, event_memory_used};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
Expand All @@ -277,6 +292,17 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);

// total memory used
event_items[index].total_memory_used += event_memory_used;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

total_time -> total_memory_used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


// min memory used
event_items[index].min_memory_used =
std::min(event_memory_used, event_items[index].min_memory_used);

// max memory used
event_items[index].max_memory_used =
std::max(event_memory_used, event_items[index].max_memory_used);
}

// remove the push marker from the list
Expand All @@ -288,9 +314,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}
}
}
// average time
// average time and memory used
for (auto& item : event_items) {
item.ave_time = item.total_time / item.calls;
item.ave_memory_used = item.total_memory_used / item.calls;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think item.ave_memory_used is necessary.
average memory used is confusing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tend to count the memory used in the most case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

deleted.

}
// sort
if (sorted_by != EventSortingKey::kDefault) {
Expand All @@ -314,29 +341,63 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
double app_total_time = .0, app_total_memory = .0;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];

app_total_time += event_item.total_time;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not correct to count the total time like this. There may be overlap between different events, also there may be gap between the end time of one event and the start time of the next.

app_total_memory += event_item.total_memory_used;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please explain how to obtain the memory occupation for every operator? I feel that the memory may be malloced and freed frequently, and often be reused. And total_memory_used calculated in this way would be meaningless.

}
}

// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";

// clang-format off
std::cout << "Place: " << g_profiler_place << "\t"
<< "Total Time:" << app_total_time << "ms" << "\t"
<< "Total Memory:" << app_total_memory << "MB" << "\t"
<< "Sorted by " << sorted_domain
<< " in descending order in the same thread\n"
<< std::endl;
// clang-format on

// Output events table
// clang-format off
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
std::cout << std::setw(name_width) << "Event"
<< std::setw(data_width) << "Calls"
<< std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min."
<< std::setw(data_width) << "Max."
<< std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Total Memory."
<< std::setw(data_width) << "Min Memory."
<< std::setw(data_width) << "Max Memory."
<< std::setw(data_width) << "Ave Memory."
<< std::endl;
// clang-format on

for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];

// clang-format off
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.total_memory_used
<< std::setw(data_width) << event_item.min_memory_used
<< std::setw(data_width) << event_item.max_memory_used
<< std::setw(data_width) << event_item.ave_memory_used
<< std::endl;
// clang-format on
}
}
std::cout << std::endl;
Expand Down
9 changes: 9 additions & 0 deletions paddle/platform/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <forward_list>
#include <list>
#include <mutex>
#include <vector>

#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"

namespace paddle {
Expand All @@ -43,12 +46,14 @@ class Event {

double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;
double MemoryUsed(const Event& e) const;

private:
EventKind kind_;
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
int64_t memory_used_chars_;
bool has_cuda_;
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_ = nullptr;
Expand Down Expand Up @@ -124,6 +129,10 @@ struct EventItem {
double min_time;
double max_time;
double ave_time;
double total_memory_used;
double min_memory_used;
double max_memory_used;
double ave_memory_used;
};

// Candidate keys to sort the profiling report
Expand Down
27 changes: 17 additions & 10 deletions paddle/platform/profiler_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,20 @@ limitations under the License. */
// Verifies that two CPU events created around a busy-wait report a
// strictly positive elapsed time and that a CPU context is not CUDA.
TEST(Event, CpuElapsedTime) {
  using paddle::platform::CPUDeviceContext;
  using paddle::platform::Event;
  using paddle::platform::EventKind;

  // Stack-allocate the context: no manual new/delete, so it cannot leak
  // regardless of how the test exits.
  CPUDeviceContext dev_ctx;

  Event start_event(EventKind::kPushRange, "test", 0, &dev_ctx);
  EXPECT_TRUE(start_event.has_cuda() == false);
  // Spin briefly so the measured CPU interval is strictly positive.
  int counter = 0;
  while (counter != 1000) {
    counter++;
  }
  Event stop_event(EventKind::kPopRange, "test", 0, &dev_ctx);
  EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
}

#ifdef PADDLE_WITH_CUDA
Expand All @@ -46,27 +51,26 @@ TEST(Event, CudaElapsedTime) {
}
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);

delete dev_ctx;
}
#endif

TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext;
using paddle::platform::Event;
using paddle::platform::EventKind;
using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;
using namespace paddle::platform;

ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA;
dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#else
dev_ctx = new CPUDeviceContext();
#endif

EnableProfiler(state);
PADDLE_ENFORCE_NOT_NULL(dev_ctx);

/* Usage 1:
* PushEvent(evt_name, dev_ctx);
Expand All @@ -75,6 +79,7 @@ TEST(RecordEvent, RecordEvent) {
* ...
* PopEvent(evt_name, dev_ctx);
*/

for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
Expand Down Expand Up @@ -121,6 +126,8 @@ TEST(RecordEvent, RecordEvent) {
}
}
}

delete dev_ctx;
EXPECT_EQ(cuda_startup_count % 5, 0);
EXPECT_EQ(start_profiler_count, 1);

Expand Down