From 25c77a8b1484d7aef4f1db6a467befd198f0937c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 19 Oct 2018 01:14:13 +0000 Subject: [PATCH] follow comments --- doc/fluid/howto/optimization/timeline_cn.md | 21 ++++++++++++++++++ doc/fluid/howto/optimization/timeline_en.md | 24 +++++++++++++++++++-- doc/fluid/howto/performance/profiler.md | 19 ++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md index baf5da6bb5e66c..9f9303f46770a1 100644 --- a/doc/fluid/howto/optimization/timeline_cn.md +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -1,5 +1,7 @@ # timeline工具简介 +## 本地使用 + 1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 **提示:** @@ -30,3 +32,22 @@ python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=time 1. 结果如下图所示,可以放到来查看timetime的细节信息。 ![chrome timeline](./timeline.jpeg) + +## 分布式使用 +一般来说,分布式的训练程序都会有两种程序:pserver和trainer。我们提供了把pserver和trainer的profile日志用timeline来显示的方式。 + +1. trainer打开方式与[本地使用](#本地使用)部分的第1步相同 + +2. pserver可以通过加两个环境变量打开profile,例如: +``` +FLAGS_rpc_server_profile_period=10 FLAGS_rpc_server_profile_path=./tmp/pserver python train.py +``` + +3. 把pserver和trainer的profile文件生成一个timeline文件,例如: +``` +python /paddle/tools/timeline.py + --profile_path trainer0=local_profile_10_pass0_0,trainer1=local_profile_10_pass0_1,pserver0=./pserver_0,pserver1=./pserver_1 + --timeline_path ./dist.timeline +``` + +4. 在chrome中加载dist.timeline文件,方法和[本地使用](#本地使用)第4步相同。 diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md index 6f963c6b4da696..673452efe0ef4e 100644 --- a/doc/fluid/howto/optimization/timeline_en.md +++ b/doc/fluid/howto/optimization/timeline_en.md @@ -1,4 +1,6 @@ -# how to use timeline tool to do profile +# How to use timeline tool to do profile + +## Local 1. 
Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. @@ -18,7 +20,6 @@ 1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details. - ```python python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline ``` @@ -31,3 +32,22 @@ python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=time ![chrome timeline](./timeline.jpeg) + +## Distributed +This tool can support distributed train programs (pserver and trainer) too. + +1. Open the trainer profiler just like in [local](#local). + +1. Open the pserver profiler: add some environment variables, e.g.: +``` +FLAGS_rpc_server_profile_period=10 FLAGS_rpc_server_profile_path=./tmp/pserver python train.py +``` + +1. Merge pservers' and trainers' profiler files, e.g.: +``` +python /paddle/tools/timeline.py + --profile_path trainer0=local_profile_10_pass0_0,trainer1=local_profile_10_pass0_1,pserver0=./pserver_0,pserver1=./pserver_1 + --timeline_path ./dist.timeline +``` + +1. Load `dist.timeline` in chrome://tracing diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md index 1a5b8939a150d0..e38abebdc46f02 100644 --- a/doc/fluid/howto/performance/profiler.md +++ b/doc/fluid/howto/performance/profiler.md @@ -95,3 +95,22 @@ struct RecordEvent { } }; ``` + +### Report sample + +``` +Event Calls Total Min. Max. Ave. Ratio. 
+thread101::deserial 1410 392.302 0.032768 14.1058 0.278228 0.00117247 +thread100::GetRPC 11 2951.13 7.60675 1426.75 268.284 0.00882 +thread100::serial 14 75.3212 0.07584 36.2135 5.38009 0.000225112 +thread100::SendRPC 14 13.9494 0.003072 3.97517 0.996389 4.16905e-05 +thread99::GetRPC 15 3012.62 2.79062 1426.61 200.841 0.00900378 +... +thread0::matmul_grad 1480 3674.28 0.375808 181.608 2.48262 0.0109813 +thread0::matmul 1480 3365.82 0.196608 172.256 2.2742 0.0100594 +thread0::mul_grad 3840 3167.39 0.411648 3.33824 0.82484 0.00946633 +thread0::fetch_barrier 5 3082.82 354.385 1617.88 616.564 0.00921359 +thread0::dropout 2480 3014.05 0.201728 6.76454 1.21534 0.00900807 +``` + +Note: the profiler can merge the time of the same operator that runs multiple times in the same thread. \ No newline at end of file