From feb95e2471d74902c03fb6c43950b8b1e40737bb Mon Sep 17 00:00:00 2001
From: Cheng Li
Date: Thu, 23 Feb 2023 15:53:24 -0800
Subject: [PATCH 1/2] fix iteration timing when gas > 1

---
 deepspeed/runtime/engine.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 86daf283301b..60a01b911b54 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -2199,6 +2199,7 @@ def step(self, lr_kwargs=None):
         if flops_profiler_active:
             if self.autotuning_enabled():
                 self.flops = self.flops_profiler.get_total_flops() * 3
+                self.fwd_duration = self.flops_profiler.get_total_duration()
             else:
                 self.flops_profiler.print_model_profile(
                     profile_step=self.global_steps,
@@ -2252,11 +2253,13 @@ def _autotuning_exit(self):
             STEP_GLOBAL_TIMER,
         ],
                                      reset=False)
-        titer = msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] + msg[
-            STEP_GLOBAL_TIMER]
+        titer = (msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] + msg[
+            STEP_GLOBAL_TIMER]) * self.gradient_accumulation_steps()
         msg["latency"] = titer
         msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps(
         ) / titer
+        msg["num_flops"] = self.flops
+        msg["fwd_duration"] = self.fwd_duration
         msg["throughput"] = self.train_batch_size() * 1_000_000 / \
             msg["latency"]
         print_json_dist(msg, [0], path=self.autotuning_metric_path())

From 364c73c4d1bd31725960a30320eef659c27e3499 Mon Sep 17 00:00:00 2001
From: Cheng Li
Date: Thu, 23 Feb 2023 16:03:37 -0800
Subject: [PATCH 2/2] fix formatting

---
 deepspeed/runtime/engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 60a01b911b54..835266d39822 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -2253,8 +2253,8 @@ def _autotuning_exit(self):
             STEP_GLOBAL_TIMER,
         ],
                                      reset=False)
-        titer = (msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] + msg[
-            STEP_GLOBAL_TIMER]) * self.gradient_accumulation_steps()
+        titer = (msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] +
+                 msg[STEP_GLOBAL_TIMER]) * self.gradient_accumulation_steps()
         msg["latency"] = titer
         msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps(
         ) / titer
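
Note on the fix: the forward/backward/step timers report per-microbatch means, so when gradient accumulation steps (gas) > 1 the per-iteration latency must be scaled by that factor, which the original code omitted. The sketch below is not DeepSpeed code; it only illustrates the arithmetic the patched _autotuning_exit performs. The function name iteration_metrics and all numeric inputs are hypothetical.

    # Minimal sketch of the corrected autotuning metrics, assuming timer
    # values are per-microbatch means (units as reported by the timers).
    def iteration_metrics(fwd, bwd, step, flops, train_batch_size,
                          gradient_accumulation_steps):
        # Iteration latency: per-microbatch time scaled by the number of
        # gradient accumulation steps (the factor the patch adds).
        titer = (fwd + bwd + step) * gradient_accumulation_steps
        return {
            "latency": titer,
            # Same formula as the patch: flops is per optimizer step, so it is
            # scaled by gas before dividing by the iteration latency.
            "FLOPS_per_gpu": flops * 1_000_000 * gradient_accumulation_steps / titer,
            "throughput": train_batch_size * 1_000_000 / titer,
        }

    # Hypothetical example: gas = 4, each microbatch takes 10 + 20 + 5 time units.
    print(iteration_metrics(10.0, 20.0, 5.0, flops=1e12,
                            train_batch_size=32,
                            gradient_accumulation_steps=4))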