diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 68bd1d3516f9..769b52fb733c 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -2149,6 +2149,7 @@ def step(self, lr_kwargs=None): if flops_profiler_active: if self.autotuning_enabled(): self.flops = self.flops_profiler.get_total_flops() * 3 + self.fwd_duration = self.flops_profiler.get_total_duration() else: self.flops_profiler.print_model_profile( profile_step=self.global_steps, @@ -2203,7 +2204,7 @@ def _autotuning_exit(self): titer += msg[FORWARD_GLOBAL_TIMER] if FORWARD_GLOBAL_TIMER in msg else 0 titer += msg[BACKWARD_GLOBAL_TIMER] if BACKWARD_GLOBAL_TIMER in msg else 0 titer += msg[STEP_GLOBAL_TIMER] if STEP_GLOBAL_TIMER in msg else 0 - + titer *= self.gradient_accumulation_steps() msg["latency"] = titer msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps() / titer msg["throughput"] = self.train_batch_size() * 1_000_000 / \