Performance logging (#385)
Summary:
This changes ClassificationTask to compute some high-level performance
numbers (img/sec) and plot them in TensorBoard. This is useful for comparing
performance optimizations since we now get a "blessed" performance number.

Also, this was done in a way that's comparable to NVIDIA's benchmarks (e.g. https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch/performance), so we know how well we're doing compared to other implementations.

In terms of implementation, I could have made this a hook instead, but decided against it for two reasons: (1) it would introduce dependencies between hooks; (2) we want to control precisely when the timing measurements are taken.
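
For context, the img/sec number reported here is simply (global batch size * batches per phase) / phase duration, as computed in log_phase_end below. A quick sanity check with hypothetical numbers (not taken from any real run):

# Hypothetical example of the throughput formula used in log_phase_end.
global_batchsize = 256       # e.g. 32 images/GPU across 8 GPUs (assumed)
num_batches_per_phase = 100  # assumed
phase_duration = 80.0        # seconds, assumed

im_per_sec = (global_batchsize * num_batches_per_phase) / phase_duration
print(im_per_sec)  # 320.0 images/sec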

Pull Request resolved: #385

Test Plan: ./classy_train.py --config configs/template_config.json

Reviewed By: mannatsingh

Differential Revision: D19739656

Pulled By: vreis

fbshipit-source-id: a63c394308851e6accee9d260d9cb1d972f33a7f
vreis authored and facebook-github-bot committed Feb 11, 2020
1 parent 7c25113 commit 0753b0d
Showing 2 changed files with 49 additions and 0 deletions.
14 changes: 14 additions & 0 deletions classy_vision/hooks/tensorboard_plot_hook.py
@@ -140,6 +140,20 @@ def on_phase_end(
                )
                continue

        if hasattr(task, "perf_log"):
            for perf in task.perf_log:
                phase_idx = perf["phase_idx"]
                tag = perf["tag"]
                for metric_name, metric_value in perf.items():
                    if metric_name in ["phase_idx", "tag"]:
                        continue

                    self.tb_writer.add_scalar(
                        f"Performance/{tag}/{metric_name}",
                        metric_value,
                        global_step=phase_idx,
                    )

        # flush so that the plots aren't lost if training crashes soon after
        self.tb_writer.flush()
        logging.info(f"Done plotting to Tensorboard")
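
To make the resulting plot layout concrete, here is a minimal, self-contained sketch of how one perf_log entry maps to TensorBoard scalar tags. The entry values are hypothetical, and it uses torch.utils.tensorboard directly rather than the ClassyVision hook machinery:

from torch.utils.tensorboard import SummaryWriter

# A hypothetical perf_log entry, shaped like the dicts built in log_phase_end.
perf = {"tag": "train", "phase_idx": 3, "epoch_duration": 80.0, "im_per_sec": 320.0}

writer = SummaryWriter(log_dir="/tmp/perf_log_sketch")  # arbitrary path
for metric_name, metric_value in perf.items():
    if metric_name in ["phase_idx", "tag"]:
        continue
    # Yields Performance/train/epoch_duration and Performance/train/im_per_sec,
    # both plotted at global step 3 (the phase index).
    writer.add_scalar(
        f"Performance/{perf['tag']}/{metric_name}",
        metric_value,
        global_step=perf["phase_idx"],
    )
writer.flush()
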
35 changes: 35 additions & 0 deletions classy_vision/tasks/classification_task.py
Expand Up @@ -7,6 +7,7 @@
import copy
import enum
import logging
import time
from typing import Any, Dict, List, Optional, Union

import torch
@@ -93,6 +94,7 @@ class ClassificationTask(ClassyTask):
        by the optimizer
    :var data_iterator: Iterator which can be used to obtain batches
    :var losses: Loss curve
    :var perf_log: list of training speed measurements, to be logged
    """

@@ -122,6 +124,7 @@ def __init__(self):
            BroadcastBuffersMode.DISABLED
        )
        self.amp_opt_level = None
        self.perf_log = []

    def set_checkpoint(self, checkpoint):
        """Sets checkpoint on task.
@@ -809,17 +812,49 @@ def on_start(self, local_variables):
        self.run_hooks(local_variables, ClassyHookFunctions.on_start.name)

    def on_phase_start(self, local_variables):
        self.phase_start_time_total = time.perf_counter()

        self.advance_phase()

        self.run_hooks(local_variables, ClassyHookFunctions.on_phase_start.name)

        self.phase_start_time_train = time.perf_counter()

    def on_phase_end(self, local_variables):
        self.log_phase_end("train")

        logging.info("Syncing meters on phase end...")
        for meter in self.meters:
            meter.sync_state()
        logging.info("...meters synced")
        barrier()

        self.run_hooks(local_variables, ClassyHookFunctions.on_phase_end.name)
        self.perf_log = []

        self.log_phase_end("total")

    def on_end(self, local_variables):
        self.run_hooks(local_variables, ClassyHookFunctions.on_end.name)

    def log_phase_end(self, tag):
        if not self.train:
            return

        start_time = (
            self.phase_start_time_train
            if tag == "train"
            else self.phase_start_time_total
        )
        phase_duration = time.perf_counter() - start_time
        im_per_sec = (
            self.get_global_batchsize() * self.num_batches_per_phase
        ) / phase_duration
        self.perf_log.append(
            {
                "tag": tag,
                "phase_idx": self.train_phase_idx,
                "epoch_duration": phase_duration,
                "im_per_sec": im_per_sec,
            }
        )

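Reading the two timestamps together: the "train" window opens only after advance_phase and the on_phase_start hooks have run, while the "total" window spans the whole phase, including setup, meter syncing, and the on_phase_end hooks. A stripped-down sketch of the sequencing (illustrative only, not the ClassyVision API):

import time

phase_start_time_total = time.perf_counter()
# ... advance_phase(): build the data iterator, prepare the phase ...
# ... on_phase_start hooks run here ...
phase_start_time_train = time.perf_counter()

# ... training loop over every batch in the phase ...

train_duration = time.perf_counter() - phase_start_time_train  # "train" tag
# ... meter sync, barrier, on_phase_end hooks run here ...
total_duration = time.perf_counter() - phase_start_time_total  # "total" tag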