diff --git a/classy_vision/hooks/classy_hook.py b/classy_vision/hooks/classy_hook.py index 10fca71069..6ee11a5e0c 100644 --- a/classy_vision/hooks/classy_hook.py +++ b/classy_vision/hooks/classy_hook.py @@ -51,7 +51,7 @@ def __init__(self, a, b): def __init__(self): self.state = ClassyHookState() - def _noop(self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]) -> None: + def _noop(self, *args, **kwargs) -> None: """Derived classes can set their hook functions to this. This is useful if they want those hook functions to not do anything. @@ -79,9 +79,7 @@ def on_phase_start( pass @abstractmethod - def on_step( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def on_step(self, task: "tasks.ClassyTask") -> None: """Called each time after parameters have been updated by the optimizer.""" pass diff --git a/classy_vision/hooks/exponential_moving_average_model_hook.py b/classy_vision/hooks/exponential_moving_average_model_hook.py index d0c72a35ae..409cf0e910 100644 --- a/classy_vision/hooks/exponential_moving_average_model_hook.py +++ b/classy_vision/hooks/exponential_moving_average_model_hook.py @@ -103,7 +103,7 @@ def on_phase_end(self, task: ClassyTask, local_variables: Dict[str, Any]) -> Non # state in the test phase self._save_current_model_state(task.base_model, self.state.model_state) - def on_step(self, task: ClassyTask, local_variables: Dict[str, Any]) -> None: + def on_step(self, task: ClassyTask) -> None: if not task.train: return diff --git a/classy_vision/hooks/loss_lr_meter_logging_hook.py b/classy_vision/hooks/loss_lr_meter_logging_hook.py index 33cb283024..9de42d55fa 100644 --- a/classy_vision/hooks/loss_lr_meter_logging_hook.py +++ b/classy_vision/hooks/loss_lr_meter_logging_hook.py @@ -45,13 +45,11 @@ def on_phase_end( # trainer to implement an unsynced end of phase meter or # for meters to not provide a sync function. logging.info("End of phase metric values:") - self._log_loss_meters(task, local_variables) + self._log_loss_meters(task) if task.train: - self._log_lr(task, local_variables) + self._log_lr(task) - def on_step( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def on_step(self, task: "tasks.ClassyTask") -> None: """ Log the LR every log_freq batches, if log_freq is not None. """ @@ -59,22 +57,18 @@ def on_step( return batches = len(task.losses) if batches and batches % self.log_freq == 0: - self._log_lr(task, local_variables) + self._log_lr(task) logging.info("Local unsynced metric values:") - self._log_loss_meters(task, local_variables) + self._log_loss_meters(task) - def _log_lr( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def _log_lr(self, task: "tasks.ClassyTask") -> None: """ Compute and log the optimizer LR. """ optimizer_lr = task.optimizer.parameters.lr logging.info("Learning Rate: {}\n".format(optimizer_lr)) - def _log_loss_meters( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def _log_loss_meters(self, task: "tasks.ClassyTask") -> None: """ Compute and log the loss and meters. """ diff --git a/classy_vision/hooks/progress_bar_hook.py b/classy_vision/hooks/progress_bar_hook.py index bdecbc6c9f..fc5dfde405 100644 --- a/classy_vision/hooks/progress_bar_hook.py +++ b/classy_vision/hooks/progress_bar_hook.py @@ -51,9 +51,7 @@ def on_phase_start( self.progress_bar = progressbar.ProgressBar(self.bar_size) self.progress_bar.start() - def on_step( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def on_step(self, task: "tasks.ClassyTask") -> None: """Update the progress bar with the batch size.""" if task.train and is_master() and self.progress_bar is not None: self.batches += 1 diff --git a/classy_vision/hooks/tensorboard_plot_hook.py b/classy_vision/hooks/tensorboard_plot_hook.py index e86864e313..6abeec04ba 100644 --- a/classy_vision/hooks/tensorboard_plot_hook.py +++ b/classy_vision/hooks/tensorboard_plot_hook.py @@ -64,9 +64,7 @@ def on_phase_start( self.wall_times = [] self.num_steps_global = [] - def on_step( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def on_step(self, task: "tasks.ClassyTask") -> None: """Store the observed learning rates.""" if self.learning_rates is None: logging.warning("learning_rates is not initialized") diff --git a/classy_vision/hooks/time_metrics_hook.py b/classy_vision/hooks/time_metrics_hook.py index 618e3fbd35..475282a699 100644 --- a/classy_vision/hooks/time_metrics_hook.py +++ b/classy_vision/hooks/time_metrics_hook.py @@ -40,11 +40,9 @@ def on_phase_start( Initialize start time and reset perf stats """ self.start_time = time.time() - local_variables["perf_stats"] = PerfStats() + task.perf_stats = PerfStats() - def on_step( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def on_step(self, task: "tasks.ClassyTask") -> None: """ Log metrics every log_freq batches, if log_freq is not None. """ @@ -52,7 +50,7 @@ def on_step( return batches = len(task.losses) if batches and batches % self.log_freq == 0: - self._log_performance_metrics(task, local_variables) + self._log_performance_metrics(task) def on_phase_end( self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] @@ -62,11 +60,9 @@ def on_phase_end( """ batches = len(task.losses) if batches: - self._log_performance_metrics(task, local_variables) + self._log_performance_metrics(task) - def _log_performance_metrics( - self, task: "tasks.ClassyTask", local_variables: Dict[str, Any] - ) -> None: + def _log_performance_metrics(self, task: "tasks.ClassyTask") -> None: """ Compute and log performance metrics. """ @@ -85,11 +81,11 @@ def _log_performance_metrics( ) # Train step time breakdown - if local_variables.get("perf_stats") is None: - logging.warning('"perf_stats" not set in local_variables') + if not hasattr(task, "perf_stats") or task.perf_stats is None: + logging.warning('"perf_stats" not set in task') elif task.train: logging.info( "Train step time breakdown (rank {}):\n{}".format( - get_rank(), local_variables["perf_stats"].report_str() + get_rank(), task.perf_stats.report_str() ) ) diff --git a/classy_vision/tasks/classification_task.py b/classy_vision/tasks/classification_task.py index c54c3ca7d5..b476a43e26 100644 --- a/classy_vision/tasks/classification_task.py +++ b/classy_vision/tasks/classification_task.py @@ -8,7 +8,7 @@ import enum import logging import time -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, NamedTuple, Optional, Union import torch from classy_vision.dataset import ClassyDataset, build_dataset @@ -54,6 +54,13 @@ class BroadcastBuffersMode(enum.Enum): BEFORE_EVAL = enum.auto() +class LastBatchInfo(NamedTuple): + loss: torch.Tensor + output: torch.Tensor + target: torch.Tensor + sample: Dict[str, Any] + + @register_task("classification_task") class ClassificationTask(ClassyTask): """Basic classification training task. @@ -630,114 +637,94 @@ def set_classy_state(self, state): # Set up pytorch module in train vs eval mode, update optimizer. self._set_model_train_mode() - def eval_step(self, use_gpu, local_variables=None): - if local_variables is None: - local_variables = {} - + def eval_step(self, use_gpu): # Process next sample sample = next(self.get_data_iterator()) - local_variables["sample"] = sample - assert ( - isinstance(local_variables["sample"], dict) - and "input" in local_variables["sample"] - and "target" in local_variables["sample"] - ), ( + assert isinstance(sample, dict) and "input" in sample and "target" in sample, ( f"Returned sample [{sample}] is not a map with 'input' and" + "'target' keys" ) # Copy sample to GPU - local_variables["target"] = local_variables["sample"]["target"] + target = sample["target"] if use_gpu: - for key, value in local_variables["sample"].items(): - local_variables["sample"][key] = recursive_copy_to_gpu( - value, non_blocking=True - ) + for key, value in sample.items(): + sample[key] = recursive_copy_to_gpu(value, non_blocking=True) with torch.no_grad(): - local_variables["output"] = self.model(local_variables["sample"]["input"]) + output = self.model(sample["input"]) - local_variables["local_loss"] = self.compute_loss( - local_variables["output"], local_variables["sample"] - ) + local_loss = self.compute_loss(output, sample) - local_variables["loss"] = local_variables["local_loss"].detach().clone() - local_variables["loss"] = all_reduce_mean(local_variables["loss"]) + loss = local_loss.detach().clone() + loss = all_reduce_mean(loss) - self.losses.append( - local_variables["loss"].data.cpu().item() - * local_variables["target"].size(0) - ) + self.losses.append(loss.data.cpu().item() * target.size(0)) - self.update_meters(local_variables["output"], local_variables["sample"]) + self.update_meters(output, sample) - def train_step(self, use_gpu, local_variables=None): + # Move some data to the task so hooks get a chance to access it + self.last_batch = LastBatchInfo( + loss=loss, output=output, target=target, sample=sample + ) + + def train_step(self, use_gpu): """Train step to be executed in train loop Args: use_gpu: if true, execute training on GPU - local_variables: Dict containing intermediate values - in train_step for access by hooks """ - if local_variables is None: - local_variables = {} + self.last_batch = None # Process next sample sample = next(self.get_data_iterator()) - local_variables["sample"] = sample - assert ( - isinstance(local_variables["sample"], dict) - and "input" in local_variables["sample"] - and "target" in local_variables["sample"] - ), ( + assert isinstance(sample, dict) and "input" in sample and "target" in sample, ( f"Returned sample [{sample}] is not a map with 'input' and" + "'target' keys" ) # Copy sample to GPU - local_variables["target"] = local_variables["sample"]["target"] + target = sample["target"] if use_gpu: - for key, value in local_variables["sample"].items(): - local_variables["sample"][key] = recursive_copy_to_gpu( - value, non_blocking=True - ) + for key, value in sample.items(): + sample[key] = recursive_copy_to_gpu(value, non_blocking=True) with torch.enable_grad(): # Forward pass - local_variables["output"] = self.model(local_variables["sample"]["input"]) + output = self.model(sample["input"]) - local_variables["local_loss"] = self.compute_loss( - local_variables["output"], local_variables["sample"] - ) + local_loss = self.compute_loss(output, sample) - local_variables["loss"] = local_variables["local_loss"].detach().clone() - local_variables["loss"] = all_reduce_mean(local_variables["loss"]) + loss = local_loss.detach().clone() + loss = all_reduce_mean(loss) - self.losses.append( - local_variables["loss"].data.cpu().item() - * local_variables["target"].size(0) - ) + self.losses.append(loss.data.cpu().item() * target.size(0)) - self.update_meters(local_variables["output"], local_variables["sample"]) + self.update_meters(output, sample) # Run backwards pass / update optimizer if self.amp_opt_level is not None: self.optimizer.zero_grad() with apex.amp.scale_loss( - local_variables["local_loss"], self.optimizer.optimizer + local_loss, self.optimizer.optimizer ) as scaled_loss: scaled_loss.backward() else: - self.optimizer.backward(local_variables["local_loss"]) + self.optimizer.backward(local_loss) self.optimizer.update_schedule_on_step(self.where) self.optimizer.step() self.num_updates += self.get_global_batchsize() + # Move some data to the task so hooks get a chance to access it + self.last_batch = LastBatchInfo( + loss=loss, output=output, target=target, sample=sample + ) + def compute_loss(self, model_output, sample): return self.loss(model_output, sample["target"]) diff --git a/classy_vision/tasks/classy_task.py b/classy_vision/tasks/classy_task.py index 7d24e68ddb..e8478c3e5a 100644 --- a/classy_vision/tasks/classy_task.py +++ b/classy_vision/tasks/classy_task.py @@ -107,7 +107,7 @@ def prepare( pass @abstractmethod - def train_step(self, use_gpu, local_variables: Optional[Dict] = None) -> None: + def train_step(self, use_gpu) -> None: """ Run a train step. @@ -115,8 +115,6 @@ def train_step(self, use_gpu, local_variables: Optional[Dict] = None) -> None: Args: use_gpu: True if training on GPUs, False otherwise - local_variables: Local variables created in the function. Can be passed to - custom :class:`classy_vision.hooks.ClassyHook`. """ pass @@ -157,7 +155,7 @@ def on_end(self, local_variables): pass @abstractmethod - def eval_step(self, use_gpu, local_variables: Optional[Dict] = None) -> None: + def eval_step(self, use_gpu) -> None: """ Run an evaluation step. @@ -165,20 +163,19 @@ def eval_step(self, use_gpu, local_variables: Optional[Dict] = None) -> None: Args: use_gpu: True if training on GPUs, False otherwise - local_variables: Local variables created in the function. Can be passed to - custom :class:`classy_vision.hooks.ClassyHook`. """ pass - def step(self, use_gpu, local_variables: Optional[Dict] = None) -> None: + def step(self, use_gpu) -> None: from classy_vision.hooks import ClassyHookFunctions if self.train: - self.train_step(use_gpu, local_variables) + self.train_step(use_gpu) else: - self.eval_step(use_gpu, local_variables) + self.eval_step(use_gpu) - self.run_hooks(local_variables, ClassyHookFunctions.on_step.name) + for hook in self.hooks: + hook.on_step(self) def run_hooks(self, local_variables: Dict[str, Any], hook_function: str) -> None: """ diff --git a/classy_vision/trainer/classy_trainer.py b/classy_vision/trainer/classy_trainer.py index 28ec02bb9d..25c6f3357d 100644 --- a/classy_vision/trainer/classy_trainer.py +++ b/classy_vision/trainer/classy_trainer.py @@ -77,7 +77,7 @@ def train(self, task: ClassyTask): task.on_phase_start(local_variables) while True: try: - task.step(self.use_gpu, local_variables) + task.step(self.use_gpu) except StopIteration: break task.on_phase_end(local_variables) diff --git a/classy_vision/trainer/elastic_trainer.py b/classy_vision/trainer/elastic_trainer.py index 36f445a05b..c604beb40a 100644 --- a/classy_vision/trainer/elastic_trainer.py +++ b/classy_vision/trainer/elastic_trainer.py @@ -106,7 +106,7 @@ def _run_step(self, state, local_variables, use_gpu): state.advance_to_next_phase = True state.skip_current_phase = False # Reset flag else: - state.task.step(use_gpu, local_variables) + state.task.step(use_gpu) except StopIteration: state.advance_to_next_phase = True diff --git a/test/hooks_exponential_moving_average_model_hook_test.py b/test/hooks_exponential_moving_average_model_hook_test.py index 710403065a..4b1cab64b4 100644 --- a/test/hooks_exponential_moving_average_model_hook_test.py +++ b/test/hooks_exponential_moving_average_model_hook_test.py @@ -53,7 +53,7 @@ def _test_exponential_moving_average_hook(self, model_device, hook_device): task.base_model.update_fc_weight() fc_weight = model.fc.weight.clone() for _ in range(num_updates): - exponential_moving_average_hook.on_step(task, local_variables) + exponential_moving_average_hook.on_step(task) exponential_moving_average_hook.on_phase_end(task, local_variables) # the model weights shouldn't have changed self.assertTrue(torch.allclose(model.fc.weight, fc_weight)) diff --git a/test/hooks_loss_lr_meter_logging_hook_test.py b/test/hooks_loss_lr_meter_logging_hook_test.py index d7499fc75d..2a7b3e06f4 100644 --- a/test/hooks_loss_lr_meter_logging_hook_test.py +++ b/test/hooks_loss_lr_meter_logging_hook_test.py @@ -51,27 +51,27 @@ def test_logging(self, mock_get_rank: mock.MagicMock) -> None: for i in range(num_batches): task.losses = list(range(i)) - loss_lr_meter_hook.on_step(task, local_variables) + loss_lr_meter_hook.on_step(task) if log_freq is not None and i and i % log_freq == 0: - mock_fn.assert_called_with(task, local_variables) + mock_fn.assert_called_with(task) mock_fn.reset_mock() - mock_lr_fn.assert_called_with(task, local_variables) + mock_lr_fn.assert_called_with(task) mock_lr_fn.reset_mock() continue mock_fn.assert_not_called() mock_lr_fn.assert_not_called() loss_lr_meter_hook.on_phase_end(task, local_variables) - mock_fn.assert_called_with(task, local_variables) + mock_fn.assert_called_with(task) if task.train: - mock_lr_fn.assert_called_with(task, local_variables) + mock_lr_fn.assert_called_with(task) # test _log_loss_lr_meters() task.losses = losses with self.assertLogs(): - loss_lr_meter_hook._log_loss_meters(task, local_variables) - loss_lr_meter_hook._log_lr(task, local_variables) + loss_lr_meter_hook._log_loss_meters(task) + loss_lr_meter_hook._log_lr(task) task.phase_idx += 1 @@ -95,7 +95,7 @@ def scheduler_mock(where): lr_order = [0.0, 1 / 6, 1 / 6, 2 / 6, 3 / 6, 3 / 6, 4 / 6, 5 / 6, 5 / 6] lr_list = [] - def mock_log_lr(task: ClassyTask, local_variables) -> None: + def mock_log_lr(task: ClassyTask) -> None: lr_list.append(task.optimizer.parameters.lr) with mock.patch.object( diff --git a/test/hooks_time_metrics_hook_test.py b/test/hooks_time_metrics_hook_test.py index 296c42e561..ab9c0358bd 100644 --- a/test/hooks_time_metrics_hook_test.py +++ b/test/hooks_time_metrics_hook_test.py @@ -49,7 +49,7 @@ def test_time_metrics( mock_time.return_value = start_time time_metrics_hook.on_phase_start(task, local_variables) self.assertEqual(time_metrics_hook.start_time, start_time) - self.assertTrue(isinstance(local_variables.get("perf_stats"), PerfStats)) + self.assertTrue(isinstance(task.perf_stats, PerfStats)) # test that the code doesn't raise an exception if losses is empty try: @@ -66,15 +66,15 @@ def test_time_metrics( for i in range(num_batches): task.losses = list(range(i)) - time_metrics_hook.on_step(task, local_variables) + time_metrics_hook.on_step(task) if log_freq is not None and i and i % log_freq == 0: - mock_fn.assert_called_with(task, local_variables) + mock_fn.assert_called_with(task) mock_fn.reset_mock() continue mock_fn.assert_not_called() time_metrics_hook.on_phase_end(task, local_variables) - mock_fn.assert_called_with(task, local_variables) + mock_fn.assert_called_with(task) task.losses = [0.23, 0.45, 0.34, 0.67] @@ -84,7 +84,7 @@ def test_time_metrics( # test _log_performance_metrics() with self.assertLogs() as log_watcher: - time_metrics_hook._log_performance_metrics(task, local_variables) + time_metrics_hook._log_performance_metrics(task) # there should 2 be info logs for train and 1 for test self.assertEqual(len(log_watcher.output), 2 if train else 1) @@ -112,7 +112,7 @@ def test_time_metrics( # if on_phase_start() is not called, 2 warnings should be logged # create a new time metrics hook - local_variables = {} + task.perf_stats = None time_metrics_hook_new = TimeMetricsHook() with self.assertLogs() as log_watcher: diff --git a/test/manual/hooks_progress_bar_hook_test.py b/test/manual/hooks_progress_bar_hook_test.py index 3b53fca7c4..bfbd3a9747 100644 --- a/test/manual/hooks_progress_bar_hook_test.py +++ b/test/manual/hooks_progress_bar_hook_test.py @@ -48,14 +48,14 @@ def test_progress_bar( # on_step should update the progress bar correctly for i in range(num_batches): - progress_bar_hook.on_step(task, local_variables) + progress_bar_hook.on_step(task) mock_progress_bar.update.assert_called_once_with(i + 1) mock_progress_bar.update.reset_mock() # check that even if on_step is called again, the progress bar is # only updated with num_batches for _ in range(num_batches): - progress_bar_hook.on_step(task, local_variables) + progress_bar_hook.on_step(task) mock_progress_bar.update.assert_called_once_with(num_batches) mock_progress_bar.update.reset_mock() @@ -68,7 +68,7 @@ def test_progress_bar( # crash progress_bar_hook = ProgressBarHook() try: - progress_bar_hook.on_step(task, local_variables) + progress_bar_hook.on_step(task) progress_bar_hook.on_phase_end(task, local_variables) except Exception as e: self.fail( @@ -81,7 +81,7 @@ def test_progress_bar( progress_bar_hook = ProgressBarHook() try: progress_bar_hook.on_phase_start(task, local_variables) - progress_bar_hook.on_step(task, local_variables) + progress_bar_hook.on_step(task) progress_bar_hook.on_phase_end(task, local_variables) except Exception as e: self.fail("Received Exception when is_master() is False: {}".format(e)) diff --git a/test/manual/hooks_tensorboard_plot_hook_test.py b/test/manual/hooks_tensorboard_plot_hook_test.py index be125f0637..033fa46d70 100644 --- a/test/manual/hooks_tensorboard_plot_hook_test.py +++ b/test/manual/hooks_tensorboard_plot_hook_test.py @@ -62,7 +62,7 @@ def test_writer(self, mock_is_master_func: mock.MagicMock) -> None: # the writer if on_phase_start() is not called for initialization # before on_step() is called. with self.assertLogs() as log_watcher: - tensorboard_plot_hook.on_step(task, local_variables) + tensorboard_plot_hook.on_step(task) self.assertTrue( len(log_watcher.records) == 1 @@ -88,7 +88,7 @@ def test_writer(self, mock_is_master_func: mock.MagicMock) -> None: for loss in losses: task.losses.append(loss) - tensorboard_plot_hook.on_step(task, local_variables) + tensorboard_plot_hook.on_step(task) tensorboard_plot_hook.on_phase_end(task, local_variables) diff --git a/test/optim_param_scheduler_test.py b/test/optim_param_scheduler_test.py index 8b9ad6c486..08aca43ff9 100644 --- a/test/optim_param_scheduler_test.py +++ b/test/optim_param_scheduler_test.py @@ -207,7 +207,7 @@ class TestHook(ClassyHook): on_phase_end = ClassyHook._noop on_end = ClassyHook._noop - def on_step(self, task: ClassyTask, local_variables) -> None: + def on_step(self, task: ClassyTask) -> None: if not task.train: return diff --git a/test/tasks_classification_task_test.py b/test/tasks_classification_task_test.py index d9ea87ed4d..44617e3e46 100644 --- a/test/tasks_classification_task_test.py +++ b/test/tasks_classification_task_test.py @@ -73,7 +73,6 @@ def test_checkpointing(self): task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()]) use_gpu = torch.cuda.is_available() - local_variables = {} # prepare the tasks for the right device task.prepare(use_gpu=use_gpu) @@ -96,8 +95,8 @@ def test_checkpointing(self): # test that the train step runs the same way on both states # and the loss remains the same - task.train_step(use_gpu, local_variables) - task_2.train_step(use_gpu, local_variables) + task.train_step(use_gpu) + task_2.train_step(use_gpu) self._compare_states(task.get_classy_state(), task_2.get_classy_state()) def test_final_train_checkpoint(self):