From 013cb7b526901fafbd0fa1a079911ba419709404 Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Mon, 5 Jul 2021 11:43:30 +0200
Subject: [PATCH 1/6] add fix

---
 pytorch_lightning/loops/batch/training_batch_loop.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py
index ef6c885930914..4155a7f90efec 100644
--- a/pytorch_lightning/loops/batch/training_batch_loop.py
+++ b/pytorch_lightning/loops/batch/training_batch_loop.py
@@ -178,6 +178,10 @@ def _run_optimization(
             with self.block_ddp_sync_behaviour():
                 closure()
 
+            if self.trainer.lightning_module.automatic_optimization and len(self.trainer.optimizers) > 1:
+                # revert back to previous state
+                self.trainer.lightning_module.untoggle_optimizer(opt_idx)
+
         # ------------------------------
         # BACKWARD PASS
         # ------------------------------

From e6aff65dc7b732707dd0c31e43d3b5e4f06f351e Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Mon, 5 Jul 2021 12:02:36 +0200
Subject: [PATCH 2/6] toggle test

---
 tests/core/test_lightning_module.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py
index f05305c785c7e..09c7b3a7115c5 100644
--- a/tests/core/test_lightning_module.py
+++ b/tests/core/test_lightning_module.py
@@ -197,7 +197,7 @@ def optimizer_step(
         max_epochs=1,
         default_root_dir=tmpdir,
         limit_train_batches=8,
-        accumulate_grad_batches=1,
+        accumulate_grad_batches=2,
         limit_val_batches=0,
     )
     trainer.fit(model)
@@ -331,7 +331,7 @@ def configure_optimizers(self):
         max_epochs=1,
         default_root_dir=tmpdir,
         limit_train_batches=8,
-        accumulate_grad_batches=1,
+        accumulate_grad_batches=2,
     )
     trainer.fit(model)
 

From ab8336b12b2523799ef64b09860fcbe2c911a8e8 Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Mon, 5 Jul 2021 12:02:44 +0200
Subject: [PATCH 3/6] re-structure

---
 .../loops/batch/training_batch_loop.py | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py
index 4155a7f90efec..4f0c454af4885 100644
--- a/pytorch_lightning/loops/batch/training_batch_loop.py
+++ b/pytorch_lightning/loops/batch/training_batch_loop.py
@@ -130,6 +130,7 @@ def advance(self, batch, batch_idx, dataloader_idx):
 
         if self.trainer.lightning_module.automatic_optimization:
             for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
+                # assert self.trainer.lightning_module.layer.weight.requires_grad
                 result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
                 if result:
                     self.batch_outputs[opt_idx].append(result.training_step_output)
@@ -178,10 +179,6 @@ def _run_optimization(
             with self.block_ddp_sync_behaviour():
                 closure()
 
-            if self.trainer.lightning_module.automatic_optimization and len(self.trainer.optimizers) > 1:
-                # revert back to previous state
-                self.trainer.lightning_module.untoggle_optimizer(opt_idx)
-
         # ------------------------------
         # BACKWARD PASS
         # ------------------------------
@@ -189,20 +186,16 @@ def _run_optimization(
         else:
             if self.trainer.lightning_module.automatic_optimization:
                 self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
-                if len(self.trainer.optimizers) > 1:
-                    # revert back to previous state
-                    self.trainer.lightning_module.untoggle_optimizer(opt_idx)
             else:
                 result = self._training_step(split_batch, batch_idx, opt_idx, self._hiddens)
 
-        if not result:
-            # user decided to skip optimization
-            return result
-
+        if result:
             # update running loss + reset accumulated loss
             self._update_running_loss(result.loss)
+            self._process_closure_result(result)
 
-        self._process_closure_result(result)
+        # untoggle model params
+        self._run_optimization_end(opt_idx)
         return result
 
     def _training_step_and_backward_closure(
@@ -494,6 +487,11 @@ def _run_optimization_start(self, opt_idx: int, optimizer: torch.optim.Optimizer
             model = self.trainer.lightning_module
             model.toggle_optimizer(optimizer, opt_idx)
 
+    def _run_optimization_end(self, opt_idx: int) -> None:
+        if self.trainer.lightning_module.automatic_optimization and len(self.trainer.optimizers) > 1:
+            model = self.trainer.lightning_module
+            model.untoggle_optimizer(opt_idx)
+
     @contextmanager
     def block_ddp_sync_behaviour(self, should_block_sync: bool = False) -> Generator[None, None, None]:
         """

From 2eb191207684bcb6ec00c79548d5c47e10effaae Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Mon, 5 Jul 2021 12:07:34 +0200
Subject: [PATCH 4/6] update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 26c863be63d83..9666cd9e54e2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -354,6 +354,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed passing a custom `DDPPlugin` when choosing `accelerator="ddp_cpu"` for the accelerator ([#6208](https://github.com/PyTorchLightning/pytorch-lightning/pull/6208))
 
 
+- Fixed missing call to `LightningModule.untoggle_optimizer` in training loop when running gradient accumulation with multiple optimizers ([#8284](https://github.com/PyTorchLightning/pytorch-lightning/pull/8284))
+
+
 ## [1.3.8] - 2021-07-01
 
 ### Fixed

From 521ff99ef79eca0d3890e4ad90e250a2cc951daa Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Mon, 5 Jul 2021 12:35:18 +0200
Subject: [PATCH 5/6] update comment

---
 pytorch_lightning/loops/batch/training_batch_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py
index 4f0c454af4885..f79b9c17d8cbc 100644
--- a/pytorch_lightning/loops/batch/training_batch_loop.py
+++ b/pytorch_lightning/loops/batch/training_batch_loop.py
@@ -190,7 +190,8 @@ def _run_optimization(
                 result = self._training_step(split_batch, batch_idx, opt_idx, self._hiddens)
 
         if result:
-            # update running loss + reset accumulated loss
+            # if no result, user decided to skip optimization
+            # otherwise update running loss + reset accumulated loss
             self._update_running_loss(result.loss)
             self._process_closure_result(result)
 

From 43c7fb0d59cbc204c9d086d4c2b82881ac054912 Mon Sep 17 00:00:00 2001
From: Adrian Wälchli
Date: Mon, 5 Jul 2021 13:29:20 +0200
Subject: [PATCH 6/6] remove debugging assertion

---
 pytorch_lightning/loops/batch/training_batch_loop.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py
index f79b9c17d8cbc..e1e9a1bf3bcf6 100644
--- a/pytorch_lightning/loops/batch/training_batch_loop.py
+++ b/pytorch_lightning/loops/batch/training_batch_loop.py
@@ -130,7 +130,6 @@ def advance(self, batch, batch_idx, dataloader_idx):
 
         if self.trainer.lightning_module.automatic_optimization:
             for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
-                # assert self.trainer.lightning_module.layer.weight.requires_grad
                 result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
                 if result:
                     self.batch_outputs[opt_idx].append(result.training_step_output)
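
Note (not part of the patch series): the sketch below is a plain-PyTorch illustration of the invariant these patches restore. The fix moves the `untoggle_optimizer` call into `_run_optimization_end`, which runs after every optimization run, so with multiple optimizers plus gradient accumulation the parameters frozen by toggling are always unfrozen again, including on accumulation-only batches that never reach `optimizer.step()`. The `toggle`/`untoggle` helpers, the two-`Linear` model, and the loop structure here are simplified stand-ins invented for the example, not Lightning internals.

# Minimal sketch, assuming two optimizers and accumulate_grad_batches=2.
import torch
from torch import nn

model = nn.ModuleDict({"gen": nn.Linear(4, 4), "disc": nn.Linear(4, 4)})
optimizers = [
    torch.optim.SGD(model["gen"].parameters(), lr=0.1),
    torch.optim.SGD(model["disc"].parameters(), lr=0.1),
]
accumulate_grad_batches = 2


def toggle(opt_idx: int) -> None:
    # freeze every parameter that does not belong to the active optimizer
    active = {id(p) for group in optimizers[opt_idx].param_groups for p in group["params"]}
    for p in model.parameters():
        p.requires_grad = id(p) in active


def untoggle() -> None:
    # revert to the previous state: here, make everything trainable again
    for p in model.parameters():
        p.requires_grad = True


for batch_idx in range(4):
    batch = torch.randn(8, 4)
    for opt_idx, optimizer in enumerate(optimizers):
        toggle(opt_idx)
        loss = (model["gen"] if opt_idx == 0 else model["disc"])(batch).sum()
        (loss / accumulate_grad_batches).backward()
        if (batch_idx + 1) % accumulate_grad_batches == 0:
            optimizer.step()
            optimizer.zero_grad()
        # before the fix, the untoggle happened only in the stepping branch above,
        # so accumulation-only batches left the other optimizer's params frozen
        untoggle()

    # with the untoggle at the end of every optimization run, this holds on every batch
    assert all(p.requires_grad for p in model.parameters())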