Fix reported training loss when using AMP
Miltos Allamanis authored and mallamanis committed Jul 25, 2021
1 parent f62a616 commit 6c55894
Showing 1 changed file with 8 additions and 6 deletions.
ptgnn/baseneuralmodel/distributedtrainer.py: 14 changes (8 additions, 6 deletions)
@@ -71,12 +71,12 @@ def _run_training(
                 if torch.isnan(mb_loss):
                     raise Exception("Loss has a NaN value.")
 
-                mb_loss = scaler.scale(mb_loss)
-
-                num_minibatches += 1
-                num_samples += len(raw_samples)
-                sum_epoch_loss += float(mb_loss.detach())
+                with torch.no_grad():
+                    num_minibatches += 1
+                    num_samples += len(raw_samples)
+                    sum_epoch_loss += mb_loss
 
+                mb_loss = scaler.scale(mb_loss)
                 mb_loss.backward()
 
                 if self._clip_gradient_norm is not None:
@@ -105,7 +105,9 @@ def _run_training(
             assert (
                 num_minibatches > 0
             ), "No training minibatches were created. The minibatch size may be too large or the training dataset size too small."
-            self.LOGGER.info("Epoch %i: Train Loss %.2f", epoch + 1, sum_epoch_loss / num_minibatches)
+            self.LOGGER.info(
+                "Epoch %i: Train Loss %.2f", epoch + 1, float(sum_epoch_loss) / num_minibatches
+            )
             train_metrics = distibuted_module.module.report_metrics()
 
             for epoch_hook in self._train_epoch_end_hooks:
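In short, the previous code accumulated mb_loss after it had already been passed through scaler.scale(...), so the reported per-epoch training loss was inflated by the GradScaler's current scale factor whenever AMP was enabled. The change records the unscaled loss, under torch.no_grad(), and only then scales it for the backward pass. Below is a minimal sketch of this pattern in a standard torch.cuda.amp training loop; the model, optimizer, and loader names are hypothetical stand-ins, not taken from this repository.

import torch

scaler = torch.cuda.amp.GradScaler()
sum_epoch_loss, num_minibatches, num_samples = 0.0, 0, 0

for mb_data, raw_samples in loader:  # hypothetical data loader
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        mb_loss = model(mb_data)  # hypothetical model returning a scalar loss

    # Bookkeeping uses the *unscaled* loss, so the reported value is meaningful.
    with torch.no_grad():
        num_minibatches += 1
        num_samples += len(raw_samples)
        sum_epoch_loss += mb_loss

    # Scaling is applied only for the backward pass and optimizer step.
    scaler.scale(mb_loss).backward()
    scaler.step(optimizer)
    scaler.update()

print("Train Loss %.2f" % (float(sum_epoch_loss) / num_minibatches))

Accumulating inside torch.no_grad() also keeps the running sum free of autograd history, which is presumably why the commit wraps the bookkeeping in that context rather than only reordering the lines.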
