From b16d356221b1fd74a3007f0fcbeeff295ce988f4 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 6 May 2020 13:54:47 -0700 Subject: [PATCH 01/40] Rebase off of master; add new working prototype of loss callback Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 298 ++++++++++++++++++++ nemo/backends/pytorch/actions.py | 28 +- nemo/core/callbacks.py | 39 +++ nemo/core/neural_factory.py | 70 ++++- nemo/core/neural_types/__init__.py | 1 + nemo/core/neural_types/neural_type.py | 9 +- nemo/core/neural_types/nmtensor_registry.py | 87 ++++++ nemo/utils/app_state.py | 23 +- 8 files changed, 525 insertions(+), 30 deletions(-) create mode 100755 examples/asr/jasper_an4_debug.py create mode 100755 nemo/core/neural_types/nmtensor_registry.py diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py new file mode 100755 index 000000000000..e19ea0117f62 --- /dev/null +++ b/examples/asr/jasper_an4_debug.py @@ -0,0 +1,298 @@ +# Copyright (c) 2019 NVIDIA Corporation +import argparse +import math +import os +from functools import partial + +from ruamel.yaml import YAML + +import nemo +import nemo.collections.asr as nemo_asr +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.helpers import ( + monitor_asr_train_progress, + post_process_predictions, + post_process_transcripts, + process_evaluation_batch, + process_evaluation_epoch, + word_error_rate, +) +from nemo.utils.lr_policies import CosineAnnealing + +logging = nemo.logging + + +def create_dags(model_config_file, vocab, args, nf): + + # Create a data_layer for training. + data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( + model_config_file, + "AudioToTextDataLayer_train", + overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, + ) + + num_samples = len(data_layer) + steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) + total_steps = steps_per_epoch * args.num_epochs + logging.info("Train samples=", num_samples, "num_steps=", total_steps) + + # # Create a data_layer for evaluation. + # data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( + # model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, + # ) + + # num_samples = len(data_layer_eval) + # logging.info(f"Eval samples={num_samples}") + + # Instantiate data processor. + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( + model_config_file, "AudioToMelSpectrogramPreprocessor" + ) + + # Instantiate JASPER encoder-decoder modules. + jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") + jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( + model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} + ) + + # Instantiate losses. + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) + greedy_decoder = nemo_asr.GreedyCTCDecoder() + + # Create a training graph. + audio, audio_len, transcript, transcript_len = data_layer() + processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) + encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) + log_probs = jasper_decoder(encoder_output=encoded) + predictions = greedy_decoder(log_probs=log_probs) + loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + + # # Create an evaluation graph. 
+ # audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() + # processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) + # encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) + # log_probs_e = jasper_decoder(encoder_output=encoded_e) + # predictions_e = greedy_decoder(log_probs=log_probs_e) + # loss_e = ctc_loss( + # log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + # ) + logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) + + # Callbacks to print info to console and Tensorboard. + # train_callback = nemo.core.SimpleLossLoggerCallback( + # tensors=[loss, predictions, transcript, transcript_len], + # print_func=partial(monitor_asr_train_progress, labels=vocab), + # get_tb_values=lambda x: [["loss", x[0]]], + # tb_writer=nf.tb_writer, + # ) + + # loss.rename("test") + # train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["test"]) + + train_callback = nemo.core.SimpleLossLogger() + + # checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) + + # eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e] + # eval_callback = nemo.core.EvaluatorCallback( + # eval_tensors=eval_tensors, + # user_iter_callback=partial(process_evaluation_batch, labels=vocab), + # user_epochs_done_callback=process_evaluation_epoch, + # eval_step=args.eval_freq, + # tb_writer=nf.tb_writer, + # eval_at_start=not args.do_not_eval_at_start, + # ) + # callbacks = [train_callback, checkpointer_callback, eval_callback] + callbacks = [train_callback] + + # Return entities required by the actual training. + return ( + loss, + # eval_tensors, + callbacks, + total_steps, + # log_probs_e, + # encoded_len_e, + ) + + +def main(): + parser = argparse.ArgumentParser( + parents=[nm_argparse.NemoArgParser()], description='AN4 ASR', conflict_handler='resolve', + ) + + # Overwrite default args + parser.add_argument("--train_dataset", type=str, help="training dataset path") + parser.add_argument("--eval_datasets", type=str, help="validation dataset path") + + # Create new args + # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str) + parser.add_argument("--batch_size", default=48, type=int, help="size of the training batch") + parser.add_argument("--lm", default=None, type=str) + parser.add_argument("--test_after_training", action='store_true') + parser.add_argument("--momentum", type=float) + parser.add_argument("--beta1", default=0.95, type=float) + parser.add_argument("--beta2", default=0.25, type=float) + parser.add_argument("--do_not_eval_at_start", action='store_true') + parser.set_defaults( + model_config="./configs/jasper_an4.yaml", + train_dataset="~/TestData/an4_dataset/an4_train.json", + eval_datasets="~/TestData/an4_dataset/an4_val.json", + work_dir="./tmp", + optimizer="novograd", + num_epochs=50, + lr=0.02, + weight_decay=0.005, + checkpoint_save_freq=1000, + eval_freq=100, + amp_opt_level="O1", + ) + + args = parser.parse_args() + betas = (args.beta1, args.beta2) + + wer_thr = 0.20 + beam_wer_thr = 0.15 + + nf = nemo.core.NeuralModuleFactory( + local_rank=args.local_rank, + files_to_copy=[__file__], + optimization_level=args.amp_opt_level, + random_seed=0, + log_dir=args.work_dir, + create_tb_writer=True, + cudnn_benchmark=args.cudnn_benchmark, + ) + tb_writer = nf.tb_writer + checkpoint_dir = nf.checkpoint_dir + + # Load model definition + yaml = 
YAML(typ="safe") + with open(args.model_config) as f: + jasper_params = yaml.load(f) + # Get vocabulary. + vocab = jasper_params['labels'] + + # (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags( + # args.model_config, vocab, args, nf + # ) + + loss, callbacks, total_steps = create_dags(args.model_config, vocab, args, nf) + + nf.train( + tensors_to_optimize=[loss], + callbacks=callbacks, + optimizer=args.optimizer, + lr_policy=CosineAnnealing(total_steps=total_steps, min_lr=args.lr / 100), + optimization_params={ + "num_epochs": args.num_epochs, + "max_steps": args.max_steps, + "lr": args.lr, + "momentum": args.momentum, + "betas": betas, + "weight_decay": args.weight_decay, + "grad_norm_clip": None, + }, + batches_per_step=args.iter_per_step, + amp_max_loss_scale=256.0, + # synced_batchnorm=(nf.global_rank is not None), + ) + + # if args.test_after_training: + # logging.info("Testing greedy and beam search with LM WER.") + # # Create BeamSearch NM + # if nf.world_size > 1 or args.lm is None: + # logging.warning("Skipping beam search WER as it does not work if doing distributed training.") + # else: + # beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( + # vocab=vocab, beam_width=64, alpha=2.0, beta=1.5, lm_path=args.lm, num_cpus=max(os.cpu_count(), 1), + # ) + # beam_predictions = beam_search_with_lm(log_probs=log_probs_e, log_probs_length=encoded_len_e) + # eval_tensors.append(beam_predictions) + + # evaluated_tensors = nf.infer(eval_tensors) + # if nf.global_rank in [0, None]: + # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) + # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) + # wer = word_error_rate(hypotheses=greedy_hypotheses, references=references) + # logging.info("Greedy WER: {:.2f}%".format(wer * 100)) + # if wer > wer_thr: + # nf.sync_all_processes(False) + # raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :" f"than {wer_thr * 100:.2f}%") + # nf.sync_all_processes() + + # if nf.world_size == 1 and args.lm is not None: + # beam_hypotheses = [] + # # Over mini-batch + # for i in evaluated_tensors[-1]: + # # Over samples + # for j in i: + # beam_hypotheses.append(j[0][1]) + + # beam_wer = word_error_rate(hypotheses=beam_hypotheses, references=references) + # logging.info("Beam WER {:.2f}%".format(beam_wer * 100)) + # assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > than {:.2f}%".format( + # beam_wer * 100, beam_wer_thr * 100 + # ) + # assert beam_wer <= wer, "Final eval beam WER > than the greedy WER." 
+ + # # Reload model weights and train for extra 10 epochs + # checkpointer_callback = nemo.core.CheckpointCallback( + # folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, + # ) + + # # Distributed Data Parallel changes the underlying class so we need + # # to reinstantiate Encoder and Decoder + # args.num_epochs += 10 + # previous_step_count = total_steps + # loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf) + + # nf.reset_trainer() + # nf.train( + # tensors_to_optimize=[loss], + # callbacks=callbacks, + # optimizer=args.optimizer, + # lr_policy=CosineAnnealing(warmup_steps=previous_step_count, total_steps=total_steps), + # optimization_params={ + # "num_epochs": args.num_epochs, + # "lr": args.lr / 100, + # "momentum": args.momentum, + # "betas": betas, + # "weight_decay": args.weight_decay, + # "grad_norm_clip": None, + # }, + # reset=True, + # amp_max_loss_scale=256.0, + # # synced_batchnorm=(nf.global_rank is not None), + # ) + + # evaluated_tensors = nf.infer(eval_tensors) + # if nf.global_rank in [0, None]: + # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) + # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) + # wer_new = word_error_rate(hypotheses=greedy_hypotheses, references=references) + # logging.info("New greedy WER: {:.2f}%".format(wer_new * 100)) + # if wer_new > wer * 1.1: + # nf.sync_all_processes(False) + # raise ValueError( + # f"Fine tuning: new WER {wer_new * 100:.2f}% > than the " f"previous WER {wer * 100:.2f}%" + # ) + # nf.sync_all_processes() + + # # Open the log file and ensure that epochs is strictly increasing + # if nf._exp_manager.log_file: + # epochs = [] + # with open(nf._exp_manager.log_file, "r") as log_file: + # line = log_file.readline() + # while line: + # index = line.find("Starting epoch") + # if index != -1: + # epochs.append(int(line[index + len("Starting epoch") :])) + # line = log_file.readline() + # for i, e in enumerate(epochs): + # if i != e: + # raise ValueError("Epochs from logfile was not understood") + + +if __name__ == "__main__": + main() diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 172b2131990c..7663beea9293 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -20,8 +20,8 @@ from nemo.backends.pytorch.nm import DataLayerNM, TrainableNM from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor -from nemo.core.callbacks import ActionCallback, EvaluatorCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization +from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback +from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState from nemo.core.neural_types import * from nemo.utils.helpers import get_checkpoint_from_dir @@ -450,10 +450,10 @@ def __nm_graph_forward_pass( if nm_tensor is None: continue t_name = nm_tensor.unique_name - if t_name not in registered_tensors: + if t_name not in registered_tensors or registered_tensors[t_name] is None: registered_tensors[t_name] = t_tensor else: - raise ValueError("A NMTensor was produced twice in " f"the same DAG. {t_name}") + raise ValueError(f"A NMTensor was produced twice in the same DAG. 
{t_name}") @staticmethod def pad_tensor(t: torch.Tensor, target_size: torch.Size): @@ -1110,6 +1110,7 @@ def train( gradient_predivide=False, amp_max_loss_scale=2.0 ** 24, ): + self._training_state = TrainingState() # Analyse the arguments passed to train. if tensors_to_optimize is not None and training_graph is not None: raise ValueError("Cannot pass both `tensors_to_optimize` and `training_graph` to the train() function") @@ -1204,7 +1205,7 @@ def train( # callbacks setup if callbacks is not None: for callback in callbacks: - if not isinstance(callback, ActionCallback): + if not isinstance(callback, ActionCallback) and not isinstance(callback, NeMoCallback): raise ValueError("A callback was received that was not a child of ActionCallback") elif isinstance(callback, SimpleLossLoggerCallback): if logging_callchain: @@ -1407,20 +1408,20 @@ def train( else: tensors.append(d) - registered_tensors = { - t.unique_name: d for t, d in zip(curr_call_chain[0][2].values(), tensors) if t is not None - } + for t, d in zip(curr_call_chain[0][2].values(), tensors): + if t is not None: + self.training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) self.__nm_graph_forward_pass( - call_chain=curr_call_chain, registered_tensors=registered_tensors, + call_chain=curr_call_chain, registered_tensors=self.training_state.tensor_dict, ) curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: if ( - torch.isnan(registered_tensors[tensor.unique_name]).any() - or torch.isinf(registered_tensors[tensor.unique_name]).any() + torch.isnan(self.training_state.tensor_dict[tensor.unique_name]).any() + or torch.isinf(self.training_state.tensor_dict[tensor.unique_name]).any() ): if ( (stop_on_nan_loss) @@ -1436,7 +1437,7 @@ def train( ) else: logging.warning('Loss is NaN or inf, continuing training') - final_loss += registered_tensors[tensor.unique_name] + final_loss += self.training_state.tensor_dict[tensor.unique_name] if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: @@ -1479,10 +1480,11 @@ def train( batch_counter = 0 # Register iteration end with callbacks self._update_callbacks( - callbacks=callbacks, registered_tensors=registered_tensors, + callbacks=callbacks, registered_tensors=self.training_state.tensor_dict, final_loss=final_loss ) self._perform_on_iteration_end(callbacks=callbacks) self.step += 1 + self.training_state.clear_dict() # End of epoch for loop # Register epochs end with callbacks self._perform_on_epoch_end(callbacks=callbacks) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index e465bf5bf95a..1161cef57ee2 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -37,6 +37,45 @@ logging = nemo.logging +class NeMoCallback(ABC): + def on_action_start(self, state): + pass + + def on_action_end(self, state): + pass + + def on_epoch_start(self, state): + pass + + def on_epoch_end(self, state): + pass + + def on_iteration_start(self, state): + pass + + def on_iteration_end(self, state): + pass + + +class SimpleLossLogger(NeMoCallback): + def __init__(self, step_freq=100, tensors_to_log=["loss"]): + # Step_freq: how often logs are printed + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): + # #tensors_to_log: List of keys into state that will be logged + + def 
on_iteration_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + # except KeyError: + # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " + # f"Current state tensors include {state['tensors'].tensor_list()}") + + class ActionCallback(ABC): """Abstract interface for callbacks. """ diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 4402ded7b927..37dac0e678d8 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -36,6 +36,7 @@ from ..utils import ExpManager from .callbacks import ActionCallback, EvaluatorCallback from .neural_types import * +from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated logging = nemo.logging @@ -84,6 +85,26 @@ class DeviceType(Enum): AllGpu = 3 +class TrainingState: + def __init__(self): + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + + def tensor_list(self): + return self.tensor_dict.keys() + + def clear_dict(self): + for name in self.tensor_dict: + self.tensor_dict[name] = None + + def set_tensor(self, tensor, value): + self.tensor_dict[tensor.unique_name] = value + + def get_tensor(self, name): + unique_name = AppState().tensor_names[name] + return self.tensor_dict[unique_name] + + class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" @@ -93,6 +114,15 @@ def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxpr self._optim_level = optimization_level self.step = None self.epoch_num = None + self._training_state = TrainingState() + + @property + def state(self): + return {"step": self.step, "tensors": self.training_state} + + @property + def training_state(self): + return self._training_state @property def local_rank(self): @@ -201,45 +231,67 @@ def _perform_on_iteration_start(self, callbacks): # to be a list of ActionCallback objects if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_iteration_start() + if isinstance(callback, ActionCallback): + callback.on_iteration_start() + else: + callback.on_iteration_start(self.state) def _perform_on_iteration_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_iteration_end() + if isinstance(callback, ActionCallback): + callback.on_iteration_end() + else: + callback.on_iteration_end(self.state) def _perform_on_action_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_action_start() + if isinstance(callback, ActionCallback): + callback.on_action_start() + else: + callback.on_action_start(self.state) def _perform_on_action_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_action_end() + if isinstance(callback, ActionCallback): + callback.on_action_end() + else: + callback.on_action_end(self.state) def _perform_on_epoch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_epoch_start() + if isinstance(callback, ActionCallback): + callback.on_epoch_start() + else: + 
callback.on_epoch_start(self.state) def _perform_on_epoch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_epoch_end() + if isinstance(callback, ActionCallback): + callback.on_epoch_end() + else: + callback.on_epoch_end(self.state) def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.action = self + if isinstance(callback, ActionCallback): + callback.action = self def _update_callbacks( - self, callbacks=None, registered_tensors=None, + self, callbacks=None, registered_tensors=None, final_loss=None, ): # if self.local_rank is None or self.local_rank == 0: if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback._registered_tensors = registered_tensors + if isinstance(callback, ActionCallback): + callback._registered_tensors = registered_tensors + else: # For now, we can use the old callback function. In the future we should improve this + self.training_state.tensor_dict["loss"] = final_loss def _str_to_opt_level(opt_str: str) -> Optimization: diff --git a/nemo/core/neural_types/__init__.py b/nemo/core/neural_types/__init__.py index 1fb5bf349076..0ae947d90137 100644 --- a/nemo/core/neural_types/__init__.py +++ b/nemo/core/neural_types/__init__.py @@ -19,3 +19,4 @@ from nemo.core.neural_types.comparison import * from nemo.core.neural_types.elements import * from nemo.core.neural_types.neural_type import * +from nemo.core.neural_types.nmtensor_registry import NmTensorNameRegistry diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d503f8b78cf1..d78d0dc9923c 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -49,9 +49,9 @@ class NeuralType(object): def __str__(self): if self.axes is not None: - return f"axes: {self.axes}; " f" elements_type: {self.elements_type.__class__.__name__}" + return f"axes: {self.axes}; elements_type: {self.elements_type.__class__.__name__}" else: - return f"axes: None; " f" elements_type: {self.elements_type.__class__.__name__}" + return f"axes: None; elements_type: {self.elements_type.__class__.__name__}" def __init__(self, axes: Optional[Tuple] = None, elements_type: ElementType = VoidType(), optional=False): if not isinstance(elements_type, ElementType): @@ -223,6 +223,7 @@ def __init__(self, producer, producer_args, output_port_name, ntype=None): self._step_number = AppState().active_graph.step_number # List of tuples (step number, module name, input port name) self._consumers = [] + AppState().tensor_names.register(self) @property def producer(self): @@ -323,6 +324,10 @@ def unique_name(self): raise ValueError("This NmTensor does not have a unique name") return f"{self._output_port_name}~~~{self._producer_name}~~~{self._uuid}" + def rename(self, new_name): + """TODO + """ + AppState().tensor_names.rename_NmTensor(self, new_name) class NeuralTypeError(Exception): """Base class for neural type related exceptions.""" diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py new file mode 100755 index 000000000000..c439d4949c9d --- /dev/null +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -0,0 +1,87 @@ +# ============================================================================= +# Copyright (c) 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + + +class NmTensorNameRegistry: + def __init__(self): + """ + Constructor. Initializes the manager. Sets active graph to None. + + TODO: Should probably be a property of a graph + """ + # Create the nmtensor_naming_dict + # which contains a mapping of str to NMTensor.unique_name + self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' + self._nmtensor_uniname_set = set(["loss"]) + + # def summary(self): + # """ Prints a nice summary. """ + # desc = "" + # for graph in self: + # desc = desc + "`{}`: {}\n".format(graph.name, graph) + # return desc + + @property + def unique_names(self): + return self._nmtensor_uniname_set + + # def register(self, tensor: NmTensor): + def register(self, tensor): + """TODO + """ + + # Check if object is already in a set. + if tensor.unique_name in self._nmtensor_uniname_set: + pass + + # Finally, add object to the set. + self._nmtensor_uniname_set.add(tensor.unique_name) + + # def rename_NmTensor(self, tensor: NmTensor, new_name: str): + def rename_NmTensor(self, tensor, new_name: str): + """ TODO + """ + # Find old name if exists + old_name = tensor.unique_name + for custom_name, unique_name in self._nmtensor_naming_dict.items(): + if unique_name == tensor.unique_name: + old_name = custom_name + + if old_name != tensor.unique_name: + del self._nmtensor_naming_dict[old_name] + + if new_name in self._nmtensor_naming_dict: + raise KeyError(f"{new_name} already exists in current graph. Please use a unique name") + self._nmtensor_naming_dict[new_name] = tensor.unique_name + + def __getitem__(self, key): + """ + Object getter function. + + Args: + key: Object name. + + Returns: + Object associated with the key. + """ + # Search for an object with a given name. + if key in self._nmtensor_naming_dict: + key = self._nmtensor_naming_dict[key] + + if key in self._nmtensor_uniname_set: + return key + + raise KeyError("A NmTensor with name `{}` don't exists!".format(key)) diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index d77daa133adf..6183526b87fe 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -47,6 +47,17 @@ def __init__(self, device=None): self._module_registry = ObjectRegistry("module") # Create graph manager (registry with some additional functionality). self._neural_graph_manager = NeuralGraphManager() + # Create NmTensor registry + self._nmtensor_name_registry = nemo.core.neural_types.NmTensorNameRegistry() + + @property + def tensor_names(self): + """ Property returning the existing modules. + + Returns: + Existing modules (a set object). + """ + return self._nmtensor_name_registry @property def modules(self): @@ -68,14 +79,14 @@ def graphs(self): return self._neural_graph_manager def register_module(self, module, name: str) -> str: - """ - Registers a module using the provided name. + """ + Registers a module using the provided name. If name is none - generates a new unique name. 
- + Args: module: A Neural Module object to be registered. name: A "proposition" of module name. - + Returns: A unique name (proposition or newly generated name). """ @@ -85,11 +96,11 @@ def register_graph(self, graph, name: str) -> str: """ Registers a new graph using the provided name. If name is none - generates a new unique name. - + Args: graph: A Neural Graph object to be registered. name: A "proposition" of graph name. - + Returns: A unique name (proposition or newly generated name). """ From 8024454fb9cd0afbd8af50163bd950f6421fd27d Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 6 May 2020 17:22:59 -0700 Subject: [PATCH 02/40] first working hack of computing uncomputed tensors Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 6 +- nemo/backends/pytorch/actions.py | 292 ++++++-------------- nemo/core/callbacks.py | 15 + nemo/core/neural_factory.py | 128 ++++++++- nemo/core/neural_types/nmtensor_registry.py | 11 +- nemo/utils/neural_graph/object_registry.py | 12 +- 6 files changed, 246 insertions(+), 218 deletions(-) diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py index e19ea0117f62..f06055baec8a 100755 --- a/examples/asr/jasper_an4_debug.py +++ b/examples/asr/jasper_an4_debug.py @@ -65,7 +65,7 @@ def create_dags(model_config_file, vocab, args, nf): encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len) # # Create an evaluation graph. # audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() @@ -89,7 +89,9 @@ def create_dags(model_config_file, vocab, args, nf): # loss.rename("test") # train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["test"]) - train_callback = nemo.core.SimpleLossLogger() + # train_callback = nemo.core.SimpleLossLogger() + predictions.rename("test") + train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["loss", "test"]) # checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7663beea9293..e737b08997c2 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,8 +21,10 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState +from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves from nemo.core.neural_types import * +from nemo.utils.app_state import AppState +from nemo.utils.decorators import deprecated from nemo.utils.helpers import get_checkpoint_from_dir # these imports will happen on as-needed basis @@ -87,137 +89,38 @@ def __init__( local_rank=local_rank, global_rank=global_rank, optimization_level=optimization_level, ) - # will be [unique_instance_id -> (NMModule, PTModule)] - self.module_reference_table = {} self.step = 0 self.epoch_num = 0 self.optimizers = [] self.tb_writer = tb_writer - 
self._modules = set() self.cache = None self.amp_initialized = False - - @property - def modules(self): - return self._modules + self.ddp_initialized = False + self.ddp_module_dict = {} def __get_top_sorted_modules_and_dataloader(self, hook): + """ TODO """ - Constructs DAG leading to hook and creates its topological order. - It also populates self.module_reference_table. - Args: - hook: an NmTensor or a list of NmTensors representing leaf nodes - in DAG - - Returns: - list of modules with their call arguments and outputs, and dataset - """ - - def create_node(producer, producer_args): - if producer_args is None: - return tuple((producer, ())) - else: - return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) - - def is_in_degree_zero(node, processed_nodes): - """A node has in degree of zero""" - if node[1] == (): - return True - for portname, nmtensor in node[1]: - nd = create_node(nmtensor.producer, nmtensor.producer_args) - if nd not in processed_nodes: - return False - return True - - hooks = hook if isinstance(hook, list) else [hook] - - # ensures that no tensors are processed twice - processed_nmtensors = set() - - indices_to_remove = [] - # Check for duplicates in hook - for i, nmtensor in enumerate(hook): - if nmtensor in processed_nmtensors: - indices_to_remove.append(i) - else: - processed_nmtensors.add(nmtensor) - - for i in reversed(indices_to_remove): - hook.pop(i) - - _top_sorted_modules = [] - all_nodes = {} - - # extract all nodes to all_nodes set - hooks_lst = list(hooks) - while len(hooks_lst) > 0: - # take nmtensor from the end of the list - nmtensor = hooks_lst.pop() - - node = create_node(nmtensor.producer, nmtensor.producer_args) - # Store nmtensor as an output of its producer - # first make sure all keys are present per output port - # and nm is inside all_nodes - if node not in all_nodes: - all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} - # second, populate output port with current nmtensor - # where applicable - all_nodes[node][nmtensor.name] = nmtensor - processed_nmtensors.add(nmtensor) - if nmtensor.producer_args is not None and nmtensor.producer_args != {}: - for _, new_nmtensor in nmtensor.producer_args.items(): - if new_nmtensor not in processed_nmtensors: - # put in the start of list - hooks_lst.insert(0, new_nmtensor) - - all_node_with_output = [] - # Iterate over all_nodes to create new nodes that include its output - # now all nodes have (module, input tensors, output tensors) - for node in all_nodes: - all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) - - processed_nodes = [] - while len(all_node_with_output) > 0: - for node in all_node_with_output.copy(): - # if node's in_degree is zero it can be added to - # _top_sorted_modules - # this will also reduce in_degree of its children - if is_in_degree_zero(node, processed_nodes): - _top_sorted_modules.append(node) - processed_nodes.append((node[0], node[1])) - all_node_with_output.remove(node) - - # Create top_sorted_modules aka callchain - top_sorted_modules = [] - for i, m in enumerate(_top_sorted_modules): - top_sorted_modules.append((m[0], dict(m[1]), m[2])) - # Ensure that there is only one dataset in callchain - if i > 0 and isinstance(m[0], DataLayerNM): - raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") + top_sorted_modules = topological_sort_from_leaves(hook) if not isinstance(top_sorted_modules[0][0], DataLayerNM): raise ValueError("The first module in your DAG was not a DataLayer NeuralModule.") 
tdataset = top_sorted_modules[0][0].dataset - # populate self.module_reference_table for m in top_sorted_modules: if m[0].factory is None and self._local_rank is not None: raise ValueError( - "Neural module {0} was created without " - "NeuralModuleFactory, but you are trying to" - "run in distributed mode. Please instantiate" - "NeuralModuleFactory first and pass its " - "instance as `factory` parameter to all your" - "Neural Module objects." - "".format(str(m[0])) + "Neural module {0} was created without NeuralModuleFactory, but you are trying to run in " + "distributed mode. Please instantiate NeuralModuleFactory first and pass its instance as " + "`factory` parameter to all your Neural Module objects.".format(str(m[0])) ) - key = m[0].unique_instance_id - if key not in self.module_reference_table: - if isinstance(m[0], TrainableNeuralModuleWrapper): - self.module_reference_table[key] = (m[0], m[0]._pt_module) - else: - self.module_reference_table[key] = (m[0], m[0]) + # key = m[0].unique_instance_id + # if key not in self.module_reference_table: + # if isinstance(m[0], TrainableNeuralModuleWrapper): + # self.module_reference_table[key] = (m[0], m[0]._pt_module) + # else: + # self.module_reference_table[key] = (m[0], m[0]) return top_sorted_modules, tdataset @@ -372,10 +275,10 @@ def __initialize_amp( if optim_level == Optimization.mxprO0: return optimizer - if len(self.modules) < 1: + if len(AppState().modules) < 1: raise ValueError("There were no modules to initialize") pt_modules = [] - for module in self.modules: + for module in AppState().modules: if isinstance(module, nn.Module): pt_modules.append(module) elif isinstance(module, TrainableNeuralModuleWrapper): @@ -391,6 +294,9 @@ def __initialize_amp( self.amp_initialized = True return optimizer + def nm_graph_forward_pass(self, callchain, registered_tensors): + self.__nm_graph_forward_pass(callchain, registered_tensors) + def __nm_graph_forward_pass( self, call_chain, registered_tensors, mode=OperationMode.training, use_cache=False, ): @@ -409,8 +315,9 @@ def __nm_graph_forward_pass( continue call_args = call_chain[ind][1] # module = call_chain[ind][0] + # pmodule = self.module_reference_table[m_id][1] m_id = call_chain[ind][0].unique_instance_id - pmodule = self.module_reference_table[m_id][1] + pmodule = self.ddp_module_dict[m_id] if self.ddp_initialized else call_chain[ind][0] # if self._local_rank is not None: # if isinstance(pmodule, DDP): @@ -436,10 +343,11 @@ def __nm_graph_forward_pass( key = nmtensor.unique_name call_set[tensor_name] = registered_tensors[key] # actual PyTorch module call with signature - if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): - new_tensors = pmodule(**call_set) - else: - new_tensors = pmodule(force_pt=True, **call_set) + # if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): + # new_tensors = pmodule(**call_set) + # else: + # new_tensors = pmodule(force_pt=True, **call_set) + new_tensors = pmodule(force_pt=True, **call_set) if not isinstance(new_tensors, List): if not isinstance(new_tensors, tuple): @@ -925,31 +833,6 @@ def _check_tuples(list_of_tuples): return False return True - def _get_all_modules(self, training_loop, callbacks, logging_callchain=None): - """Gets all neural modules that will be used by train() and eval() via - EvaluatorCallbacks. 
Saves all modules to self.modules - """ - # If there is a SimpleLossLoggerCallback, create an logger_callchain - # with all callchains from training_loop and - # SimpleLossLoggerCallback.tensors - if logging_callchain: - for module in logging_callchain: - self.modules.add(module[0]) - - # Else grab all callchains from training_loop - else: - for step in training_loop: - for module in step[2]: - self.modules.add(module[0]) - - # Lastly, grab all eval modules - if callbacks is not None: - for callback in callbacks: - if isinstance(callback, EvaluatorCallback): - (callchain, _,) = self.__get_top_sorted_modules_and_dataloader(hook=callback.eval_tensors) - for module in callchain: - self.modules.add(module[0]) - @staticmethod def __module_export(module, output, d_format: DeploymentFormat, input_example=None, output_example=None): # Check if output already exists @@ -1217,8 +1100,6 @@ def train( all_tensors = all_tensors + step[1] (logging_callchain, _,) = self.__get_top_sorted_modules_and_dataloader(hook=all_tensors) - self._get_all_modules(training_loop, callbacks, logging_callchain) - # Intialize Amp if needed if self._optim_level in AmpOptimizations: # Store mapping of self.optimizers to optimizer in callchain @@ -1270,67 +1151,72 @@ def train( else: train_sampler = None - for train_iter in training_loop: - call_chain = train_iter[2] - for i in range(1, len(call_chain) - 1): - key = call_chain[i][0].unique_instance_id - pmodule = self.module_reference_table[key][1] - num_trainable_weights = self.module_reference_table[key][1].num_weights - if ( - not isinstance(pmodule, DDP) - and isinstance(pmodule, torch.nn.Module) - and num_trainable_weights > 0 - ): - # gpf = 1 - # if gradient_predivide: - # gpf = dist.get_world_size() - # pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Old Apex Method - - # Per pytorch docs, convert sync bn prior to DDP - if synced_batchnorm: - world_size = dist.get_world_size() - sync_batchnorm_group = None - if synced_batchnorm_groupsize > 0: - if world_size % synced_batchnorm_groupsize != 0: - raise ValueError( - f"Synchronized batch norm group size ({synced_batchnorm_groupsize}) must be 0" - f" or divide total number of GPUs ({world_size})." - ) - # Find ranks of other nodes in the same batchnorm group - rank = torch.distributed.get_rank() - group = rank // synced_batchnorm_groupsize - group_rank_ids = range( - group * synced_batchnorm_groupsize, (group + 1) * synced_batchnorm_groupsize + # for train_iter in training_loop: + # call_chain = train_iter[2] + # for i in range(1, len(call_chain) - 1): + # key = call_chain[i][0].unique_instance_id + # pmodule = self.module_reference_table[key][1] + # num_trainable_weights = self.module_reference_table[key][1].num_weights + self.ddp_initialized = True + for module in AppState().modules: + key = module.unique_instance_id + num_trainable_weights = module.num_weights + if ( + not isinstance(module, DDP) + and isinstance(module, torch.nn.Module) + and num_trainable_weights > 0 + ): + # gpf = 1 + # if gradient_predivide: + # gpf = dist.get_world_size() + # pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Old Apex Method + + # Per pytorch docs, convert sync bn prior to DDP + if synced_batchnorm: + world_size = dist.get_world_size() + sync_batchnorm_group = None + if synced_batchnorm_groupsize > 0: + if world_size % synced_batchnorm_groupsize != 0: + raise ValueError( + f"Synchronized batch norm group size ({synced_batchnorm_groupsize}) must be 0" + f" or divide total number of GPUs ({world_size})." 
) - sync_batchnorm_group = torch.distributed.new_group(group_rank_ids) - - pmodule = nn.SyncBatchNorm.convert_sync_batchnorm( - pmodule, process_group=sync_batchnorm_group + # Find ranks of other nodes in the same batchnorm group + rank = torch.distributed.get_rank() + group = rank // synced_batchnorm_groupsize + group_rank_ids = range( + group * synced_batchnorm_groupsize, (group + 1) * synced_batchnorm_groupsize ) + sync_batchnorm_group = torch.distributed.new_group(group_rank_ids) - # By default, disable broadcast_buffers. This disables batch norm synchronization on forward - # pass - pmodule = DDP( - pmodule, device_ids=[self.local_rank], broadcast_buffers=False, find_unused_parameters=True + module = nn.SyncBatchNorm.convert_sync_batchnorm( + module, process_group=sync_batchnorm_group ) - # # Convert batchnorm modules to synced if applicable - # if synced_batchnorm and isinstance(pmodule, torch.nn.Module): - # world_size = dist.get_world_size() - # if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: - # raise ValueError( - # f"Synchronized batch norm group size" - # f" ({synced_batchnorm_groupsize}) must be 0" - # f" or divide total number of GPUs" - # f" ({world_size})." - # ) - # process_group = create_syncbn_process_group(synced_batchnorm_groupsize) - # pmodule = convert_syncbn(pmodule, process_group=process_group) - - self.module_reference_table[key] = ( - self.module_reference_table[key][0], - pmodule, + # By default, disable broadcast_buffers. This disables batch norm synchronization on forward + # pass + module = DDP( + module, device_ids=[self.local_rank], broadcast_buffers=False, find_unused_parameters=True ) + self.ddp_module_dict[key] = module + + # # Convert batchnorm modules to synced if applicable + # if synced_batchnorm and isinstance(pmodule, torch.nn.Module): + # world_size = dist.get_world_size() + # if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: + # raise ValueError( + # f"Synchronized batch norm group size" + # f" ({synced_batchnorm_groupsize}) must be 0" + # f" or divide total number of GPUs" + # f" ({world_size})." 
+ # ) + # process_group = create_syncbn_process_group(synced_batchnorm_groupsize) + # pmodule = convert_syncbn(pmodule, process_group=process_group) + + # self.module_reference_table[key] = ( + # self.module_reference_table[key][0], + # pmodule, + # ) # single GPU/CPU training else: if t_dataset is not None: @@ -1566,7 +1452,7 @@ def get_DDP_modules(self, call_chain): modules = [] for ind in range(1, len(call_chain)): m_id = call_chain[ind][0].unique_instance_id - module = self.module_reference_table[m_id][1] + module = self.ddp_module_dict[m_id] if isinstance(module, DDP): modules.append(module) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 1161cef57ee2..17dbf890f76c 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -38,6 +38,18 @@ class NeMoCallback(ABC): + def __init__(self): + self._action = None + + @property + def action(self): + """TODO remove""" + return self._action + + @action.setter + def action(self, action_obj): + self._action = action_obj + def on_action_start(self, state): pass @@ -59,6 +71,7 @@ def on_iteration_end(self, state): class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): + super().__init__() # Step_freq: how often logs are printed self.step_freq = step_freq self.tensors_to_log = tensors_to_log @@ -70,6 +83,8 @@ def on_iteration_end(self, state): if state["step"] % self.step_freq == 0: for tensor_key in self.tensors_to_log: tensor = state["tensors"].get_tensor(tensor_key) + if tensor is None: + tensor = state["tensors"].get_and_compute_tensor(tensor_key, self.action) logging.info("%s: %s", tensor_key, tensor) # except KeyError: # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 37dac0e678d8..3d9faf867e08 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -25,6 +25,7 @@ 'DeploymentFormat', ] +import copy import random from abc import ABC, abstractmethod from enum import Enum @@ -41,6 +42,111 @@ logging = nemo.logging +# def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): +def topological_sort_from_leaves(leaf_nmtensors, cached_training_state = None): + from nemo.backends.pytorch.nm import DataLayerNM + def create_node(producer, producer_args): + if producer_args is None: + return tuple((producer, ())) + else: + return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) + + def is_in_degree_zero(node, processed_nodes, cached_training_state): + """A node has in degree of zero""" + if node[1] == (): + return True + for portname, nmtensor in node[1]: + nd = create_node(nmtensor.producer, nmtensor.producer_args) + if nd not in processed_nodes: + if cached_training_state and cached_training_state.check_tensor_cached(nmtensor.unique_name): + continue + return False + return True + + hooks = leaf_nmtensors if isinstance(leaf_nmtensors, list) else [leaf_nmtensors] + + # ensures that no tensors are processed twice + processed_nmtensors = set() + + indices_to_remove = [] + # Check for duplicates in hook + for i, nmtensor in enumerate(hooks): + if nmtensor in processed_nmtensors: + indices_to_remove.append(i) + else: + processed_nmtensors.add(nmtensor) + + for i in reversed(indices_to_remove): + hooks.pop(i) + + _top_sorted_modules = [] + all_nodes = {} + + # extract all nodes to all_nodes set + hooks_lst = list(hooks) + while len(hooks_lst) > 0: + # take nmtensor from the end of the 
list + nmtensor = hooks_lst.pop() + producer_args = nmtensor.producer_args + + node = create_node(nmtensor.producer, producer_args) + # Store nmtensor as an output of its producer + # first make sure all keys are present per output port + # and nm is inside all_nodes + if node not in all_nodes: + all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} + # second, populate output port with current nmtensor + # where applicable + all_nodes[node][nmtensor.name] = nmtensor + processed_nmtensors.add(nmtensor) + + new_tensors = set() + if producer_args is not None and producer_args != {}: + for _, new_nmtensor in producer_args.items(): + if new_nmtensor not in processed_nmtensors: + new_tensors.add(new_nmtensor) + + # TODO + if cached_training_state: + for name, input_nmtensor in producer_args.items(): + if cached_training_state.check_tensor_cached(input_nmtensor.unique_name): + new_tensors.remove(input_nmtensor) + + for new_nmtensor in new_tensors: + # put in the start of list + hooks_lst.insert(0, new_nmtensor) + + all_node_with_output = [] + # Iterate over all_nodes to create new nodes that include its output + # now all nodes have (module, input tensors, output tensors) + for node in all_nodes: + all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) + + processed_nodes = [] + while len(all_node_with_output) > 0: + for node in all_node_with_output.copy(): + # if node's in_degree is zero it can be added to + # _top_sorted_modules + # this will also reduce in_degree of its children + if is_in_degree_zero(node, processed_nodes, cached_training_state): + _top_sorted_modules.append(node) + processed_nodes.append((node[0], node[1])) + all_node_with_output.remove(node) + + # Create top_sorted_modules aka callchain + top_sorted_modules = [] + for i, m in enumerate(_top_sorted_modules): + top_sorted_modules.append((m[0], dict(m[1]), m[2])) + # Ensure that there is only one dataset in callchain + if i > 0 and isinstance(m[0], DataLayerNM): + raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") + + #TODO + if cached_training_state and isinstance(m[0], DataLayerNM): + raise ValueError("Could not compute tensor from current cached training state.") + + return top_sorted_modules + class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" @@ -100,10 +206,28 @@ def clear_dict(self): def set_tensor(self, tensor, value): self.tensor_dict[tensor.unique_name] = value + def check_tensor_cached(self, unique_name): + if self.tensor_dict[unique_name] is None: + return False + return True + def get_tensor(self, name): unique_name = AppState().tensor_names[name] return self.tensor_dict[unique_name] + def get_and_compute_tensor(self, name, action): + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + # print(callchain) + callchain.insert(0, ()) + action.nm_graph_forward_pass(callchain, self.tensor_dict) + # print(self.tensor_dict[unique_name]) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" @@ -279,8 +403,8 @@ def _perform_on_epoch_end(self, callbacks): def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: 
- if isinstance(callback, ActionCallback): - callback.action = self + # if isinstance(callback, ActionCallback): + callback.action = self def _update_callbacks( self, callbacks=None, registered_tensors=None, final_loss=None, diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py index c439d4949c9d..5055319c2cef 100755 --- a/nemo/core/neural_types/nmtensor_registry.py +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -25,7 +25,8 @@ def __init__(self): # Create the nmtensor_naming_dict # which contains a mapping of str to NMTensor.unique_name self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' - self._nmtensor_uniname_set = set(["loss"]) + # self._nmtensor_uniname_set = set(["loss"]) + self._nmtensor_uniname_dict = {"loss": None} # def summary(self): # """ Prints a nice summary. """ @@ -36,7 +37,7 @@ def __init__(self): @property def unique_names(self): - return self._nmtensor_uniname_set + return self._nmtensor_uniname_dict.keys() # def register(self, tensor: NmTensor): def register(self, tensor): @@ -44,11 +45,11 @@ def register(self, tensor): """ # Check if object is already in a set. - if tensor.unique_name in self._nmtensor_uniname_set: + if tensor.unique_name in self._nmtensor_uniname_dict: pass # Finally, add object to the set. - self._nmtensor_uniname_set.add(tensor.unique_name) + self._nmtensor_uniname_dict[tensor.unique_name] = tensor # def rename_NmTensor(self, tensor: NmTensor, new_name: str): def rename_NmTensor(self, tensor, new_name: str): @@ -81,7 +82,7 @@ def __getitem__(self, key): if key in self._nmtensor_naming_dict: key = self._nmtensor_naming_dict[key] - if key in self._nmtensor_uniname_set: + if key in self._nmtensor_uniname_dict: return key raise KeyError("A NmTensor with name `{}` don't exists!".format(key)) diff --git a/nemo/utils/neural_graph/object_registry.py b/nemo/utils/neural_graph/object_registry.py index 8e861e529944..8a6a1207e2ef 100644 --- a/nemo/utils/neural_graph/object_registry.py +++ b/nemo/utils/neural_graph/object_registry.py @@ -24,7 +24,7 @@ class ObjectRegistry(WeakSet): """ def __init__(self, base_type_name): - """ + """ Stores base type name. """ super().__init__() @@ -32,13 +32,13 @@ def __init__(self, base_type_name): def register(self, new_obj, name: str) -> str: """ - Registers a new object using the provided name. + Registers a new object using the provided name. If name is none - generates new unique name. - + Args: new_obj: An object to be registered. name: A "proposition" for the object name. - + Returns: A unique name (proposition or newly generated name). """ @@ -66,7 +66,7 @@ def register(self, new_obj, name: str) -> str: return unique_name def has(self, name: str) -> bool: - """ + """ Check if registry stores object with a given name. Args: @@ -125,7 +125,7 @@ def __eq__(self, other): """ Checks if two registers have the same content. - Args: + Args: other: The second registry object. 
""" if not isinstance(other, WeakSet): From 879fcfc1a1eabf41461b23e1070259a146a68879 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 6 May 2020 17:23:48 -0700 Subject: [PATCH 03/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 10 ++-------- nemo/core/neural_factory.py | 5 +++-- nemo/core/neural_types/neural_type.py | 1 + 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index e737b08997c2..e06511f9d130 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1161,11 +1161,7 @@ def train( for module in AppState().modules: key = module.unique_instance_id num_trainable_weights = module.num_weights - if ( - not isinstance(module, DDP) - and isinstance(module, torch.nn.Module) - and num_trainable_weights > 0 - ): + if not isinstance(module, DDP) and isinstance(module, torch.nn.Module) and num_trainable_weights > 0: # gpf = 1 # if gradient_predivide: # gpf = dist.get_world_size() @@ -1189,9 +1185,7 @@ def train( ) sync_batchnorm_group = torch.distributed.new_group(group_rank_ids) - module = nn.SyncBatchNorm.convert_sync_batchnorm( - module, process_group=sync_batchnorm_group - ) + module = nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group=sync_batchnorm_group) # By default, disable broadcast_buffers. This disables batch norm synchronization on forward # pass diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 3d9faf867e08..13ec0f110808 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -43,8 +43,9 @@ logging = nemo.logging # def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): -def topological_sort_from_leaves(leaf_nmtensors, cached_training_state = None): +def topological_sort_from_leaves(leaf_nmtensors, cached_training_state=None): from nemo.backends.pytorch.nm import DataLayerNM + def create_node(producer, producer_args): if producer_args is None: return tuple((producer, ())) @@ -141,7 +142,7 @@ def is_in_degree_zero(node, processed_nodes, cached_training_state): if i > 0 and isinstance(m[0], DataLayerNM): raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") - #TODO + # TODO if cached_training_state and isinstance(m[0], DataLayerNM): raise ValueError("Could not compute tensor from current cached training state.") diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d78d0dc9923c..d3da8a80fdf5 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -329,6 +329,7 @@ def rename(self, new_name): """ AppState().tensor_names.rename_NmTensor(self, new_name) + class NeuralTypeError(Exception): """Base class for neural type related exceptions.""" From ddbf472c56818537d3d3af9be4ab032372a0296c Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 13:31:55 -0700 Subject: [PATCH 04/40] add a reference to Actions into TrainingState, remove deprecated function Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 2 +- nemo/core/callbacks.py | 14 +---- nemo/core/neural_factory.py | 99 ++++---------------------------- 3 files changed, 12 insertions(+), 103 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index e06511f9d130..90ea4d34a490 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -993,7 +993,7 @@ def train( gradient_predivide=False, amp_max_loss_scale=2.0 ** 
24, ): - self._training_state = TrainingState() + self._training_state = TrainingState(self) # Analyse the arguments passed to train. if tensors_to_optimize is not None and training_graph is not None: raise ValueError("Cannot pass both `tensors_to_optimize` and `training_graph` to the train() function") diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 17dbf890f76c..13a01c38390e 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -38,18 +38,6 @@ class NeMoCallback(ABC): - def __init__(self): - self._action = None - - @property - def action(self): - """TODO remove""" - return self._action - - @action.setter - def action(self, action_obj): - self._action = action_obj - def on_action_start(self, state): pass @@ -84,7 +72,7 @@ def on_iteration_end(self, state): for tensor_key in self.tensors_to_log: tensor = state["tensors"].get_tensor(tensor_key) if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key, self.action) + tensor = state["tensors"].get_and_compute_tensor(tensor_key) logging.info("%s: %s", tensor_key, tensor) # except KeyError: # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 13ec0f110808..fbfc87ec4abf 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -34,9 +34,9 @@ import numpy as np import nemo -from ..utils import ExpManager -from .callbacks import ActionCallback, EvaluatorCallback -from .neural_types import * +from nemo.utils import ExpManager +from nemo.core.callbacks import ActionCallback, EvaluatorCallback +from nemo.core.neural_types import NmTensor from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -193,9 +193,10 @@ class DeviceType(Enum): class TrainingState: - def __init__(self): + def __init__(self, action): tensor_naming_registery = AppState().tensor_names self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action def tensor_list(self): return self.tensor_dict.keys() @@ -216,7 +217,7 @@ def get_tensor(self, name): unique_name = AppState().tensor_names[name] return self.tensor_dict[unique_name] - def get_and_compute_tensor(self, name, action): + def get_and_compute_tensor(self, name): unique_name = AppState().tensor_names[name] tensor_value = self.tensor_dict[unique_name] if tensor_value is None: @@ -224,7 +225,7 @@ def get_and_compute_tensor(self, name, action): callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) # print(callchain) callchain.insert(0, ()) - action.nm_graph_forward_pass(callchain, self.tensor_dict) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) # print(self.tensor_dict[unique_name]) tensor_value = self.tensor_dict[unique_name] return tensor_value @@ -239,7 +240,7 @@ def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxpr self._optim_level = optimization_level self.step = None self.epoch_num = None - self._training_state = TrainingState() + self._training_state = None @property def state(self): @@ -404,8 +405,8 @@ def _perform_on_epoch_end(self, callbacks): def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - # if isinstance(callback, ActionCallback): - callback.action = self + if isinstance(callback, ActionCallback): + callback.action = self def _update_callbacks( self, callbacks=None, 
registered_tensors=None, final_loss=None, @@ -617,86 +618,6 @@ def __name_import(name): mod = getattr(mod, comp) return mod - @deprecated(version=0.11) - def __get_pytorch_module(self, name, collection, params, pretrained): - # TK: "factory" is not passed as parameter anymore. - # params["factory"] = self - - if collection == "toys" or collection == "tutorials" or collection == "other": - constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.tutorials." + name) - elif collection == "nemo_nlp": - constructor = NeuralModuleFactory.__name_import("nemo_nlp." + name) - if name == "BERT" and pretrained is True: - params["pretrained"] = True - elif collection == "nemo_asr": - constructor = NeuralModuleFactory.__name_import("nemo_asr." + name) - elif collection == "nemo_lpr": - constructor = NeuralModuleFactory.__name_import("nemo_lpr." + name) - elif collection == 'common': - constructor = NeuralModuleFactory.__name_import('nemo.backends.pytorch.common.' + name) - elif collection == "torchvision": - import torchvision.models as tv_models - import nemo.backends.pytorch.module_wrapper as mw - import torch.nn as nn - - if name == "ImageFolderDataLayer": - constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.torchvision.data." + name) - instance = constructor(**params) - return instance - else: - _nm_name = name.lower() - if _nm_name == "resnet18": - input_ports = { - "x": NeuralType( - { - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag), - 2: AxisType(HeightTag, 224), - 3: AxisType(WidthTag, 224), - } - ) - } - output_ports = {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} - - pt_model = tv_models.resnet18(pretrained=pretrained) - num_classes = params.get("num_classes", None) - if num_classes is not None: - pt_model.fc = nn.Linear(512, params["num_classes"]) - return mw.TrainableNeuralModuleWrapper( - pt_nn_module=pt_model, input_ports_dict=input_ports, output_ports_dict=output_ports, - ) - elif _nm_name == "resnet50": - input_ports = { - "x": NeuralType( - { - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag), - 2: AxisType(HeightTag, 224), - 3: AxisType(WidthTag, 224), - } - ) - } - output_ports = {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} - - pt_model = tv_models.resnet50(pretrained=pretrained) - num_classes = params.get("num_classes", None) - if num_classes is not None: - pt_model.fc = nn.Linear(2048, params["num_classes"]) - return mw.TrainableNeuralModuleWrapper( - pt_nn_module=pt_model, input_ports_dict=input_ports, output_ports_dict=output_ports, - ) - else: - collection_path = "nemo.collections." + collection + "." + name - constructor = NeuralModuleFactory.__name_import(collection_path) - if name == "BERT" and pretrained is True: - params["pretrained"] = True - - # TK: "placement" is not passed as parameter anymore. 
- # if "placement" not in params: - # params["placement"] = self._placement - instance = constructor(**params) - return instance - @deprecated(version=0.11) def get_module(self, name, collection, params, pretrained=False): """ From 912d83d409001c5c57d95019a3ab563729e1f74b Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 14:13:34 -0700 Subject: [PATCH 05/40] add decorators; add all events Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 6 +++ nemo/backends/pytorch/actions.py | 4 +- nemo/core/callbacks.py | 91 +++++++++++++++++++++++++++++--- nemo/core/neural_factory.py | 56 +++++++++++++++++--- 4 files changed, 139 insertions(+), 18 deletions(-) diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py index f06055baec8a..bd0f2ec99b48 100755 --- a/examples/asr/jasper_an4_debug.py +++ b/examples/asr/jasper_an4_debug.py @@ -107,6 +107,12 @@ def create_dags(model_config_file, vocab, args, nf): # callbacks = [train_callback, checkpointer_callback, eval_callback] callbacks = [train_callback] + @nemo.core.callbacks.on_step_start + def my_own_func(state): + print(state) + + callbacks.append(my_own_func) + # Return entities required by the actual training. return ( loss, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 90ea4d34a490..dec18f0e9ea9 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1258,7 +1258,7 @@ def train( curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks - self._perform_on_iteration_start(callbacks=callbacks) + self._perform_on_step_start(callbacks=callbacks) # set learning rate policy if lr_policy is not None: @@ -1362,7 +1362,7 @@ def train( self._update_callbacks( callbacks=callbacks, registered_tensors=self.training_state.tensor_dict, final_loss=final_loss ) - self._perform_on_iteration_end(callbacks=callbacks) + self._perform_on_step_end(callbacks=callbacks) self.step += 1 self.training_state.clear_dict() # End of epoch for loop diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 13a01c38390e..9ce148507b02 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -38,28 +38,33 @@ class NeMoCallback(ABC): - def on_action_start(self, state): + def on_train_start(self, state): pass - def on_action_end(self, state): + def on_epoch_start(self, state): pass - def on_epoch_start(self, state): + def on_batch_start(self, state): pass - def on_epoch_end(self, state): + def on_step_start(self, state): pass - def on_iteration_start(self, state): + def on_step_end(self, state): pass - def on_iteration_end(self, state): + def on_batch_end(self, state): + pass + + def on_epoch_end(self, state): + pass + + def on_train_end(self, state): pass class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): - super().__init__() # Step_freq: how often logs are printed self.step_freq = step_freq self.tensors_to_log = tensors_to_log @@ -67,7 +72,7 @@ def __init__(self, step_freq=100, tensors_to_log=["loss"]): # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): # #tensors_to_log: List of keys into state that will be logged - def on_iteration_end(self, state): + def on_step_end(self, state): if state["step"] % self.step_freq == 0: for tensor_key in self.tensors_to_log: tensor = state["tensors"].get_tensor(tensor_key) @@ -78,6 +83,76 @@ def on_iteration_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but 
the tensor was not found in the state_dict. " # f"Current state tensors include {state['tensors'].tensor_list()}") +def on_train_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_train_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_epoch_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_epoch_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_batch_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_batch_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_step_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_step_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_step_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_step_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_batch_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_batch_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_epoch_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_epoch_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_train_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_train_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) class ActionCallback(ABC): """Abstract interface for callbacks. 
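
A minimal usage sketch of the decorator helpers defined above, mirroring the jasper_an4_debug.py example from this patch series. The function and class names below are illustrative only and are not part of the diff; the import paths, the decorator, the NeMoCallback base class, and the "step"/"tensors" keys of `state` are the ones introduced in these patches.

    import nemo

    @nemo.core.callbacks.on_step_start
    def print_step(state):
        # `state` is the dict the trainer passes to every callback event;
        # "step" and "tensors" are among its keys in this patch series.
        if state["step"] % 100 == 0:
            print("starting step", state["step"])

    class LossPrinter(nemo.core.callbacks.NeMoCallback):
        # Class-based equivalent: override only the events of interest.
        def on_step_end(self, state):
            # get_tensor returns None if the tensor has not been computed
            # this step (SimpleLossLogger makes the same check).
            loss = state["tensors"].get_tensor("loss")
            if loss is not None:
                print("loss:", loss)

    # Both objects are NeMoCallback instances, so either can be appended to
    # the callbacks list handed to train(), as the example script does.
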
diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index fbfc87ec4abf..3416c5fc8032 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -35,7 +35,7 @@ import nemo from nemo.utils import ExpManager -from nemo.core.callbacks import ActionCallback, EvaluatorCallback +from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -352,55 +352,95 @@ def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): """ pass - def _perform_on_iteration_start(self, callbacks): + def _perform_on_step_start(self, callbacks): # TODO: Most of these checks can be relaxed since we enforce callbacks # to be a list of ActionCallback objects if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_iteration_start() + elif isinstance(callback, NeMoCallback): + callback.on_step_start(self.state) else: - callback.on_iteration_start(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") - def _perform_on_iteration_end(self, callbacks): + def _perform_on_step_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_iteration_end() + elif isinstance(callback, NeMoCallback): + callback.on_step_end(self.state) else: - callback.on_iteration_end(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_action_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_action_start() + elif isinstance(callback, NeMoCallback): + callback.on_train_start(self.state) else: - callback.on_action_start(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_action_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_action_end() + elif isinstance(callback, NeMoCallback): + callback.on_train_end(self.state) else: - callback.on_action_end(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_epoch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_epoch_start() - else: + elif isinstance(callback, NeMoCallback): callback.on_epoch_start(self.state) + else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_epoch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_epoch_end() + elif isinstance(callback, NeMoCallback): + callback.on_epoch_end(self.state) + else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") + + def _perform_on_batch_start(self, 
callbacks): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): + callback.on_epoch_start(self.state) else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") + + def _perform_on_batch_end(self, callbacks): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): callback.on_epoch_end(self.state) + else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: From 2e4eb18cf9542a2d3e9b5f29243381a84da1605d Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 14:15:17 -0700 Subject: [PATCH 06/40] style Signed-off-by: Jason --- nemo/core/callbacks.py | 18 ++++++++++++++++ nemo/core/neural_factory.py | 42 ++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 9ce148507b02..5780818352df 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -83,12 +83,15 @@ def on_step_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " # f"Current state tensors include {state['tensors'].tensor_list()}") + def on_train_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_train_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -96,8 +99,10 @@ def on_epoch_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_epoch_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -105,8 +110,10 @@ def on_batch_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_batch_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -114,8 +121,10 @@ def on_step_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_step_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -123,8 +132,10 @@ def on_step_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_step_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -132,8 +143,10 @@ def on_batch_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_batch_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -141,8 +154,10 @@ def on_epoch_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_epoch_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -150,10 +165,13 @@ def on_train_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_train_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) + class ActionCallback(ABC): """Abstract interface for callbacks. 
""" diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 3416c5fc8032..b9d6662c543e 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -34,9 +34,9 @@ import numpy as np import nemo -from nemo.utils import ExpManager from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor +from nemo.utils import ExpManager from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -362,8 +362,9 @@ def _perform_on_step_start(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_step_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_step_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -373,8 +374,9 @@ def _perform_on_step_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_step_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_action_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -384,8 +386,9 @@ def _perform_on_action_start(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_train_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_action_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -395,8 +398,9 @@ def _perform_on_action_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_train_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_epoch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -406,8 +410,9 @@ def _perform_on_epoch_start(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_epoch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -417,8 +422,9 @@ def _perform_on_epoch_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_batch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -428,8 +434,9 @@ def _perform_on_batch_start(self, 
callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_batch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -439,8 +446,9 @@ def _perform_on_batch_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: From 35d6b7d4306559cf026ba676422211826cafc858 Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 14:15:47 -0700 Subject: [PATCH 07/40] more style Signed-off-by: Jason --- nemo/core/neural_factory.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index b9d6662c543e..79c357582f57 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -363,7 +363,7 @@ def _perform_on_step_start(self, callbacks): callback.on_step_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_step_end(self, callbacks): @@ -375,7 +375,7 @@ def _perform_on_step_end(self, callbacks): callback.on_step_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_action_start(self, callbacks): @@ -387,7 +387,7 @@ def _perform_on_action_start(self, callbacks): callback.on_train_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_action_end(self, callbacks): @@ -399,7 +399,7 @@ def _perform_on_action_end(self, callbacks): callback.on_train_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_epoch_start(self, callbacks): @@ -411,7 +411,7 @@ def _perform_on_epoch_start(self, callbacks): callback.on_epoch_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_epoch_end(self, callbacks): @@ -423,7 +423,7 @@ def _perform_on_epoch_end(self, callbacks): callback.on_epoch_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_batch_start(self, callbacks): @@ -435,7 +435,7 @@ def _perform_on_batch_start(self, callbacks): 
callback.on_epoch_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_batch_end(self, callbacks): @@ -447,7 +447,7 @@ def _perform_on_batch_end(self, callbacks): callback.on_epoch_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _init_callbacks(self, callbacks): From 4f6e1f71691ec751969dff6ce50b14b2208dee28 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 15 May 2020 17:02:41 -0700 Subject: [PATCH 08/40] initial refactor Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 155 +++++++++++++++ nemo/core/callbacks.py | 101 ++++++---- nemo/core/neural_factory.py | 314 +++++++++++++------------------ 3 files changed, 356 insertions(+), 214 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index dec18f0e9ea9..ee9f9cd6ce13 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -50,6 +50,45 @@ } +class TrainingState: + def __init__(self, action): + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action + + def tensor_list(self): + return self.tensor_dict.keys() + + def clear_dict(self): + for name in self.tensor_dict: + self.tensor_dict[name] = None + + def set_tensor(self, tensor, value): + self.tensor_dict[tensor.unique_name] = value + + def check_tensor_cached(self, unique_name): + if self.tensor_dict[unique_name] is None: + return False + return True + + def get_tensor(self, name): + unique_name = AppState().tensor_names[name] + return self.tensor_dict[unique_name] + + def get_and_compute_tensor(self, name): + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + # print(callchain) + callchain.insert(0, ()) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) + # print(self.tensor_dict[unique_name]) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + + class PtActions(Actions): def __init__( self, local_rank=None, global_rank=None, tb_writer=None, optimization_level=Optimization.mxprO0, @@ -993,6 +1032,122 @@ def train( gradient_predivide=False, amp_max_loss_scale=2.0 ** 24, ): + def _perform_on_step_start(callbacks, state): + # TODO: Most of these checks can be relaxed since we enforce callbacks + # to be a list of ActionCallback objects + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_iteration_start() + elif isinstance(callback, NeMoCallback): + callback.on_step_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_step_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_iteration_end() + elif isinstance(callback, NeMoCallback): + callback.on_step_end(state) + 
else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_action_start(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_action_start() + elif isinstance(callback, NeMoCallback): + callback.on_train_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_action_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_action_end() + elif isinstance(callback, NeMoCallback): + callback.on_train_end(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_epoch_start(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_epoch_start() + elif isinstance(callback, NeMoCallback): + callback.on_epoch_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_epoch_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_epoch_end() + elif isinstance(callback, NeMoCallback): + callback.on_epoch_end(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_batch_start(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): + callback.on_epoch_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_batch_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): + callback.on_epoch_end(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _init_callbacks(callbacks, action): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.action = action + + def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): + # if self.local_rank is None or self.local_rank == 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback._registered_tensors = registered_tensors + else: # For now, we can use the old callback function. 
In the future we should improve this + registered_tensors["loss"] = final_loss + + def get_state(self): + return {"step": self.step, "tensors": self._training_state, "epoch_num":self.epoch_num, "optimizer": self.optimizers} + self._training_state = TrainingState(self) # Analyse the arguments passed to train. if tensors_to_optimize is not None and training_graph is not None: diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 5780818352df..ecdb38fa30dc 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -26,6 +26,7 @@ import nemo from nemo.utils import get_checkpoint_from_dir +from nemo.utils.app_state import AppState try: import wandb @@ -63,6 +64,46 @@ def on_train_end(self, state): pass +class TensorboardLogger(NeMoCallback): + def __init__(self, step_freq=100, tensors_to_log=["loss"]): + # Step_freq: how often logs are printed + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): + # #tensors_to_log: List of keys into state that will be logged + + def on_step_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + if tensor is None: + tensor = state["tensors"].get_and_compute_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + # except KeyError: + # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " + # f"Current state tensors include {state['tensors'].tensor_list()}") + +class WandBLogger(NeMoCallback): + def __init__(self, step_freq=100, tensors_to_log=["loss"]): + # Step_freq: how often logs are printed + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): + # #tensors_to_log: List of keys into state that will be logged + + def on_step_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + if tensor is None: + tensor = state["tensors"].get_and_compute_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + # except KeyError: + # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. 
" + # f"Current state tensors include {state['tensors'].tensor_list()}") + class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): # Step_freq: how often logs are printed @@ -374,7 +415,7 @@ def on_iteration_end(self): logging.info(f"Step time: {run_time} seconds") -class CheckpointCallback(ActionCallback): +class CheckpointCallback(NeMoCallback): """ For callback documentation: please see https://nvidia.github.io/NeMo/tutorials/callbacks.html @@ -400,14 +441,14 @@ def __init__( # If True, run will fail if we cannot load module weights self._force_load = force_load - def __save_to(self, path): - if self.global_rank is not None and self.global_rank != 0: + def __save_to(self, path, state): + if state.global_rank is not None and state.global_rank != 0: return if not os.path.isdir(path): logging.info(f"Creating {path} folder") os.makedirs(path, exist_ok=True) unique_mod_names = set() - for module in self.action.modules: + for module in AppState().modules: if module.num_weights > 0: if str(module) in unique_mod_names: raise NotImplementedError( @@ -416,19 +457,19 @@ def __save_to(self, path): ) unique_mod_names.add(str(module)) if self._step_freq > -1: - filename = f"{module}-STEP-{self.step}.pt" + filename = f"{module}-STEP-{state.step}.pt" else: - filename = f"{module}-EPOCH-{self.epoch_num}.pt" + filename = f"{module}-EPOCH-{state.epoch_num}.pt" module.save_to(os.path.join(path, filename)) if self._step_freq > -1: - filename = f"trainer-STEP-{self.step}.pt" - self.action.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{self.step}.pt') + filename = f"trainer-STEP-{state.step}.pt" + state.save_state_to(f'{path}/{filename}') + self._saved_ckpts.append(f'-{state.step}.pt') else: - filename = f"trainer-EPOCH-{self.epoch_num}.pt" - self.action.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{self.epoch_num}.pt') + filename = f"trainer-EPOCH-{state.epoch_num}.pt" + state.save_state_to(f'{path}/{filename}') + self._saved_ckpts.append(f'-{state.epoch_num}.pt') if len(self._saved_ckpts) > self._ckpt2keep: for end in self._saved_ckpts[: -self._ckpt2keep]: @@ -437,7 +478,7 @@ def __save_to(self, path): self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] logging.info(f'Saved checkpoint: {path}/{filename}') - def __restore_from(self, path): + def __restore_from(self, path, state): if not os.path.isdir(path): if self._force_load: raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.") @@ -446,7 +487,7 @@ def __restore_from(self, path): logging.info(f"Found checkpoint folder {path}. 
Will attempt to restore checkpoints from it.") modules_to_restore = [] modules_to_restore_name = [] - for module in self.action.modules: + for module in AppState().modules: if module.num_weights > 0: modules_to_restore.append(module) modules_to_restore_name.append(str(module)) @@ -454,7 +495,7 @@ def __restore_from(self, path): module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) for mod, checkpoint in zip(modules_to_restore, module_checkpoints): - mod.restore_from(checkpoint, self.local_rank) + mod.restore_from(checkpoint, state.local_rank) except (BaseException, ValueError) as e: if self._force_load: raise ValueError( @@ -469,8 +510,8 @@ def __restore_from(self, path): try: trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) - for tr, checkpoint in zip([self.action], trainer_checkpoints): - tr.restore_state_from(checkpoint) + state.restore_state_from(checkpoint) + # for tr, checkpoint in zip([self.action], trainer_checkpoints): except (BaseException, ValueError) as e: logging.warning(e) logging.warning( @@ -479,10 +520,10 @@ def __restore_from(self, path): ) return - def on_action_start(self): + def on_train_start(self, state): num_parameters = 0 unique_mod_names = set() - for module in self.action.modules: + for module in AppState().modules: if module.num_weights > 0: if str(module) in unique_mod_names: raise NotImplementedError( @@ -491,29 +532,25 @@ def on_action_start(self): ) unique_mod_names.add(str(module)) num_parameters += module.num_weights - logging.info(f"Found {len(unique_mod_names)} modules with " f"weights:") + logging.info(f"Found {len(unique_mod_names)} modules with weights:") for name in unique_mod_names: logging.info(f"{name}") logging.info(f"Total model parameters: {num_parameters}") self.__restore_from(path=self._load_from_folder) - def on_iteration_end(self): - step = self.step + def on_step_end(self, state): + step = state["step"] if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: self.__save_to(path=self._folder) - def on_action_end(self): + def on_train_end(self, state): if self._step_freq > 0 or self._epoch_freq > 0: self.__save_to(path=self._folder) - def on_epoch_start(self): - self._last_epoch_start = time.time() - - def on_epoch_end(self): - if self._epoch_freq > 0: - if self.global_rank is None or self.global_rank == 0: - if (self.epoch_num + 1) % self._epoch_freq == 0: - self.__save_to(path=self._folder) + def on_epoch_end(self, state): + epoch = state["epoch"] + if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: + self.__save_to(path=self._folder) class EvaluatorCallback(ActionCallback): @@ -712,7 +749,7 @@ def on_iteration_start(self): m.unfreeze() -class WandbCallback(ActionCallback): +class OldWandbCallback(ActionCallback): """ Log metrics to [Weights & Biases](https://docs.wandb.com/) """ diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 79c357582f57..f80df8c39b23 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -192,45 +192,6 @@ class DeviceType(Enum): AllGpu = 3 -class TrainingState: - def __init__(self, action): - tensor_naming_registery = AppState().tensor_names - self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) - self._action = action - - def tensor_list(self): - return self.tensor_dict.keys() - - def clear_dict(self): - for name in self.tensor_dict: - self.tensor_dict[name] = None - - def set_tensor(self, tensor, value): - self.tensor_dict[tensor.unique_name] = value - - def 
check_tensor_cached(self, unique_name): - if self.tensor_dict[unique_name] is None: - return False - return True - - def get_tensor(self, name): - unique_name = AppState().tensor_names[name] - return self.tensor_dict[unique_name] - - def get_and_compute_tensor(self, name): - unique_name = AppState().tensor_names[name] - tensor_value = self.tensor_dict[unique_name] - if tensor_value is None: - nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] - callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) - # print(callchain) - callchain.insert(0, ()) - self._action.nm_graph_forward_pass(callchain, self.tensor_dict) - # print(self.tensor_dict[unique_name]) - tensor_value = self.tensor_dict[unique_name] - return tensor_value - - class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" @@ -238,17 +199,6 @@ def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxpr self._local_rank = local_rank self._global_rank = global_rank self._optim_level = optimization_level - self.step = None - self.epoch_num = None - self._training_state = None - - @property - def state(self): - return {"step": self.step, "tensors": self.training_state} - - @property - def training_state(self): - return self._training_state @property def local_rank(self): @@ -312,29 +262,29 @@ def infer(self, tensors: List[NmTensor]): """ pass - @abstractmethod - def save_state_to(self, path: str): - """ - Saves current state such as step, epoch and optimizer parameters - Args: - path: + # @abstractmethod + # def save_state_to(self, path: str): + # """ + # Saves current state such as step, epoch and optimizer parameters + # Args: + # path: - Returns: + # Returns: - """ - pass + # """ + # pass - @abstractmethod - def restore_state_from(self, path: str): - """ - Restores state such as step, epoch and optimizer parameters - Args: - path: + # @abstractmethod + # def restore_state_from(self, path: str): + # """ + # Restores state such as step, epoch and optimizer parameters + # Args: + # path: - Returns: + # Returns: - """ - pass + # """ + # pass @abstractmethod def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): @@ -352,120 +302,120 @@ def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): """ pass - def _perform_on_step_start(self, callbacks): - # TODO: Most of these checks can be relaxed since we enforce callbacks - # to be a list of ActionCallback objects - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_iteration_start() - elif isinstance(callback, NeMoCallback): - callback.on_step_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_step_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_iteration_end() - elif isinstance(callback, NeMoCallback): - callback.on_step_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_action_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_action_start() - elif 
isinstance(callback, NeMoCallback): - callback.on_train_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_action_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_action_end() - elif isinstance(callback, NeMoCallback): - callback.on_train_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_epoch_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_epoch_start() - elif isinstance(callback, NeMoCallback): - callback.on_epoch_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_epoch_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_epoch_end() - elif isinstance(callback, NeMoCallback): - callback.on_epoch_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_batch_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - continue - elif isinstance(callback, NeMoCallback): - callback.on_epoch_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_batch_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - continue - elif isinstance(callback, NeMoCallback): - callback.on_epoch_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _init_callbacks(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.action = self - - def _update_callbacks( - self, callbacks=None, registered_tensors=None, final_loss=None, - ): - # if self.local_rank is None or self.local_rank == 0: - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback._registered_tensors = registered_tensors - else: # For now, we can use the old callback function. 
In the future we should improve this - self.training_state.tensor_dict["loss"] = final_loss + # def _perform_on_step_start(self, callbacks): + # # TODO: Most of these checks can be relaxed since we enforce callbacks + # # to be a list of ActionCallback objects + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_iteration_start() + # elif isinstance(callback, NeMoCallback): + # callback.on_step_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_step_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_iteration_end() + # elif isinstance(callback, NeMoCallback): + # callback.on_step_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_action_start(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_action_start() + # elif isinstance(callback, NeMoCallback): + # callback.on_train_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_action_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_action_end() + # elif isinstance(callback, NeMoCallback): + # callback.on_train_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_epoch_start(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_epoch_start() + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_epoch_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_epoch_end() + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_batch_start(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # continue + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_batch_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if 
isinstance(callback, ActionCallback): + # continue + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _init_callbacks(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.action = self + + # def _update_callbacks( + # self, callbacks=None, registered_tensors=None, final_loss=None, + # ): + # # if self.local_rank is None or self.local_rank == 0: + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback._registered_tensors = registered_tensors + # else: # For now, we can use the old callback function. In the future we should improve this + # self.training_state.tensor_dict["loss"] = final_loss def _str_to_opt_level(opt_str: str) -> Optimization: From 3c7b89e216f137a0812c4df5f03d539f8960632b Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 15 May 2020 17:18:25 -0700 Subject: [PATCH 09/40] adding checkpoint callback Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 7 ++-- nemo/backends/pytorch/actions.py | 72 +++++++++++++++++++++++--------- nemo/core/callbacks.py | 30 ++++++------- 3 files changed, 72 insertions(+), 37 deletions(-) diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py index bd0f2ec99b48..761c674ad3be 100755 --- a/examples/asr/jasper_an4_debug.py +++ b/examples/asr/jasper_an4_debug.py @@ -93,7 +93,7 @@ def create_dags(model_config_file, vocab, args, nf): predictions.rename("test") train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["loss", "test"]) - # checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) + checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) # eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e] # eval_callback = nemo.core.EvaluatorCallback( @@ -105,11 +105,12 @@ def create_dags(model_config_file, vocab, args, nf): # eval_at_start=not args.do_not_eval_at_start, # ) # callbacks = [train_callback, checkpointer_callback, eval_callback] - callbacks = [train_callback] + callbacks = [train_callback, checkpointer_callback] @nemo.core.callbacks.on_step_start def my_own_func(state): - print(state) + if state["step"] % 100 == 0: + print(state) callbacks.append(my_own_func) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index ee9f9cd6ce13..f3b249417350 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,7 +21,7 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves +from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves from nemo.core.neural_types import * from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -1145,8 +1145,40 @@ def _update_callbacks(callbacks=None, registered_tensors=None, 
final_loss=None): else: # For now, we can use the old callback function. In the future we should improve this registered_tensors["loss"] = final_loss - def get_state(self): - return {"step": self.step, "tensors": self._training_state, "epoch_num":self.epoch_num, "optimizer": self.optimizers} + def get_state(action): + class StateWrapper(dict): + def restore_state_from(self, path): + if os.path.isfile(path): + # map_location could be cuda: but cpu seems to be more + # general since we are also saving step and epoch_num + # load_state_dict should move the variables to the relevant device + checkpoint = torch.load(path, map_location="cpu") + self.step = checkpoint["step"] + self.epoch_num = checkpoint["epoch_num"] + if checkpoint["optimizer_state"]: + for opt, opt_chkpt in zip(self["optimizers"], checkpoint["optimizer_state"]): + opt.load_state_dict(opt_chkpt) + else: + raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) + + def save_state_to(self, path): + state = { + "step": self["step"], + "epoch_num": self["epoch"], + "optimizer_state": [opt.state_dict() for opt in self["optimizers"]], + } + torch.save(state, path) + + return StateWrapper( + { + "step": action.step, + "tensors": action._training_state, + "epoch": action.epoch_num, + "local_rank": action.local_rank, + "global_rank": action.global_rank, + "optimizers": action.optimizers, + } + ) self._training_state = TrainingState(self) # Analyse the arguments passed to train. @@ -1181,9 +1213,9 @@ def get_state(self): if tensors_to_optimize is None: # This is Evaluation Mode - self._init_callbacks(callbacks) + _init_callbacks(callbacks, self) # Do action start callbacks - self._perform_on_action_end(callbacks=callbacks) + _perform_on_action_end(callbacks, get_state(self)) return # Check if tensors_to_optimize is just a list of NmTensors elif tensors_to_optimize is not None and ( @@ -1385,9 +1417,9 @@ def get_state(self): train_dataloader = dataNM.data_iterator train_sampler = None - self._init_callbacks(callbacks) + _init_callbacks(callbacks, self) # Do action start callbacks - self._perform_on_action_start(callbacks=callbacks) + _perform_on_action_start(callbacks, get_state(self)) nan_or_inf = False @@ -1400,7 +1432,7 @@ def get_state(self): break # Register epochs start with callbacks - self._perform_on_epoch_start(callbacks=callbacks) + _perform_on_epoch_start(callbacks, get_state(self)) # iteration over batches in epoch batch_counter = 0 @@ -1413,7 +1445,7 @@ def get_state(self): curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks - self._perform_on_step_start(callbacks=callbacks) + _perform_on_step_start(callbacks, get_state(self)) # set learning rate policy if lr_policy is not None: @@ -1445,18 +1477,18 @@ def get_state(self): for t, d in zip(curr_call_chain[0][2].values(), tensors): if t is not None: - self.training_state.set_tensor(t, d) + self._training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) self.__nm_graph_forward_pass( - call_chain=curr_call_chain, registered_tensors=self.training_state.tensor_dict, + call_chain=curr_call_chain, registered_tensors=self._training_state.tensor_dict, ) curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: if ( - torch.isnan(self.training_state.tensor_dict[tensor.unique_name]).any() - or torch.isinf(self.training_state.tensor_dict[tensor.unique_name]).any() + 
torch.isnan(self._training_state.tensor_dict[tensor.unique_name]).any() + or torch.isinf(self._training_state.tensor_dict[tensor.unique_name]).any() ): if ( (stop_on_nan_loss) @@ -1472,7 +1504,7 @@ def get_state(self): ) else: logging.warning('Loss is NaN or inf, continuing training') - final_loss += self.training_state.tensor_dict[tensor.unique_name] + final_loss += self._training_state.tensor_dict[tensor.unique_name] if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: @@ -1514,21 +1546,21 @@ def get_state(self): curr_optimizer.step() batch_counter = 0 # Register iteration end with callbacks - self._update_callbacks( - callbacks=callbacks, registered_tensors=self.training_state.tensor_dict, final_loss=final_loss + _update_callbacks( + callbacks, registered_tensors=self._training_state.tensor_dict, final_loss=final_loss ) - self._perform_on_step_end(callbacks=callbacks) + _perform_on_step_end(callbacks, get_state(self)) self.step += 1 - self.training_state.clear_dict() + self._training_state.clear_dict() # End of epoch for loop # Register epochs end with callbacks - self._perform_on_epoch_end(callbacks=callbacks) + _perform_on_epoch_end(callbacks, get_state(self)) self.epoch_num += 1 # Check again if we should stop on NaN/inf self._check_nan_or_inf(placement_gpu, nan_or_inf) - self._perform_on_action_end(callbacks=callbacks) + _perform_on_action_end(callbacks, get_state(self)) def infer( self, diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index ecdb38fa30dc..27ba28cbd0c7 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -84,6 +84,7 @@ def on_step_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " # f"Current state tensors include {state['tensors'].tensor_list()}") + class WandBLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): # Step_freq: how often logs are printed @@ -104,6 +105,7 @@ def on_step_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. 
" # f"Current state tensors include {state['tensors'].tensor_list()}") + class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): # Step_freq: how often logs are printed @@ -442,7 +444,7 @@ def __init__( self._force_load = force_load def __save_to(self, path, state): - if state.global_rank is not None and state.global_rank != 0: + if state["global_rank"] is not None and state["global_rank"] != 0: return if not os.path.isdir(path): logging.info(f"Creating {path} folder") @@ -457,19 +459,19 @@ def __save_to(self, path, state): ) unique_mod_names.add(str(module)) if self._step_freq > -1: - filename = f"{module}-STEP-{state.step}.pt" + filename = f"{module}-STEP-{state['step']}.pt" else: - filename = f"{module}-EPOCH-{state.epoch_num}.pt" + filename = f"{module}-EPOCH-{state['epoch']}.pt" module.save_to(os.path.join(path, filename)) if self._step_freq > -1: - filename = f"trainer-STEP-{state.step}.pt" - state.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{state.step}.pt') + filename = f"trainer-STEP-{state['step']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['step']}.pt") else: - filename = f"trainer-EPOCH-{state.epoch_num}.pt" - state.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{state.epoch_num}.pt') + filename = f"trainer-EPOCH-{state['epoch']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['epoch']}.pt") if len(self._saved_ckpts) > self._ckpt2keep: for end in self._saved_ckpts[: -self._ckpt2keep]: @@ -495,7 +497,7 @@ def __restore_from(self, path, state): module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) for mod, checkpoint in zip(modules_to_restore, module_checkpoints): - mod.restore_from(checkpoint, state.local_rank) + mod.restore_from(checkpoint, state["local_rank"]) except (BaseException, ValueError) as e: if self._force_load: raise ValueError( @@ -536,21 +538,21 @@ def on_train_start(self, state): for name in unique_mod_names: logging.info(f"{name}") logging.info(f"Total model parameters: {num_parameters}") - self.__restore_from(path=self._load_from_folder) + self.__restore_from(self._load_from_folder, state) def on_step_end(self, state): step = state["step"] if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: - self.__save_to(path=self._folder) + self.__save_to(self._folder, state) def on_train_end(self, state): if self._step_freq > 0 or self._epoch_freq > 0: - self.__save_to(path=self._folder) + self.__save_to(self._folder, state) def on_epoch_end(self, state): epoch = state["epoch"] if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: - self.__save_to(path=self._folder) + self.__save_to(self._folder, state) class EvaluatorCallback(ActionCallback): From cf41850017e07b8b7f5ff3ce074bc8d4f444a478 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 18 May 2020 14:39:03 -0700 Subject: [PATCH 10/40] enable fetching via NmTensor and string; add WandBCallback, TensorboardLoggerCallback Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 12 ++-- nemo/core/callbacks.py | 94 +++++++++++++++++++++++--------- 2 files changed, 75 insertions(+), 31 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index f3b249417350..7fce14e678d6 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -72,11 +72,10 @@ def check_tensor_cached(self, unique_name): return True def get_tensor(self, name): - unique_name = 
AppState().tensor_names[name] - return self.tensor_dict[unique_name] - - def get_and_compute_tensor(self, name): - unique_name = AppState().tensor_names[name] + if isinstance(name, NmTensor): + unique_name = name.unique_name + else: + unique_name = AppState().tensor_names[name] tensor_value = self.tensor_dict[unique_name] if tensor_value is None: nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] @@ -87,6 +86,9 @@ def get_and_compute_tensor(self, name): # print(self.tensor_dict[unique_name]) tensor_value = self.tensor_dict[unique_name] return tensor_value + # unique_name = AppState().tensor_names[name] + # return self.tensor_dict[unique_name] + # def get_and_compute_tensor(self, name): class PtActions(Actions): diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 27ba28cbd0c7..86b99647e2dd 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -65,45 +65,87 @@ def on_train_end(self, state): class TensorboardLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"]): + def __init__(self, step_freq=100, tensors_to_log=["loss"], tb_writer=None, custom_tb_log_func=None): # Step_freq: how often logs are printed self.step_freq = step_freq self.tensors_to_log = tensors_to_log + if tb_writer is None: + logging.error("There was no tb writer") + # Should grab this from default tb writer + else: + self.tb_writer = tb_writer + self.custom_tb_log_func = custom_tb_log_func + self._last_epoch_start = None # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): # #tensors_to_log: List of keys into state that will be logged - def on_step_end(self, state): - if state["step"] % self.step_freq == 0: - for tensor_key in self.tensors_to_log: - tensor = state["tensors"].get_tensor(tensor_key) - if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key) - logging.info("%s: %s", tensor_key, tensor) - # except KeyError: - # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " - # f"Current state tensors include {state['tensors'].tensor_list()}") + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + # always log epoch num and epoch_time + epoch_time = time.time() - self._last_epoch_start + self.tb_writer.add_scalar('misc/epoch', state["epoch"], state["step"]) + self.tb_writer.add_scalar('misc/epoch_time', epoch_time, state["step"]) + def on_step_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self.step_freq == 0: + tb_log_func = lambda x: self.tb_writer.add_scalar(x, state["tensors"].get_tensor(x), state["step"]) + if self.custom_tb_log_func is not None: + tb_log_func = self.custom_tb_log_func + for tensor_key in self.tensors_to_log: + tb_log_func(tensor_key) class WandBLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"]): - # Step_freq: how often logs are printed - self.step_freq = step_freq - self.tensors_to_log = tensors_to_log + def __init__(self, step_freq=100, tensors_to_log=["loss"], wandb_name=None, wandb_project=None, args=None): + if not _WANDB_AVAILABLE: + logging.error("Could not import wandb. 
Did you install it (pip install --upgrade wandb)?") + self._step_freq = step_freq + self._tensors_to_log = tensors_to_log + self._name = wandb_name + self._project = wandb_project + self._args = args + self._last_epoch_start = None - # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): - # #tensors_to_log: List of keys into state that will be logged + def on_train_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._name, project=self._project) + if self._args is not None: + wandb.config.update(self._args) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._step_freq = -1 def on_step_end(self, state): - if state["step"] % self.step_freq == 0: - for tensor_key in self.tensors_to_log: - tensor = state["tensors"].get_tensor(tensor_key) - if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key) - logging.info("%s: %s", tensor_key, tensor) - # except KeyError: - # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " - # f"Current state tensors include {state['tensors'].tensor_list()}") + # log training metrics + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self._step_freq == 0 and self._step_freq > 0: + tensors_logged = {t: state["tensors"].get_tensor(t).cpu() for t in self._tensors_to_log} + # Always log learning rate + tensors_logged['LR'] = state["learning_rate"] + self._wandb_log(tensors_logged) + + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + # always log epoch num and epoch_time + epoch_time = time.time() - self._last_epoch_start + self._wandb_log({"epoch": state["epoch"], "epoch_time": epoch_time}) + + def _wandb_log(self, tensors_logged): + if _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=state["step"]) class SimpleLossLogger(NeMoCallback): From b1df99d7edd9007904e3a5d5117317ebeb64c4f6 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 18 May 2020 14:39:31 -0700 Subject: [PATCH 11/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 1 + nemo/core/callbacks.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7fce14e678d6..cae2f6740da9 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -86,6 +86,7 @@ def get_tensor(self, name): # print(self.tensor_dict[unique_name]) tensor_value = self.tensor_dict[unique_name] return tensor_value + # unique_name = AppState().tensor_names[name] # return self.tensor_dict[unique_name] # def get_and_compute_tensor(self, name): diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 86b99647e2dd..7b26531ef30a 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -100,6 +100,7 @@ def on_step_end(self, state): for tensor_key in self.tensors_to_log: tb_log_func(tensor_key) + class WandBLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"], wandb_name=None, wandb_project=None, args=None): if not _WANDB_AVAILABLE: From fa6553f722c9cfe2e5169945be03789116372cfa Mon Sep 17 
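A minimal usage sketch for the TensorboardLogger and WandBLogger callbacks added in this patch (hedged: the constructor arguments are taken from the diffs above; `nf`, `args`, and the run/project names are placeholders, and exposing these classes under `nemo.core` is assumed by analogy with how SimpleLossLogger is used elsewhere in this series). Note that `state["tensors"].get_tensor()` now accepts either a registered string name or an NmTensor, per the actions.py hunk in this patch.

    # Assumes nf = nemo.core.NeuralModuleFactory(..., create_tb_writer=True) and a parsed
    # argparse namespace `args`, as in the AN4 example scripts in this series.
    tb_callback = nemo.core.TensorboardLogger(
        step_freq=100, tensors_to_log=["loss"], tb_writer=nf.tb_writer,
    )
    wandb_callback = nemo.core.WandBLogger(
        step_freq=100, tensors_to_log=["loss"], wandb_name="an4-run", wandb_project="nemo-debug", args=args,
    )
    callbacks = [tb_callback, wandb_callback]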
00:00:00 2001 From: Jason Date: Mon, 18 May 2020 16:14:16 -0700 Subject: [PATCH 12/40] DDP bug fix Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 47240c22f47a..c671db5644d0 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1334,6 +1334,7 @@ def save_state_to(self, path): for module in AppState().modules: key = module.unique_instance_id num_trainable_weights = module.num_weights + self.ddp_module_dict[key] = module if not isinstance(module, DDP) and isinstance(module, torch.nn.Module) and num_trainable_weights > 0: # gpf = 1 # if gradient_predivide: From ba84c807693e7d6686ebca6561da5a72ed0ad6e7 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 13:43:06 -0700 Subject: [PATCH 13/40] clean up of checkpoint Signed-off-by: Jason --- examples/asr/jasper_an4.py | 5 - nemo/backends/pytorch/actions.py | 189 ++++++++++++++++++------------- nemo/core/callbacks.py | 6 +- setup.py | 2 +- 4 files changed, 114 insertions(+), 88 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 9ac79f3d1935..93bd887ee663 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -233,13 +233,8 @@ def main(): folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, ) - # Distributed Data Parallel changes the underlying class so we need - # to reinstantiate Encoder and Decoder args.num_epochs += 10 previous_step_count = total_steps - loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf) - - nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index c671db5644d0..f5def9142109 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -131,8 +131,8 @@ def __init__( local_rank=local_rank, global_rank=global_rank, optimization_level=optimization_level, ) - self.step = 0 - self.epoch_num = 0 + self._step = 0 + self._epoch = 0 self.optimizers = [] self.tb_writer = tb_writer self.cache = None @@ -140,6 +140,27 @@ def __init__( self.ddp_initialized = False self.ddp_module_dict = {} + @property + def step(self): + return self._step + + @step.setter + def step(self, step): + self._step = step + + @property + def epoch(self): + return self._epoch + + @epoch.setter + def epoch(self, epoch): + self._epoch = epoch + + @property + @deprecated + def epoch_num(self): + return self._epoch + def __get_top_sorted_modules_and_dataloader(self, hook): """ TODO """ @@ -311,28 +332,29 @@ def __setup_optimizer( def __initialize_amp( self, optimizer, optim_level, amp_max_loss_scale=2.0 ** 24, amp_min_loss_scale=1.0, ): - if optim_level not in AmpOptimizations: - raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") - # in this case, nothing to do here - if optim_level == Optimization.mxprO0: - return optimizer - - if len(AppState().modules) < 1: - raise ValueError("There were no modules to initialize") - pt_modules = [] - for module in AppState().modules: - if isinstance(module, nn.Module): - pt_modules.append(module) - elif isinstance(module, TrainableNeuralModuleWrapper): - pt_modules.append(module._pt_module) - - _, optimizer = amp.initialize( - max_loss_scale=amp_max_loss_scale, - min_loss_scale=amp_min_loss_scale, - models=pt_modules, - optimizers=optimizer, - 
opt_level=AmpOptimizations[optim_level], - ) + if not self.amp_initialized: + if optim_level not in AmpOptimizations: + raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") + # in this case, nothing to do here + if optim_level == Optimization.mxprO0: + return optimizer + + if len(AppState().modules) < 1: + raise ValueError("There were no modules to initialize") + pt_modules = [] + for module in AppState().modules: + if isinstance(module, nn.Module): + pt_modules.append(module) + elif isinstance(module, TrainableNeuralModuleWrapper): + pt_modules.append(module._pt_module) + + _, optimizer = amp.initialize( + max_loss_scale=amp_max_loss_scale, + min_loss_scale=amp_min_loss_scale, + models=pt_modules, + optimizers=optimizer, + opt_level=AmpOptimizations[optim_level], + ) self.amp_initialized = True return optimizer @@ -816,43 +838,43 @@ def clear_cache(self): """ self.cache = None - def save_state_to(self, path: str): - """ - Saves current state such as step, epoch and optimizer parameters - Args: - path: - - Returns: - - """ - state = { - "step": self.step, - "epoch_num": self.epoch_num, - "optimizer_state": [opt.state_dict() for opt in self.optimizers], - } - torch.save(state, path) - - def restore_state_from(self, path: str): - """ - Restores state such as step, epoch and optimizer parameters - Args: - path: - - Returns: - - """ - if os.path.isfile(path): - # map_location could be cuda: but cpu seems to be more - # general since we are also saving step and epoch_num - # load_state_dict should move the variables to the relevant device - checkpoint = torch.load(path, map_location="cpu") - self.step = checkpoint["step"] - self.epoch_num = checkpoint["epoch_num"] - if checkpoint["optimizer_state"]: - for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): - opt.load_state_dict(opt_chkpt) - else: - raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) + # def save_state_to(self, path: str): + # """ + # Saves current state such as step, epoch and optimizer parameters + # Args: + # path: + + # Returns: + + # """ + # state = { + # "step": self.step, + # "epoch": self.epoch, + # "optimizer_state": [opt.state_dict() for opt in self.optimizers], + # } + # torch.save(state, path) + + # def restore_state_from(self, path: str): + # """ + # Restores state such as step, epoch and optimizer parameters + # Args: + # path: + + # Returns: + + # """ + # if os.path.isfile(path): + # # map_location could be cuda: but cpu seems to be more + # # general since we are also saving step and epoch + # # load_state_dict should move the variables to the relevant device + # checkpoint = torch.load(path, map_location="cpu") + # self.step = checkpoint["step"] + # self.epoch = checkpoint["epoch"] + # if checkpoint["optimizer_state"]: + # for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): + # opt.load_state_dict(opt_chkpt) + # else: + # raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) @staticmethod def _check_all_tensors(list_of_tensors): @@ -1133,14 +1155,32 @@ def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): def get_state(action): class StateWrapper(dict): + def __init__(self, action): + self.action = action + super().__init__( + { + "step": action.step, + "tensors": action._training_state, + "epoch": action.epoch, + "local_rank": action.local_rank, + "global_rank": action.global_rank, + "optimizers": action.optimizers, + }) def restore_state_from(self, path): if 
os.path.isfile(path): # map_location could be cuda: but cpu seems to be more - # general since we are also saving step and epoch_num + # general since we are also saving step and epoch # load_state_dict should move the variables to the relevant device checkpoint = torch.load(path, map_location="cpu") - self.step = checkpoint["step"] - self.epoch_num = checkpoint["epoch_num"] + action.step = checkpoint["step"] + self["step"] = action.step + epoch = checkpoint.get("epoch", None) + if epoch is None: + epoch = checkpoint.get("epoch_num", None) + if epoch is None: + raise ValueError("Epoch was not found in the trainer checkpoint") + action.epoch = epoch + self["epoch"] = action.epoch if checkpoint["optimizer_state"]: for opt, opt_chkpt in zip(self["optimizers"], checkpoint["optimizer_state"]): opt.load_state_dict(opt_chkpt) @@ -1150,21 +1190,12 @@ def restore_state_from(self, path): def save_state_to(self, path): state = { "step": self["step"], - "epoch_num": self["epoch"], + "epoch": self["epoch"], "optimizer_state": [opt.state_dict() for opt in self["optimizers"]], } torch.save(state, path) - return StateWrapper( - { - "step": action.step, - "tensors": action._training_state, - "epoch": action.epoch_num, - "local_rank": action.local_rank, - "global_rank": action.global_rank, - "optimizers": action.optimizers, - } - ) + return StateWrapper(action) self._training_state = TrainingState(self) # Analyse the arguments passed to train. @@ -1410,9 +1441,9 @@ def save_state_to(self, path): # MAIN TRAINING LOOP # iteration over epochs - while num_epochs is None or self.epoch_num < num_epochs: + while num_epochs is None or self.epoch < num_epochs: if train_sampler is not None: - train_sampler.set_epoch(self.epoch_num) + train_sampler.set_epoch(self.epoch) if max_steps is not None and self.step >= max_steps: break @@ -1434,7 +1465,7 @@ def save_state_to(self, path): # set learning rate policy if lr_policy is not None: - adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch_num) + adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch) for param_group in curr_optimizer.param_groups: param_group["lr"] = adjusted_lr if self.tb_writer is not None: @@ -1533,7 +1564,7 @@ def save_state_to(self, path): # End of epoch for loop # Register epochs end with callbacks _perform_on_epoch_end(callbacks, get_state(self)) - self.epoch_num += 1 + self.epoch += 1 _perform_on_action_end(callbacks, get_state(self)) def infer( diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 7b26531ef30a..dc9491f9a4b7 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -541,7 +541,7 @@ def __restore_from(self, path, state): for mod, checkpoint in zip(modules_to_restore, module_checkpoints): mod.restore_from(checkpoint, state["local_rank"]) - except (BaseException, ValueError) as e: + except (ValueError) as e: if self._force_load: raise ValueError( "force_load was set to True for checkpoint callback but a checkpoint was not found." @@ -555,9 +555,9 @@ def __restore_from(self, path, state): try: trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) - state.restore_state_from(checkpoint) + state.restore_state_from(trainer_checkpoints[0]) # for tr, checkpoint in zip([self.action], trainer_checkpoints): - except (BaseException, ValueError) as e: + except (ValueError) as e: logging.warning(e) logging.warning( "Trainer state such as optimizer state and current step/epoch was not restored. 
Pretrained weights" diff --git a/setup.py b/setup.py index a7c93c9d8c54..fae6a943613d 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def is_build_action(): if len(sys.argv) <= 1: return False - BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style"] + BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style", "clean"] if any([sys.argv[1].startswith(x) for x in BUILD_TOKENS]): return True From fc3ce629aa2cb5244c1461512589a98789cac53a Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:26:53 -0700 Subject: [PATCH 14/40] update an4 Signed-off-by: Jason --- examples/asr/jasper_an4.py | 109 +++++++++++++++++-------------- nemo/backends/pytorch/actions.py | 45 +++++++------ 2 files changed, 82 insertions(+), 72 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 93bd887ee663..419777e6b84e 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -7,6 +7,7 @@ from ruamel.yaml import YAML import nemo +from nemo.core import NeuralGraph import nemo.collections.asr as nemo_asr import nemo.utils.argparse as nm_argparse from nemo.collections.asr.helpers import ( @@ -21,62 +22,62 @@ logging = nemo.logging - def create_dags(model_config_file, vocab, args, nf): - # Create a data_layer for training. - data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( - model_config_file, - "AudioToTextDataLayer_train", - overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, - ) + with NeuralGraph() as g0: + # Create a data_layer for training. + data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( + model_config_file, + "AudioToTextDataLayer_train", + overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, + ) - num_samples = len(data_layer) - steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) - total_steps = steps_per_epoch * args.num_epochs - logging.info("Train samples=", num_samples, "num_steps=", total_steps) + num_samples = len(data_layer) + steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) + total_steps = steps_per_epoch * args.num_epochs + logging.info("Train samples=", num_samples, "num_steps=", total_steps) - # Create a data_layer for evaluation. - data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( - model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, - ) + # Create a data_layer for evaluation. + data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( + model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, + ) - num_samples = len(data_layer_eval) - logging.info(f"Eval samples={num_samples}") + num_samples = len(data_layer_eval) + logging.info(f"Eval samples={num_samples}") - # Instantiate data processor. - data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( - model_config_file, "AudioToMelSpectrogramPreprocessor" - ) + # Instantiate data processor. + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( + model_config_file, "AudioToMelSpectrogramPreprocessor" + ) - # Instantiate JASPER encoder-decoder modules. 
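# --- Illustrative sketch (not part of this patch): the construction pattern this hunk
# introduces. Only NeuralGraph and the with-block usage come from the diff; the module
# names, the helper function, and the simplified wiring (preprocessing step omitted)
# are placeholders.
from nemo.core import NeuralGraph

def build_training_graph(data_layer, encoder, decoder, ctc_loss):
    # Building the DAG inside a NeuralGraph context gives the caller a handle (g0) to the
    # whole graph, which a later patch in this series deletes and rebuilds after DDP/AMP
    # have wrapped the underlying modules.
    with NeuralGraph() as g0:
        audio, audio_len, transcript, transcript_len = data_layer()
        encoded, encoded_len = encoder(audio_signal=audio, length=audio_len)
        log_probs = decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )
    return loss, g0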
- jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") - jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( - model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} - ) + # Instantiate JASPER encoder-decoder modules. + jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") + jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( + model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} + ) - # Instantiate losses. - ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) - greedy_decoder = nemo_asr.GreedyCTCDecoder() - - # Create a training graph. - audio, audio_len, transcript, transcript_len = data_layer() - processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) - encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) - log_probs = jasper_decoder(encoder_output=encoded) - predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) - - # Create an evaluation graph. - audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() - processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) - encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) - log_probs_e = jasper_decoder(encoder_output=encoded_e) - predictions_e = greedy_decoder(log_probs=log_probs_e) - loss_e = ctc_loss( - log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, - ) - logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) + # Instantiate losses. + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) + greedy_decoder = nemo_asr.GreedyCTCDecoder() + + # Create a training graph. + audio, audio_len, transcript, transcript_len = data_layer() + processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) + encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) + log_probs = jasper_decoder(encoder_output=encoded) + predictions = greedy_decoder(log_probs=log_probs) + loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + + # Create an evaluation graph. + audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() + processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) + encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) + log_probs_e = jasper_decoder(encoder_output=encoded_e) + predictions_e = greedy_decoder(log_probs=log_probs_e) + loss_e = ctc_loss( + log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + ) + logging.error("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) # Callbacks to print info to console and Tensorboard. train_callback = nemo.core.SimpleLossLoggerCallback( @@ -107,6 +108,7 @@ def create_dags(model_config_file, vocab, args, nf): total_steps, log_probs_e, encoded_len_e, + g0 ) @@ -167,7 +169,7 @@ def main(): # Get vocabulary. 
vocab = jasper_params['labels'] - (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags( + (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e, g0) = create_dags( args.model_config, vocab, args, nf ) @@ -235,6 +237,15 @@ def main(): args.num_epochs += 10 previous_step_count = total_steps + + # Distributed Data Parallel and amp changes the underlying class so we need to reinstantiate modules + # Clear the module registery + nemo.utils.app_state.AppState().modules.clear() + # Delete old graph and make a new one + del g0 + loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf) + + nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index f5def9142109..84fd37dd72b5 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -332,29 +332,28 @@ def __setup_optimizer( def __initialize_amp( self, optimizer, optim_level, amp_max_loss_scale=2.0 ** 24, amp_min_loss_scale=1.0, ): - if not self.amp_initialized: - if optim_level not in AmpOptimizations: - raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") - # in this case, nothing to do here - if optim_level == Optimization.mxprO0: - return optimizer - - if len(AppState().modules) < 1: - raise ValueError("There were no modules to initialize") - pt_modules = [] - for module in AppState().modules: - if isinstance(module, nn.Module): - pt_modules.append(module) - elif isinstance(module, TrainableNeuralModuleWrapper): - pt_modules.append(module._pt_module) - - _, optimizer = amp.initialize( - max_loss_scale=amp_max_loss_scale, - min_loss_scale=amp_min_loss_scale, - models=pt_modules, - optimizers=optimizer, - opt_level=AmpOptimizations[optim_level], - ) + if optim_level not in AmpOptimizations: + raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") + # in this case, nothing to do here + if optim_level == Optimization.mxprO0: + return optimizer + + if len(AppState().modules) < 1: + raise ValueError("There were no modules to initialize") + pt_modules = [] + for module in AppState().modules: + if isinstance(module, nn.Module): + pt_modules.append(module) + elif isinstance(module, TrainableNeuralModuleWrapper): + pt_modules.append(module._pt_module) + + _, optimizer = amp.initialize( + max_loss_scale=amp_max_loss_scale, + min_loss_scale=amp_min_loss_scale, + models=pt_modules, + optimizers=optimizer, + opt_level=AmpOptimizations[optim_level], + ) self.amp_initialized = True return optimizer From e5b82585ac4021285fe734df6777fda9cd6ffea7 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:31:41 -0700 Subject: [PATCH 15/40] style Signed-off-by: Jason --- examples/asr/jasper_an4.py | 16 +++++----------- nemo/backends/pytorch/actions.py | 4 +++- nemo/core/callbacks.py | 2 +- nemo/core/neural_factory.py | 5 ++--- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 045a8725f757..1100234bf816 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -7,7 +7,6 @@ from ruamel.yaml import YAML import nemo -from nemo.core import NeuralGraph import nemo.collections.asr as nemo_asr import nemo.utils.argparse as nm_argparse from nemo.collections.asr.helpers import ( @@ -18,6 +17,7 @@ process_evaluation_epoch, word_error_rate, ) +from nemo.core import NeuralGraph 
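# --- Illustrative sketch (not part of this patch) of the rebuild-before-second-training
# pattern added to jasper_an4.py above: DDP and AMP replace the underlying module classes,
# so the module registry and the old graph handle are dropped and the DAG is recreated
# before calling train() again. `nf`, `args`, `vocab`, and create_dags() are assumed to be
# the ones from that script; the train() kwargs follow the standard NeuralModuleFactory
# interface, and the exact ordering of reset_trainer() relative to create_dags() shifts
# within this series.
import nemo

nemo.utils.app_state.AppState().modules.clear()   # clear the module registry
del g0                                            # drop the old NeuralGraph handle
nf.reset_trainer()
loss, eval_tensors, callbacks, total_steps, _, _, g0 = create_dags(args.model_config, vocab, args, nf)
nf.train(
    tensors_to_optimize=[loss],
    callbacks=callbacks,
    optimizer=args.optimizer,
    optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
)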
from nemo.utils import logging from nemo.utils.lr_policies import CosineAnnealing @@ -66,7 +66,9 @@ def create_dags(model_config_file, vocab, args, nf): encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + loss = ctc_loss( + log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, + ) # Create an evaluation graph. audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() @@ -101,15 +103,7 @@ def create_dags(model_config_file, vocab, args, nf): callbacks = [train_callback, checkpointer_callback, eval_callback] # Return entities required by the actual training. - return ( - loss, - eval_tensors, - callbacks, - total_steps, - log_probs_e, - encoded_len_e, - g0 - ) + return (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e, g0) def main(): diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 84fd37dd72b5..db28ec73d0d5 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1164,7 +1164,9 @@ def __init__(self, action): "local_rank": action.local_rank, "global_rank": action.global_rank, "optimizers": action.optimizers, - }) + } + ) + def restore_state_from(self, path): if os.path.isfile(path): # map_location could be cuda: but cpu seems to be more diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 96f3c80ce17e..a9a4970d0254 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -25,8 +25,8 @@ from collections import namedtuple import nemo -from nemo.utils.app_state import AppState from nemo.utils import get_checkpoint_from_dir, logging +from nemo.utils.app_state import AppState try: import wandb diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index e8d379faf292..583563819041 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -34,13 +34,11 @@ import numpy as np import nemo - from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor -from nemo.utils import ExpManager +from nemo.utils import ExpManager, logging from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated -from nemo.utils import logging # def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): @@ -149,6 +147,7 @@ def is_in_degree_zero(node, processed_nodes, cached_training_state): return top_sorted_modules + class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" From 53610035d98eb184761a1b47160d0948d5e2449e Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:39:29 -0700 Subject: [PATCH 16/40] undo comenting Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 74 ++++++++++++++++---------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index db28ec73d0d5..2b79ab26a11b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -837,43 +837,43 @@ def clear_cache(self): """ self.cache = None - # def save_state_to(self, path: str): - # """ - # Saves current state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # state = { 
- # "step": self.step, - # "epoch": self.epoch, - # "optimizer_state": [opt.state_dict() for opt in self.optimizers], - # } - # torch.save(state, path) - - # def restore_state_from(self, path: str): - # """ - # Restores state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # if os.path.isfile(path): - # # map_location could be cuda: but cpu seems to be more - # # general since we are also saving step and epoch - # # load_state_dict should move the variables to the relevant device - # checkpoint = torch.load(path, map_location="cpu") - # self.step = checkpoint["step"] - # self.epoch = checkpoint["epoch"] - # if checkpoint["optimizer_state"]: - # for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): - # opt.load_state_dict(opt_chkpt) - # else: - # raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) + def save_state_to(self, path: str): + """ + Saves current state such as step, epoch and optimizer parameters + Args: + path: + + Returns: + + """ + state = { + "step": self.step, + "epoch": self.epoch, + "optimizer_state": [opt.state_dict() for opt in self.optimizers], + } + torch.save(state, path) + + def restore_state_from(self, path: str): + """ + Restores state such as step, epoch and optimizer parameters + Args: + path: + + Returns: + + """ + if os.path.isfile(path): + # map_location could be cuda: but cpu seems to be more + # general since we are also saving step and epoch + # load_state_dict should move the variables to the relevant device + checkpoint = torch.load(path, map_location="cpu") + self.step = checkpoint["step"] + self.epoch = checkpoint["epoch"] + if checkpoint["optimizer_state"]: + for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): + opt.load_state_dict(opt_chkpt) + else: + raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) @staticmethod def _check_all_tensors(list_of_tensors): From 9fc00d77e1e46fe0893ff3b899b75e0d3575952b Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:45:04 -0700 Subject: [PATCH 17/40] unpate Signed-off-by: Jason --- tests/unit/core/test_actions_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/core/test_actions_api.py b/tests/unit/core/test_actions_api.py index ef631027de0f..f4a8ad555ac2 100644 --- a/tests/unit/core/test_actions_api.py +++ b/tests/unit/core/test_actions_api.py @@ -33,10 +33,10 @@ def test_checkpointing(self): optimizer = PtActions() optimizer.save_state_to(path) optimizer.step = 123 - optimizer.epoch_num = 324 + optimizer.epoch = 324 optimizer.restore_state_from(path) self.assertEqual(optimizer.step, 0) - self.assertEqual(optimizer.epoch_num, 0) + self.assertEqual(optimizer.epoch, 0) self.assertEqual(len(optimizer.optimizers), 0) os.remove(path) @@ -53,7 +53,7 @@ def test_multi_optimizer(self): self.assertEqual(len(optimizer.optimizers), 5) optimizer.save_state_to(path) optimizer.step = 123 - optimizer.epoch_num = 324 + optimizer.epoch = 324 for i, opt in enumerate(optimizer.optimizers): for param_group in opt.param_groups: self.assertEqual(param_group['lr'], float(i + 1)) @@ -63,6 +63,6 @@ def test_multi_optimizer(self): for param_group in opt.param_groups: self.assertEqual(param_group['lr'], float(i + 1)) self.assertEqual(optimizer.step, 0) - self.assertEqual(optimizer.epoch_num, 0) + self.assertEqual(optimizer.epoch, 0) self.assertEqual(len(optimizer.optimizers), 5) os.remove(path) From d806e7e3cdf95184c5ba081567fb32e9b6bdb7e9 Mon Sep 17 
00:00:00 2001 From: Jason Date: Wed, 20 May 2020 17:09:57 -0700 Subject: [PATCH 18/40] wip Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 12 ++++++++++++ nemo/constants.py | 1 + nemo/utils/formatters/base.py | 3 +++ nemo/utils/nemo_logging.py | 24 ++++++++++++++++-------- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 2b79ab26a11b..1d122b09b878 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1435,6 +1435,7 @@ def save_state_to(self, path): else: train_dataloader = dataNM.data_iterator train_sampler = None + logging.info("very start") _init_callbacks(callbacks, self) # Do action start callbacks @@ -1450,6 +1451,7 @@ def save_state_to(self, path): # Register epochs start with callbacks _perform_on_epoch_start(callbacks, get_state(self)) + logging.info("I'm here") # iteration over batches in epoch batch_counter = 0 @@ -1457,12 +1459,14 @@ def save_state_to(self, path): if max_steps is not None and self.step >= max_steps: break + logging.info("I'm there") if batch_counter == 0: # Started step, zero gradients curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks _perform_on_step_start(callbacks, get_state(self)) + logging.info("I'm everywhere") # set learning rate policy if lr_policy is not None: @@ -1475,6 +1479,7 @@ def save_state_to(self, path): if callbacks is not None: for callback in callbacks: callback.learning_rate = curr_optimizer.param_groups[0]['lr'] + logging.info("I'm everywhere2") # registered_tensors will contain created tensors # named by output port and uuid of module which created them @@ -1486,24 +1491,29 @@ def save_state_to(self, path): tensors = [] if isinstance(data, torch.Tensor): data = (data,) + logging.info(dl_device) for d in data: if isinstance(d, torch.Tensor): tensors.append(d.to(dl_device)) else: tensors.append(d) + logging.info("I'm everywhere3") for t, d in zip(curr_call_chain[0][2].values(), tensors): if t is not None: self._training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) + logging.info("before forward") self.__nm_graph_forward_pass( call_chain=curr_call_chain, registered_tensors=self._training_state.tensor_dict, ) + logging.info("after forward") curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: final_loss += self._training_state.tensor_dict[tensor.unique_name] + logging.info("Or there") # Check for NaN/inf loss (across workers if applicable) loss_nan_inf_checker = final_loss.clone() @@ -1519,6 +1529,7 @@ def save_state_to(self, path): logging.warning('Loss is NaN or inf. Skipping update.') continue + logging.info("Am I Here?") if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: if disable_allreduce: @@ -1549,6 +1560,7 @@ def save_state_to(self, path): batch_counter += 1 + raise ValueError if batch_counter == batches_per_step: # Ended step. 
Do optimizer update if grad_norm_clip is not None: diff --git a/nemo/constants.py b/nemo/constants.py index 6cd3a1f60ff8..9d6793d7630a 100644 --- a/nemo/constants.py +++ b/nemo/constants.py @@ -47,4 +47,5 @@ # NEMO_ENV_VARNAME_DEBUG_VERBOSITY = "NEMO_DEBUG_VERBOSITY" NEMO_ENV_VARNAME_ENABLE_COLORING = "NEMO_ENABLE_COLORING" NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR = "NEMO_REDIRECT_LOGS_TO_STDERR" +NEMO_ENV_VARNAME_TESTING = "NEMO_TESTING" # NEMO_ENV_VARNAME_SAVE_LOGS_TO_DIR = "NEMO_SAVE_LOGS_TO_DIR" diff --git a/nemo/utils/formatters/base.py b/nemo/utils/formatters/base.py index 6b844877b185..e507aaedecf5 100644 --- a/nemo/utils/formatters/base.py +++ b/nemo/utils/formatters/base.py @@ -126,3 +126,6 @@ def format(self, record): class BaseNeMoFormatter(BaseFormatter): DEFAULT_FORMAT = "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]%(end_color)s %(message)s" + +class DebugNeMoFormatter(BaseFormatter): + DEFAULT_FORMAT = "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d %(rank)d]%(end_color)s %(message)s" diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 1551acf84839..8a2bd06040d6 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -20,9 +20,9 @@ from contextlib import contextmanager # from nemo.constants import NEMO_ENV_VARNAME_SAVE_LOGS_TO_DIR -from nemo.constants import NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR +from nemo.constants import NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR, NEMO_ENV_VARNAME_TESTING from nemo.utils.env_var_parsing import get_envbool, get_envint -from nemo.utils.formatters.base import BaseNeMoFormatter +from nemo.utils.formatters.base import BaseNeMoFormatter, DebugNeMoFormatter from nemo.utils.metaclasses import Singleton __all__ = ["Logger", "LogMode"] @@ -88,7 +88,17 @@ def _define_logger(self): self._logger = _logging.getLogger("nemo_logger") # By default, silence all loggers except the logger for rank 0 self.remove_stream_handlers() - if get_envint("RANK", 0) == 0: + if get_envbool(NEMO_ENV_VARNAME_TESTING, False): + old_factory = _logging.getLogRecordFactory() + + def record_factory(*args, **kwargs): + record = old_factory(*args, **kwargs) + record.rank = get_envint("RANK", 0) + return record + + _logging.setLogRecordFactory(record_factory) + self.add_stream_handlers(formatter=DebugNeMoFormatter) + elif get_envint("RANK", 0) == 0: self.add_stream_handlers() finally: @@ -112,7 +122,7 @@ def remove_stream_handlers(self): except KeyError: pass - def add_stream_handlers(self): + def add_stream_handlers(self, formatter=BaseNeMoFormatter): if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -127,8 +137,6 @@ def add_stream_handlers(self): self._handlers["stream_stderr"] = _logging.StreamHandler(sys.stderr) self._handlers["stream_stderr"].addFilter(lambda record: record.levelno > _logging.INFO) - formatter = BaseNeMoFormatter - self._handlers["stream_stdout"].setFormatter(formatter()) self._logger.addHandler(self._handlers["stream_stdout"]) @@ -138,9 +146,9 @@ def add_stream_handlers(self): except KeyError: pass - def reset_stream_handler(self): + def reset_stream_handler(self, formatter=BaseNeMoFormatter): self.remove_stream_handlers() - self.add_stream_handlers() + self.add_stream_handlers(formatter=formatter) def add_file_handler(self, log_file): if self._logger is None: From f1c8aa827b5e06925faf28ecc61c2070e3b52909 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 10:37:01 -0700 Subject: [PATCH 19/40] more logging 
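The NEMO_TESTING switch introduced in the preceding patch turns on the per-rank DebugNeMoFormatter instead of silencing the stream handlers on non-zero ranks. A hedged sketch of flipping it on (the variable has to be set before the NeMo logger singleton is first created, so in practice before importing nemo):

    import os
    os.environ["NEMO_TESTING"] = "1"   # per constants.py: NEMO_ENV_VARNAME_TESTING

    import nemo
    # With RANK exported by the launcher, every rank now logs with a rank field in the prefix.
    nemo.logging.info("visible on all ranks")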
Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 1d122b09b878..a98a7cbe5114 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1491,8 +1491,11 @@ def save_state_to(self, path): tensors = [] if isinstance(data, torch.Tensor): data = (data,) + dl_device = f"cuda:{self._local_rank}" logging.info(dl_device) for d in data: + logging.info(d) + d.to(dl_device) if isinstance(d, torch.Tensor): tensors.append(d.to(dl_device)) else: From 7608d45e151c461734438616785b2b623133566b Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 11:38:42 -0700 Subject: [PATCH 20/40] remove debugging statements Signed-off-by: Jason --- examples/asr/jasper_an4.py | 3 ++- nemo/backends/pytorch/actions.py | 20 ++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 1100234bf816..7baa44e34f74 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -237,9 +237,10 @@ def main(): nemo.utils.app_state.AppState().modules.clear() # Delete old graph and make a new one del g0 + nf.reset_trainer() + # [print(p) for p in nemo.utils.app_state.AppState().modules] loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf) - nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index a98a7cbe5114..a1742ab1e81c 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1363,7 +1363,10 @@ def save_state_to(self, path): # pmodule = self.module_reference_table[key][1] # num_trainable_weights = self.module_reference_table[key][1].num_weights self.ddp_initialized = True - for module in AppState().modules: + module_list = [mod.name for mod in AppState().modules] + module_list = sorted(module_list) + for module_name in module_list: + module = AppState().modules[module_name] key = module.unique_instance_id num_trainable_weights = module.num_weights self.ddp_module_dict[key] = module @@ -1435,7 +1438,6 @@ def save_state_to(self, path): else: train_dataloader = dataNM.data_iterator train_sampler = None - logging.info("very start") _init_callbacks(callbacks, self) # Do action start callbacks @@ -1451,7 +1453,6 @@ def save_state_to(self, path): # Register epochs start with callbacks _perform_on_epoch_start(callbacks, get_state(self)) - logging.info("I'm here") # iteration over batches in epoch batch_counter = 0 @@ -1459,14 +1460,12 @@ def save_state_to(self, path): if max_steps is not None and self.step >= max_steps: break - logging.info("I'm there") if batch_counter == 0: # Started step, zero gradients curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks _perform_on_step_start(callbacks, get_state(self)) - logging.info("I'm everywhere") # set learning rate policy if lr_policy is not None: @@ -1479,7 +1478,6 @@ def save_state_to(self, path): if callbacks is not None: for callback in callbacks: callback.learning_rate = curr_optimizer.param_groups[0]['lr'] - logging.info("I'm everywhere2") # registered_tensors will contain created tensors # named by output port and uuid of module which created them @@ -1491,32 +1489,24 @@ def save_state_to(self, path): tensors = [] if isinstance(data, torch.Tensor): data 
= (data,) - dl_device = f"cuda:{self._local_rank}" - logging.info(dl_device) for d in data: - logging.info(d) - d.to(dl_device) if isinstance(d, torch.Tensor): tensors.append(d.to(dl_device)) else: tensors.append(d) - logging.info("I'm everywhere3") for t, d in zip(curr_call_chain[0][2].values(), tensors): if t is not None: self._training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) - logging.info("before forward") self.__nm_graph_forward_pass( call_chain=curr_call_chain, registered_tensors=self._training_state.tensor_dict, ) - logging.info("after forward") curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: final_loss += self._training_state.tensor_dict[tensor.unique_name] - logging.info("Or there") # Check for NaN/inf loss (across workers if applicable) loss_nan_inf_checker = final_loss.clone() @@ -1532,7 +1522,6 @@ def save_state_to(self, path): logging.warning('Loss is NaN or inf. Skipping update.') continue - logging.info("Am I Here?") if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: if disable_allreduce: @@ -1563,7 +1552,6 @@ def save_state_to(self, path): batch_counter += 1 - raise ValueError if batch_counter == batches_per_step: # Ended step. Do optimizer update if grad_norm_clip is not None: From 5fe64fba8cf31f99924e529c917645f8dc63128e Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 11:53:01 -0700 Subject: [PATCH 21/40] update new warning format with rank Signed-off-by: Jason --- tests/unit/utils/test_deprecated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/utils/test_deprecated.py b/tests/unit/utils/test_deprecated.py index 2ae3e5cb156f..06703a7aaa52 100644 --- a/tests/unit/utils/test_deprecated.py +++ b/tests/unit/utils/test_deprecated.py @@ -30,7 +30,7 @@ class DeprecatedTest(TestCase): NEMO_ERR_MSG_FORMAT = re.compile( - r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]*\] " + r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]+( [0-9]+)?\] " ) @pytest.mark.unit From 01dd179cb9c48769ddc301f794a2ffbf6b268a05 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 13:28:18 -0700 Subject: [PATCH 22/40] add explicit rank marker Signed-off-by: Jason --- nemo/utils/formatters/base.py | 2 +- tests/unit/utils/test_deprecated.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/utils/formatters/base.py b/nemo/utils/formatters/base.py index 61481154dfef..12500477b9c8 100644 --- a/nemo/utils/formatters/base.py +++ b/nemo/utils/formatters/base.py @@ -130,5 +130,5 @@ class BaseNeMoFormatter(BaseFormatter): class DebugNeMoFormatter(BaseFormatter): DEFAULT_FORMAT = ( - "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d %(rank)d]%(end_color)s %(message)s" + "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d rank:%(rank)d]%(end_color)s %(message)s" ) diff --git a/tests/unit/utils/test_deprecated.py b/tests/unit/utils/test_deprecated.py index 06703a7aaa52..4f1c9490e60f 100644 --- a/tests/unit/utils/test_deprecated.py +++ b/tests/unit/utils/test_deprecated.py @@ -30,7 +30,7 @@ class DeprecatedTest(TestCase): NEMO_ERR_MSG_FORMAT = re.compile( - r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]+( [0-9]+)?\] " + r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} 
[0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]+( rank:[0-9]+)?\] " ) @pytest.mark.unit From c6ece47ca632badfc26f9b8e3f394c7f5a1afe86 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 15:09:09 -0700 Subject: [PATCH 23/40] docstrings and more Signed-off-by: Jason --- examples/asr/jasper_an4.py | 2 +- examples/asr/jasper_an4_debug.py | 307 -------------------- nemo/backends/pytorch/actions.py | 77 ++--- nemo/core/neural_factory.py | 88 +++++- nemo/core/neural_types/neural_type.py | 8 +- nemo/core/neural_types/nmtensor_registry.py | 34 +-- nemo/utils/app_state.py | 4 +- 7 files changed, 135 insertions(+), 385 deletions(-) delete mode 100755 examples/asr/jasper_an4_debug.py diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 7baa44e34f74..40172008c9da 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -79,7 +79,7 @@ def create_dags(model_config_file, vocab, args, nf): loss_e = ctc_loss( log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, ) - logging.error("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) + logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) # Callbacks to print info to console and Tensorboard. train_callback = nemo.core.SimpleLossLoggerCallback( diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py deleted file mode 100755 index 761c674ad3be..000000000000 --- a/examples/asr/jasper_an4_debug.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -import argparse -import math -import os -from functools import partial - -from ruamel.yaml import YAML - -import nemo -import nemo.collections.asr as nemo_asr -import nemo.utils.argparse as nm_argparse -from nemo.collections.asr.helpers import ( - monitor_asr_train_progress, - post_process_predictions, - post_process_transcripts, - process_evaluation_batch, - process_evaluation_epoch, - word_error_rate, -) -from nemo.utils.lr_policies import CosineAnnealing - -logging = nemo.logging - - -def create_dags(model_config_file, vocab, args, nf): - - # Create a data_layer for training. - data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( - model_config_file, - "AudioToTextDataLayer_train", - overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, - ) - - num_samples = len(data_layer) - steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) - total_steps = steps_per_epoch * args.num_epochs - logging.info("Train samples=", num_samples, "num_steps=", total_steps) - - # # Create a data_layer for evaluation. - # data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( - # model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, - # ) - - # num_samples = len(data_layer_eval) - # logging.info(f"Eval samples={num_samples}") - - # Instantiate data processor. - data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( - model_config_file, "AudioToMelSpectrogramPreprocessor" - ) - - # Instantiate JASPER encoder-decoder modules. - jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") - jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( - model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} - ) - - # Instantiate losses. 
- ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) - greedy_decoder = nemo_asr.GreedyCTCDecoder() - - # Create a training graph. - audio, audio_len, transcript, transcript_len = data_layer() - processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) - encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) - log_probs = jasper_decoder(encoder_output=encoded) - predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len) - - # # Create an evaluation graph. - # audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() - # processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) - # encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) - # log_probs_e = jasper_decoder(encoder_output=encoded_e) - # predictions_e = greedy_decoder(log_probs=log_probs_e) - # loss_e = ctc_loss( - # log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, - # ) - logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) - - # Callbacks to print info to console and Tensorboard. - # train_callback = nemo.core.SimpleLossLoggerCallback( - # tensors=[loss, predictions, transcript, transcript_len], - # print_func=partial(monitor_asr_train_progress, labels=vocab), - # get_tb_values=lambda x: [["loss", x[0]]], - # tb_writer=nf.tb_writer, - # ) - - # loss.rename("test") - # train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["test"]) - - # train_callback = nemo.core.SimpleLossLogger() - predictions.rename("test") - train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["loss", "test"]) - - checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) - - # eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e] - # eval_callback = nemo.core.EvaluatorCallback( - # eval_tensors=eval_tensors, - # user_iter_callback=partial(process_evaluation_batch, labels=vocab), - # user_epochs_done_callback=process_evaluation_epoch, - # eval_step=args.eval_freq, - # tb_writer=nf.tb_writer, - # eval_at_start=not args.do_not_eval_at_start, - # ) - # callbacks = [train_callback, checkpointer_callback, eval_callback] - callbacks = [train_callback, checkpointer_callback] - - @nemo.core.callbacks.on_step_start - def my_own_func(state): - if state["step"] % 100 == 0: - print(state) - - callbacks.append(my_own_func) - - # Return entities required by the actual training. 
- return ( - loss, - # eval_tensors, - callbacks, - total_steps, - # log_probs_e, - # encoded_len_e, - ) - - -def main(): - parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], description='AN4 ASR', conflict_handler='resolve', - ) - - # Overwrite default args - parser.add_argument("--train_dataset", type=str, help="training dataset path") - parser.add_argument("--eval_datasets", type=str, help="validation dataset path") - - # Create new args - # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str) - parser.add_argument("--batch_size", default=48, type=int, help="size of the training batch") - parser.add_argument("--lm", default=None, type=str) - parser.add_argument("--test_after_training", action='store_true') - parser.add_argument("--momentum", type=float) - parser.add_argument("--beta1", default=0.95, type=float) - parser.add_argument("--beta2", default=0.25, type=float) - parser.add_argument("--do_not_eval_at_start", action='store_true') - parser.set_defaults( - model_config="./configs/jasper_an4.yaml", - train_dataset="~/TestData/an4_dataset/an4_train.json", - eval_datasets="~/TestData/an4_dataset/an4_val.json", - work_dir="./tmp", - optimizer="novograd", - num_epochs=50, - lr=0.02, - weight_decay=0.005, - checkpoint_save_freq=1000, - eval_freq=100, - amp_opt_level="O1", - ) - - args = parser.parse_args() - betas = (args.beta1, args.beta2) - - wer_thr = 0.20 - beam_wer_thr = 0.15 - - nf = nemo.core.NeuralModuleFactory( - local_rank=args.local_rank, - files_to_copy=[__file__], - optimization_level=args.amp_opt_level, - random_seed=0, - log_dir=args.work_dir, - create_tb_writer=True, - cudnn_benchmark=args.cudnn_benchmark, - ) - tb_writer = nf.tb_writer - checkpoint_dir = nf.checkpoint_dir - - # Load model definition - yaml = YAML(typ="safe") - with open(args.model_config) as f: - jasper_params = yaml.load(f) - # Get vocabulary. 
- vocab = jasper_params['labels'] - - # (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags( - # args.model_config, vocab, args, nf - # ) - - loss, callbacks, total_steps = create_dags(args.model_config, vocab, args, nf) - - nf.train( - tensors_to_optimize=[loss], - callbacks=callbacks, - optimizer=args.optimizer, - lr_policy=CosineAnnealing(total_steps=total_steps, min_lr=args.lr / 100), - optimization_params={ - "num_epochs": args.num_epochs, - "max_steps": args.max_steps, - "lr": args.lr, - "momentum": args.momentum, - "betas": betas, - "weight_decay": args.weight_decay, - "grad_norm_clip": None, - }, - batches_per_step=args.iter_per_step, - amp_max_loss_scale=256.0, - # synced_batchnorm=(nf.global_rank is not None), - ) - - # if args.test_after_training: - # logging.info("Testing greedy and beam search with LM WER.") - # # Create BeamSearch NM - # if nf.world_size > 1 or args.lm is None: - # logging.warning("Skipping beam search WER as it does not work if doing distributed training.") - # else: - # beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( - # vocab=vocab, beam_width=64, alpha=2.0, beta=1.5, lm_path=args.lm, num_cpus=max(os.cpu_count(), 1), - # ) - # beam_predictions = beam_search_with_lm(log_probs=log_probs_e, log_probs_length=encoded_len_e) - # eval_tensors.append(beam_predictions) - - # evaluated_tensors = nf.infer(eval_tensors) - # if nf.global_rank in [0, None]: - # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) - # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) - # wer = word_error_rate(hypotheses=greedy_hypotheses, references=references) - # logging.info("Greedy WER: {:.2f}%".format(wer * 100)) - # if wer > wer_thr: - # nf.sync_all_processes(False) - # raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :" f"than {wer_thr * 100:.2f}%") - # nf.sync_all_processes() - - # if nf.world_size == 1 and args.lm is not None: - # beam_hypotheses = [] - # # Over mini-batch - # for i in evaluated_tensors[-1]: - # # Over samples - # for j in i: - # beam_hypotheses.append(j[0][1]) - - # beam_wer = word_error_rate(hypotheses=beam_hypotheses, references=references) - # logging.info("Beam WER {:.2f}%".format(beam_wer * 100)) - # assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > than {:.2f}%".format( - # beam_wer * 100, beam_wer_thr * 100 - # ) - # assert beam_wer <= wer, "Final eval beam WER > than the greedy WER." 
- - # # Reload model weights and train for extra 10 epochs - # checkpointer_callback = nemo.core.CheckpointCallback( - # folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, - # ) - - # # Distributed Data Parallel changes the underlying class so we need - # # to reinstantiate Encoder and Decoder - # args.num_epochs += 10 - # previous_step_count = total_steps - # loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf) - - # nf.reset_trainer() - # nf.train( - # tensors_to_optimize=[loss], - # callbacks=callbacks, - # optimizer=args.optimizer, - # lr_policy=CosineAnnealing(warmup_steps=previous_step_count, total_steps=total_steps), - # optimization_params={ - # "num_epochs": args.num_epochs, - # "lr": args.lr / 100, - # "momentum": args.momentum, - # "betas": betas, - # "weight_decay": args.weight_decay, - # "grad_norm_clip": None, - # }, - # reset=True, - # amp_max_loss_scale=256.0, - # # synced_batchnorm=(nf.global_rank is not None), - # ) - - # evaluated_tensors = nf.infer(eval_tensors) - # if nf.global_rank in [0, None]: - # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) - # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) - # wer_new = word_error_rate(hypotheses=greedy_hypotheses, references=references) - # logging.info("New greedy WER: {:.2f}%".format(wer_new * 100)) - # if wer_new > wer * 1.1: - # nf.sync_all_processes(False) - # raise ValueError( - # f"Fine tuning: new WER {wer_new * 100:.2f}% > than the " f"previous WER {wer * 100:.2f}%" - # ) - # nf.sync_all_processes() - - # # Open the log file and ensure that epochs is strictly increasing - # if nf._exp_manager.log_file: - # epochs = [] - # with open(nf._exp_manager.log_file, "r") as log_file: - # line = log_file.readline() - # while line: - # index = line.find("Starting epoch") - # if index != -1: - # epochs.append(int(line[index + len("Starting epoch") :])) - # line = log_file.readline() - # for i, e in enumerate(epochs): - # if i != e: - # raise ValueError("Epochs from logfile was not understood") - - -if __name__ == "__main__": - main() diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index a1742ab1e81c..efd3c9bfcf83 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -20,9 +20,9 @@ from nemo.backends.pytorch.nm import DataLayerNM, TrainableNM from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor -from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves -from nemo.core.neural_types import * +from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback +from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves, TrainingState +from nemo.core.neural_types import NeuralType, AxisKind from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated from nemo.utils.helpers import get_checkpoint_from_dir @@ -50,48 +50,6 @@ } -class TrainingState: - def __init__(self, action): - tensor_naming_registery = AppState().tensor_names - self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) - self._action = action - - def tensor_list(self): - return self.tensor_dict.keys() - - def 
clear_dict(self): - for name in self.tensor_dict: - self.tensor_dict[name] = None - - def set_tensor(self, tensor, value): - self.tensor_dict[tensor.unique_name] = value - - def check_tensor_cached(self, unique_name): - if self.tensor_dict[unique_name] is None: - return False - return True - - def get_tensor(self, name): - if isinstance(name, NmTensor): - unique_name = name.unique_name - else: - unique_name = AppState().tensor_names[name] - tensor_value = self.tensor_dict[unique_name] - if tensor_value is None: - nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] - callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) - # print(callchain) - callchain.insert(0, ()) - self._action.nm_graph_forward_pass(callchain, self.tensor_dict) - # print(self.tensor_dict[unique_name]) - tensor_value = self.tensor_dict[unique_name] - return tensor_value - - # unique_name = AppState().tensor_names[name] - # return self.tensor_dict[unique_name] - # def get_and_compute_tensor(self, name): - - class PtActions(Actions): def __init__( self, local_rank=None, global_rank=None, tb_writer=None, optimization_level=Optimization.mxprO0, @@ -161,8 +119,16 @@ def epoch(self, epoch): def epoch_num(self): return self._epoch - def __get_top_sorted_modules_and_dataloader(self, hook): - """ TODO + def __get_top_sorted_modules_and_dataloader(self, hook: List[NmTensor]): + """A function that accepts a list of NmTensors that need to be computed and constructs a call DAG that starts + from a datalayerNM and can be used to compute the NmTensors. + + args: + leaf_nmtensors (List[NmTensors]): The tensors to be computed + + returns: + top_sorted_modules: the callchain DAG + tdataset: the datalayer at the top of the callchain """ top_sorted_modules = topological_sort_from_leaves(hook) @@ -1119,7 +1085,7 @@ def _perform_on_batch_start(callbacks, state): if isinstance(callback, ActionCallback): continue elif isinstance(callback, NeMoCallback): - callback.on_epoch_start(state) + callback.on_batch_start(state) else: raise ValueError( "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" @@ -1131,7 +1097,7 @@ def _perform_on_batch_end(callbacks, state): if isinstance(callback, ActionCallback): continue elif isinstance(callback, NeMoCallback): - callback.on_epoch_end(state) + callback.on_batch_end(state) else: raise ValueError( "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" @@ -1152,9 +1118,14 @@ def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): else: # For now, we can use the old callback function. In the future we should improve this registered_tensors["loss"] = final_loss - def get_state(action): + def get_state(action: 'PtAction'): + """Helper function used to create a state for callbacks + """ class StateWrapper(dict): def __init__(self, action): + """A class that wraps a dictionary but adds the functions: restore_state_from and save_state_to + which are helper functions for CheckpointCallback to use. 
+ """ self.action = action super().__init__( { @@ -1467,6 +1438,9 @@ def save_state_to(self, path): # Register iteration start with callbacks _perform_on_step_start(callbacks, get_state(self)) + # Perform batch start callbacks + _perform_on_batch_start(callbacks, get_state(self)) + # set learning rate policy if lr_policy is not None: adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch) @@ -1550,6 +1524,9 @@ def save_state_to(self, path): else: final_loss.backward(bps_scale.to(final_loss.get_device())) + # Perform batch end callbacks + _perform_on_batch_end(callbacks, get_state(self)) + batch_counter += 1 if batch_counter == batches_per_step: diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 583563819041..29ca0713a8a9 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -25,11 +25,10 @@ 'DeploymentFormat', ] -import copy import random from abc import ABC, abstractmethod from enum import Enum -from typing import List, Optional +from typing import List, Optional, Union import numpy as np @@ -37,13 +36,90 @@ from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils import ExpManager, logging -from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated +from nemo.utils.app_state import AppState + +class TrainingState: + def __init__(self, action: 'Actions'): + """A class used to wrap the current training state of an Actions.train() function. This class holds a mapping + of tensor.unique_name -> it's backend tensor (eg Pytorch Tensor) or None if the tensor has been been computed + on the current step. + + args: + action (Actions): The Actions object this state is associated with. + """ + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action + def tensor_list(self): + """Returns a list the unique names of all tensors. + """ + return self.tensor_dict.keys() + + def clear_dict(self): + """Clears the dictionary by setting all values to None. Used in-between training batches to clear it's state. + """ + for name in self.tensor_dict: + self.tensor_dict[name] = None -# def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): -def topological_sort_from_leaves(leaf_nmtensors, cached_training_state=None): - from nemo.backends.pytorch.nm import DataLayerNM + def set_tensor(self, tensor: NmTensor, value: 'torch.Tensor'): + """Sets the value of tensor + + args: + tensor (NmTensor) + value (torch.Tensor) + """ + self.tensor_dict[tensor.unique_name] = value + + def check_tensor_cached(self, unique_name: str): + """Checks to see the tensor value has been computed in the current step yet. + + args: + unique_name (str): The NmTensor.unique_name that we want to check for. + """ + if self.tensor_dict[unique_name] is None: + return False + return True + + def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): + """Returns the value associated with a tensor. And optionally, computes the value of the tensor if not already + set. + + args: + name (str, NmTensor): The user-defined name for a tensor or the NmTensor itself. + compute (bool): If True and the tensor has not already been computed, there will be an attempt to create a + call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return + None if the tensor has not been computed yet. 
+ Defaults to True. + """ + if isinstance(name, NmTensor): + unique_name = name.unique_name + else: + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None and compute: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + callchain.insert(0, ()) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + + +def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: TrainingState = None): + """A function that accepts a list of NmTensors that need to be computed and constructs a callchain DAG that starts + from a datalayerNM and can be used to compute the NmTensors. + + args: + leaf_nmtensors (List[NmTensors]): The tensors to be computed + cached_training_state (TrainingState): A dictionary of already computed tensors. + Defaults to None meaning an empty cache. + + returns: + top_sorted_modules: the callchain DAG + """ + from nemo.backends.pytorch.nm import DataLayerNM # TODO: Replace this with a backend agnostic data layer def create_node(producer, producer_args): if producer_args is None: diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d3da8a80fdf5..c52f2680b73c 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -324,8 +324,12 @@ def unique_name(self): raise ValueError("This NmTensor does not have a unique name") return f"{self._output_port_name}~~~{self._producer_name}~~~{self._uuid}" - def rename(self, new_name): - """TODO + def rename(self, new_name: str): + """Renames the tensor from its old name to a new user-defined name for easy access within callbacks. Note, + a tensor's unique_name is never changed. This simply adds a reference from new_name -> tensor.unique_name + + args: + new_name (str): the new tensor's name. """ AppState().tensor_names.rename_NmTensor(self, new_name) diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py index 5055319c2cef..f1d9591039a4 100755 --- a/nemo/core/neural_types/nmtensor_registry.py +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -18,30 +18,27 @@ class NmTensorNameRegistry: def __init__(self): """ - Constructor. Initializes the manager. Sets active graph to None. + Constructor. Initializes the NmTensorNameRegistry. Reserves the default 'loss' name. - TODO: Should probably be a property of a graph + TODO: We should be recording the tensors of each graph rather than all the tensors. """ # Create the nmtensor_naming_dict # which contains a mapping of str to NMTensor.unique_name self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' - # self._nmtensor_uniname_set = set(["loss"]) self._nmtensor_uniname_dict = {"loss": None} - # def summary(self): - # """ Prints a nice summary. """ - # desc = "" - # for graph in self: - # desc = desc + "`{}`: {}\n".format(graph.name, graph) - # return desc - @property def unique_names(self): + """Returns the set of all NmTensors.unique_names + 'loss' + """ return self._nmtensor_uniname_dict.keys() - # def register(self, tensor: NmTensor): - def register(self, tensor): - """TODO + def register(self, tensor: 'NmTensor'): + """Helper function to register a newly created NmTensor by adding it to self.__nmtensor_uniname_dict. 
+ Should be called from NmTensor.__init__() + + args: + tensor (NmTensor): The tensor to be registered. """ # Check if object is already in a set. @@ -51,9 +48,12 @@ def register(self, tensor): # Finally, add object to the set. self._nmtensor_uniname_dict[tensor.unique_name] = tensor - # def rename_NmTensor(self, tensor: NmTensor, new_name: str): - def rename_NmTensor(self, tensor, new_name: str): - """ TODO + def rename_NmTensor(self, tensor: 'NmTensor', new_name: str): + """Helper function that changes the naming dictionary to facilitate user name -> tensor.unique_name lookup. + + args: + tensor (NmTensor): The tensor to be renamed. + new_name (str): its new name. """ # Find old name if exists old_name = tensor.unique_name @@ -68,7 +68,7 @@ def rename_NmTensor(self, tensor, new_name: str): raise KeyError(f"{new_name} already exists in current graph. Please use a unique name") self._nmtensor_naming_dict[new_name] = tensor.unique_name - def __getitem__(self, key): + def __getitem__(self, key: str): """ Object getter function. diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 6183526b87fe..8bbf120c0f60 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -52,10 +52,10 @@ def __init__(self, device=None): @property def tensor_names(self): - """ Property returning the existing modules. + """ Property returning the NmTensorNameRegistry which maps user-defined names to tensor's unique_names. Returns: - Existing modules (a set object). + NmTensorNameRegistry. """ return self._nmtensor_name_registry From 3c3bee9e74f584b3f865665ea5d4f94ecaeebd66 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 15:09:29 -0700 Subject: [PATCH 24/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 5 +++-- nemo/core/neural_factory.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index efd3c9bfcf83..94f82c5399c5 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,8 +21,8 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves, TrainingState -from nemo.core.neural_types import NeuralType, AxisKind +from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves +from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated from nemo.utils.helpers import get_checkpoint_from_dir @@ -1121,6 +1121,7 @@ def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): def get_state(action: 'PtAction'): """Helper function used to create a state for callbacks """ + class StateWrapper(dict): def __init__(self, action): """A class that wraps a dictionary but adds the functions: restore_state_from and save_state_to diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 29ca0713a8a9..2dc63ffca36b 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -36,8 +36,9 @@ from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils import ExpManager, logging -from nemo.utils.decorators 
import deprecated from nemo.utils.app_state import AppState +from nemo.utils.decorators import deprecated + class TrainingState: def __init__(self, action: 'Actions'): From dba45362f785d6d74fe1cac3b00aadc7e6b528fe Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 16:34:38 -0700 Subject: [PATCH 25/40] callback docstrings Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 7 + nemo/core/callbacks.py | 581 ++++++++++++++++++------------- nemo/core/neural_factory.py | 11 +- 3 files changed, 353 insertions(+), 246 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 94f82c5399c5..86d8ce6aaba7 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1126,6 +1126,13 @@ class StateWrapper(dict): def __init__(self, action): """A class that wraps a dictionary but adds the functions: restore_state_from and save_state_to which are helper functions for CheckpointCallback to use. + The StateWrapper is a dictionary that contains the following mapping: + "step" (int): the current training step + "epoch" (int): the current epoch step + "local_rank" (int): the local rank that the process is running on + "global_rank" (int): the global rank that the process is running on + "optimizers" (list): a list of optimizers defined during the training process + "tensors" (TrainingState): A TrainingState object that can be used to access tensor values """ self.action = action super().__init__( diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index a9a4970d0254..2adac28d5530 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -23,6 +23,7 @@ import warnings from abc import ABC, abstractmethod from collections import namedtuple +from typing import Callable, List, Union import nemo from nemo.utils import get_checkpoint_from_dir, logging @@ -37,6 +38,11 @@ class NeMoCallback(ABC): + """The base class for callbacks inside of NeMo. It contains no __init__ which children classes are responsible for. + Each callback contains 8 functions which are called at different stages of train(). All functions must take as the + first argument: the current action state. This state is a StateWrapper object. + TODO: Add a link to documentation. 
+ """ def on_train_start(self, state): pass @@ -62,113 +68,10 @@ def on_train_end(self, state): pass -class TensorboardLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"], tb_writer=None, custom_tb_log_func=None): - # Step_freq: how often logs are printed - self.step_freq = step_freq - self.tensors_to_log = tensors_to_log - if tb_writer is None: - logging.error("There was no tb writer") - # Should grab this from default tb writer - else: - self.tb_writer = tb_writer - self.custom_tb_log_func = custom_tb_log_func - self._last_epoch_start = None - - # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): - # #tensors_to_log: List of keys into state that will be logged - - def on_epoch_start(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - self._last_epoch_start = time.time() - - def on_epoch_end(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - # always log epoch num and epoch_time - epoch_time = time.time() - self._last_epoch_start - self.tb_writer.add_scalar('misc/epoch', state["epoch"], state["step"]) - self.tb_writer.add_scalar('misc/epoch_time', epoch_time, state["step"]) - - def on_step_end(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - if state["step"] % self.step_freq == 0: - tb_log_func = lambda x: self.tb_writer.add_scalar(x, state["tensors"].get_tensor(x), state["step"]) - if self.custom_tb_log_func is not None: - tb_log_func = self.custom_tb_log_func - for tensor_key in self.tensors_to_log: - tb_log_func(tensor_key) - - -class WandBLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"], wandb_name=None, wandb_project=None, args=None): - if not _WANDB_AVAILABLE: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - self._step_freq = step_freq - self._tensors_to_log = tensors_to_log - self._name = wandb_name - self._project = wandb_project - self._args = args - self._last_epoch_start = None - - def on_train_start(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - if _WANDB_AVAILABLE and wandb.run is None: - wandb.init(name=self._name, project=self._project) - if self._args is not None: - wandb.config.update(self._args) - elif _WANDB_AVAILABLE and wandb.run is not None: - logging.info("Re-using wandb session") - else: - logging.error("Could not import wandb. 
Did you install it (pip install --upgrade wandb)?") - logging.info("Will not log data to weights and biases.") - self._step_freq = -1 - - def on_step_end(self, state): - # log training metrics - if state["global_rank"] is None or state["global_rank"] == 0: - if state["step"] % self._step_freq == 0 and self._step_freq > 0: - tensors_logged = {t: state["tensors"].get_tensor(t).cpu() for t in self._tensors_to_log} - # Always log learning rate - tensors_logged['LR'] = state["learning_rate"] - self._wandb_log(tensors_logged) - - def on_epoch_start(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - self._last_epoch_start = time.time() - - def on_epoch_end(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - # always log epoch num and epoch_time - epoch_time = time.time() - self._last_epoch_start - self._wandb_log({"epoch": state["epoch"], "epoch_time": epoch_time}) - - def _wandb_log(self, tensors_logged): - if _WANDB_AVAILABLE: - wandb.log(tensors_logged, step=state["step"]) - - -class SimpleLossLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"]): - # Step_freq: how often logs are printed - self.step_freq = step_freq - self.tensors_to_log = tensors_to_log - - # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): - # #tensors_to_log: List of keys into state that will be logged - - def on_step_end(self, state): - if state["step"] % self.step_freq == 0: - for tensor_key in self.tensors_to_log: - tensor = state["tensors"].get_tensor(tensor_key) - if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key) - logging.info("%s: %s", tensor_key, tensor) - # except KeyError: - # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " - # f"Current state tensors include {state['tensors'].tensor_list()}") - - def on_train_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_train_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -180,6 +83,9 @@ def on_train_start(self, state): def on_epoch_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_epoch_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -191,6 +97,9 @@ def on_epoch_start(self, state): def on_batch_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_batch_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -202,6 +111,9 @@ def on_batch_start(self, state): def on_step_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_step_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -213,6 +125,9 @@ def on_step_start(self, state): def on_step_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_step_end callback event. 
+ """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -224,6 +139,9 @@ def on_step_end(self, state): def on_batch_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_batch_end callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -235,6 +153,9 @@ def on_batch_end(self, state): def on_epoch_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_epoch_end callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -246,6 +167,9 @@ def on_epoch_end(self, state): def on_train_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_train_end callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -256,6 +180,313 @@ def on_train_end(self, state): return NeMoCallbackWrapper(func) + +class SimpleLogger(NeMoCallback): + def __init__(self, step_freq:int = 100, tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"]): + """A simple callback that prints tensors to screen. It's default option is to print the training loss every + 100 steps. Additional tensors can be printed by adding them to the tensors_to_log argument. + + args: + step_freq (int): The frequency of printing to screen. Defaults to every 100 steps + tensors_to_log (List of str or NmTensor): A list of either tensor names or NmTensors which will be printed + every step_freq steps. + Defaults to ["loss"] which only prints the loss. + """ + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + def on_step_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + + +class TensorboardLogger(NeMoCallback): + def __init__( + self, + tb_writer: 'torch.utils.tensorboard.SummaryWriter', + step_freq:int=100, + tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], + custom_tb_log_func:Callable[[Union[str, 'NmTensor']],None]=None, + log_epoch:bool=True + ): + """A tensorboard callback that logs tensors using a tensorboard writer object. It's default option is to log + the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log + argument. In order to log complex tensorboard entities, the custom_tb_log_func must be passed it. By default, + it always logs the current epoch and the time taken per epoch. + + args: + tb_writer (required): The tensorboard logger object. + step_freq (int): The frequency of tensorboard logging. Defaults to every 100 steps + tensors_to_log (List of str or NmTensor): A list of either tensor names or NmTensors which will be logged + every step_freq steps. + Defaults to ["loss"] which only prints the loss. + custom_tb_log_func (func): TensorboardLogger loops through tensors_to_log and passes these elements to + custom_tb_log_func. So a custom_tb_log_func will receive one argument on each call with the arugment + being an element from tensors_to_log. + Defaults to None which logs each tensors_to_log as a scalar. + log_epoch (bool): Whether to log epoch and epoch training time to tensorboard. + Defaults to True. 
+ """ + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + self.tb_writer = tb_writer + self.custom_tb_log_func = custom_tb_log_func + self._last_epoch_start = None + self._log_epoch = log_epoch + + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if self._log_epoch: + epoch_time = time.time() - self._last_epoch_start + self.tb_writer.add_scalar('misc/epoch', state["epoch"], state["step"]) + self.tb_writer.add_scalar('misc/epoch_time', epoch_time, state["step"]) + + def on_step_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self.step_freq == 0: + tb_log_func = lambda x: self.tb_writer.add_scalar(x, state["tensors"].get_tensor(x), state["step"]) + if self.custom_tb_log_func is not None: + tb_log_func = self.custom_tb_log_func + for tensor_key in self.tensors_to_log: + tb_log_func(tensor_key) + + +class WandBLogger(NeMoCallback): + def __init__( + self, + step_freq:int=100, + tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], + wandb_name:str=None, + wandb_project:str=None, + args=None, + log_epoch:bool=True + ): + """A [Weights & Biases](https://docs.wandb.com/) callback that logs tensors to W&B. It's default option is to + log the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log + argument. By default, it always logs the current epoch and the time taken per epoch. + + args: + step_freq (int): The frequency of tensorboard logging. Defaults to every 100 steps + tensors_to_log (List of str or NmTensor): A list of either tensor names or NmTensors which will be logged + every step_freq steps. + Defaults to ["loss"] which only prints the loss. + wandb_name(str): wandb experiment name. + Defaults to None + wandb_project(str): wandb project name. + Defaults to None + args: argparse flags which will be logged as hyperparameters. + Defaults to None. + log_epoch (bool): Whether to log epoch and epoch training time to tensorboard. + Defaults to True. + """ + if not _WANDB_AVAILABLE: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + self._step_freq = step_freq + self._tensors_to_log = tensors_to_log + self._name = wandb_name + self._project = wandb_project + self._args = args + self._last_epoch_start = None + self._log_epoch = log_epoch + + def on_train_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._name, project=self._project) + if self._args is not None: + wandb.config.update(self._args) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. 
Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._step_freq = -1 + + def on_step_end(self, state): + # log training metrics + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self._step_freq == 0 and self._step_freq > 0: + tensors_logged = {t: state["tensors"].get_tensor(t).cpu() for t in self._tensors_to_log} + # Always log learning rate + tensors_logged['LR'] = state["learning_rate"] + self._wandb_log(tensors_logged) + + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if self._log_epoch: + epoch_time = time.time() - self._last_epoch_start + self._wandb_log({"epoch": state["epoch"], "epoch_time": epoch_time}) + + def _wandb_log(self, tensors_logged): + if _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=state["step"]) + + +class CheckpointCallback(NeMoCallback): + def __init__( + self, + folder:str, + load_from_folder:str=None, + step_freq:int=-1, + epoch_freq:int=-1, + checkpoints_to_keep:int=4, + force_load:bool=False, + ): + """A callback that does checkpointing of module weights and trainer (incl. optimizer) status. + + args: + folder (str, required): A path where checkpoints are to be stored and loaded from if load_from_folder is + None. + load_from_folder (str): A path where checkpoints can be loaded from. + Defaults to None. + step_freq (int): How often in terms of steps to save checkpoints. One of step_freq or epoch_freq is + required. + epoch_freq (int): How often in terms of epochs to save checkpoints. One of step_freq or epoch_freq is + required. + checkpoints_to_keep (int): Number of most recent checkpoints to keep. Older checkpoints will be deleted. + Defaults to 4. + force_load (bool): Whether to crash if loading is unsuccessful. + Defaults to False + """ + if step_freq == -1 and epoch_freq == -1: + logging.warning("No checkpoints will be saved because step_freq and epoch_freq are both -1.") + + if step_freq > -1 and epoch_freq > -1: + logging.warning("You config the model to save by both steps and epochs. Please use one or the other") + epoch_freq = -1 + + self._step_freq = step_freq + self._epoch_freq = epoch_freq + self._folder = folder + self._load_from_folder = load_from_folder if load_from_folder else folder + self._ckpt2keep = checkpoints_to_keep + self._saved_ckpts = [] + # If True, run will fail if we cannot load module weights + self._force_load = force_load + + def __save_to(self, path, state): + if state["global_rank"] is not None and state["global_rank"] != 0: + return + if not os.path.isdir(path): + logging.info(f"Creating {path} folder") + os.makedirs(path, exist_ok=True) + unique_mod_names = set() + for module in AppState().modules: + if module.num_weights > 0: + if str(module) in unique_mod_names: + raise NotImplementedError( + "There were two instances of the same module. Please overwrite __str__() of one of the " + "modules." 
+ ) + unique_mod_names.add(str(module)) + if self._step_freq > -1: + filename = f"{module}-STEP-{state['step']}.pt" + else: + filename = f"{module}-EPOCH-{state['epoch']}.pt" + module.save_to(os.path.join(path, filename)) + + if self._step_freq > -1: + filename = f"trainer-STEP-{state['step']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['step']}.pt") + else: + filename = f"trainer-EPOCH-{state['epoch']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['epoch']}.pt") + + if len(self._saved_ckpts) > self._ckpt2keep: + for end in self._saved_ckpts[: -self._ckpt2keep]: + for file in glob.glob(f'{path}/*{end}'): + os.remove(file) + self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] + logging.info(f'Saved checkpoint: {path}/{filename}') + + def __restore_from(self, path, state): + if not os.path.isdir(path): + if self._force_load: + raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.") + logging.warning(f"Checkpoint folder {path} not found!") + else: + logging.info(f"Found checkpoint folder {path}. Will attempt to restore checkpoints from it.") + modules_to_restore = [] + modules_to_restore_name = [] + for module in AppState().modules: + if module.num_weights > 0: + modules_to_restore.append(module) + modules_to_restore_name.append(str(module)) + try: + module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) + + for mod, checkpoint in zip(modules_to_restore, module_checkpoints): + mod.restore_from(checkpoint, state["local_rank"]) + except (ValueError) as e: + if self._force_load: + raise ValueError( + "force_load was set to True for checkpoint callback but a checkpoint was not found." + ) + logging.warning(e) + logging.warning( + f"Checkpoint folder {path} was present but nothing was restored. Continuing training from random " + "initialization." + ) + return + + try: + trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) + state.restore_state_from(trainer_checkpoints[0]) + # for tr, checkpoint in zip([self.action], trainer_checkpoints): + except (ValueError) as e: + logging.warning(e) + logging.warning( + "Trainer state such as optimizer state and current step/epoch was not restored. Pretrained weights" + " have still been restore and fine-tuning should continue fine." + ) + return + + def on_train_start(self, state): + num_parameters = 0 + unique_mod_names = set() + for module in AppState().modules: + if module.num_weights > 0: + if str(module) in unique_mod_names: + raise NotImplementedError( + "There were two instances of the same module. Please overwrite __str__() of one of the " + "modules." + ) + unique_mod_names.add(str(module)) + num_parameters += module.num_weights + logging.info(f"Found {len(unique_mod_names)} modules with weights:") + for name in unique_mod_names: + logging.info(f"{name}") + logging.info(f"Total model parameters: {num_parameters}") + self.__restore_from(self._load_from_folder, state) + + def on_step_end(self, state): + step = state["step"] + if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: + self.__save_to(self._folder, state) + + def on_train_end(self, state): + if self._step_freq > 0 or self._epoch_freq > 0: + self.__save_to(self._folder, state) + + def on_epoch_end(self, state): + epoch = state["epoch"] + if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: + self.__save_to(self._folder, state) + + + class ActionCallback(ABC): """Abstract interface for callbacks. 
""" @@ -458,144 +689,6 @@ def on_iteration_end(self): logging.info(f"Step time: {run_time} seconds") -class CheckpointCallback(NeMoCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - def __init__( - self, folder, load_from_folder=None, step_freq=-1, epoch_freq=-1, checkpoints_to_keep=4, force_load=False, - ): - super().__init__() - if step_freq == -1 and epoch_freq == -1: - logging.warning("No checkpoints will be saved because step_freq and epoch_freq are both -1.") - - if step_freq > -1 and epoch_freq > -1: - logging.warning("You config the model to save by both steps and epochs. Please use one or the other") - epoch_freq = -1 - - self._step_freq = step_freq - self._epoch_freq = epoch_freq - self._folder = folder - self._load_from_folder = load_from_folder if load_from_folder else folder - self._ckpt2keep = checkpoints_to_keep - self._saved_ckpts = [] - # If True, run will fail if we cannot load module weights - self._force_load = force_load - - def __save_to(self, path, state): - if state["global_rank"] is not None and state["global_rank"] != 0: - return - if not os.path.isdir(path): - logging.info(f"Creating {path} folder") - os.makedirs(path, exist_ok=True) - unique_mod_names = set() - for module in AppState().modules: - if module.num_weights > 0: - if str(module) in unique_mod_names: - raise NotImplementedError( - "There were two instances of the same module. Please overwrite __str__() of one of the " - "modules." - ) - unique_mod_names.add(str(module)) - if self._step_freq > -1: - filename = f"{module}-STEP-{state['step']}.pt" - else: - filename = f"{module}-EPOCH-{state['epoch']}.pt" - module.save_to(os.path.join(path, filename)) - - if self._step_freq > -1: - filename = f"trainer-STEP-{state['step']}.pt" - state.save_state_to(f"{path}/{filename}") - self._saved_ckpts.append(f"-{state['step']}.pt") - else: - filename = f"trainer-EPOCH-{state['epoch']}.pt" - state.save_state_to(f"{path}/{filename}") - self._saved_ckpts.append(f"-{state['epoch']}.pt") - - if len(self._saved_ckpts) > self._ckpt2keep: - for end in self._saved_ckpts[: -self._ckpt2keep]: - for file in glob.glob(f'{path}/*{end}'): - os.remove(file) - self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] - logging.info(f'Saved checkpoint: {path}/{filename}') - - def __restore_from(self, path, state): - if not os.path.isdir(path): - if self._force_load: - raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.") - logging.warning(f"Checkpoint folder {path} not found!") - else: - logging.info(f"Found checkpoint folder {path}. Will attempt to restore checkpoints from it.") - modules_to_restore = [] - modules_to_restore_name = [] - for module in AppState().modules: - if module.num_weights > 0: - modules_to_restore.append(module) - modules_to_restore_name.append(str(module)) - try: - module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) - - for mod, checkpoint in zip(modules_to_restore, module_checkpoints): - mod.restore_from(checkpoint, state["local_rank"]) - except (ValueError) as e: - if self._force_load: - raise ValueError( - "force_load was set to True for checkpoint callback but a checkpoint was not found." - ) - logging.warning(e) - logging.warning( - f"Checkpoint folder {path} was present but nothing was restored. Continuing training from random " - "initialization." 
- ) - return - - try: - trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) - state.restore_state_from(trainer_checkpoints[0]) - # for tr, checkpoint in zip([self.action], trainer_checkpoints): - except (ValueError) as e: - logging.warning(e) - logging.warning( - "Trainer state such as optimizer state and current step/epoch was not restored. Pretrained weights" - " have still been restore and fine-tuning should continue fine." - ) - return - - def on_train_start(self, state): - num_parameters = 0 - unique_mod_names = set() - for module in AppState().modules: - if module.num_weights > 0: - if str(module) in unique_mod_names: - raise NotImplementedError( - "There were two instances of the same module. Please overwrite __str__() of one of the " - "modules." - ) - unique_mod_names.add(str(module)) - num_parameters += module.num_weights - logging.info(f"Found {len(unique_mod_names)} modules with weights:") - for name in unique_mod_names: - logging.info(f"{name}") - logging.info(f"Total model parameters: {num_parameters}") - self.__restore_from(self._load_from_folder, state) - - def on_step_end(self, state): - step = state["step"] - if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: - self.__save_to(self._folder, state) - - def on_train_end(self, state): - if self._step_freq > 0 or self._epoch_freq > 0: - self.__save_to(self._folder, state) - - def on_epoch_end(self, state): - epoch = state["epoch"] - if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: - self.__save_to(self._folder, state) - - class EvaluatorCallback(ActionCallback): """ For callback documentation: please see diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 2dc63ffca36b..8dba04e4acd3 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -78,6 +78,9 @@ def check_tensor_cached(self, unique_name: str): args: unique_name (str): The NmTensor.unique_name that we want to check for. + + returns: + (bool) whether the tensor with unique_name has been computed yet. """ if self.tensor_dict[unique_name] is None: return False @@ -93,6 +96,10 @@ def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return None if the tensor has not been computed yet. Defaults to True. + + returns: + (torch.tensor or None) representing the computed value of the requested name. Returns None if compute is + False and the tensor has not been computed yet. 
""" if isinstance(name, NmTensor): unique_name = name.unique_name @@ -298,7 +305,7 @@ def global_rank(self): def train( self, tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[ActionCallback]], + callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, @@ -740,7 +747,7 @@ def train( training_graph=None, optimizer=None, optimization_params=None, - callbacks: Optional[List[ActionCallback]] = None, + callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]] = None, lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, From d95b2d4d45c7084a696a3035d340fc16352d11ad Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 16:34:58 -0700 Subject: [PATCH 26/40] style Signed-off-by: Jason --- nemo/core/callbacks.py | 55 ++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 2adac28d5530..99a76d5be872 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -43,6 +43,7 @@ class NeMoCallback(ABC): first argument: the current action state. This state is a StateWrapper object. TODO: Add a link to documentation. """ + def on_train_start(self, state): pass @@ -72,6 +73,7 @@ def on_train_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_train_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -86,6 +88,7 @@ def on_epoch_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_epoch_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -100,6 +103,7 @@ def on_batch_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_batch_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -114,6 +118,7 @@ def on_step_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_step_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -128,6 +133,7 @@ def on_step_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_step_end callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -142,6 +148,7 @@ def on_batch_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_batch_end callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -156,6 +163,7 @@ def on_epoch_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_epoch_end callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -170,6 +178,7 @@ def on_train_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_train_end callback event. 
""" + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -180,9 +189,8 @@ def on_train_end(self, state): return NeMoCallbackWrapper(func) - class SimpleLogger(NeMoCallback): - def __init__(self, step_freq:int = 100, tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"]): + def __init__(self, step_freq: int = 100, tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"]): """A simple callback that prints tensors to screen. It's default option is to print the training loss every 100 steps. Additional tensors can be printed by adding them to the tensors_to_log argument. @@ -204,13 +212,13 @@ def on_step_end(self, state): class TensorboardLogger(NeMoCallback): def __init__( - self, - tb_writer: 'torch.utils.tensorboard.SummaryWriter', - step_freq:int=100, - tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], - custom_tb_log_func:Callable[[Union[str, 'NmTensor']],None]=None, - log_epoch:bool=True - ): + self, + tb_writer: 'torch.utils.tensorboard.SummaryWriter', + step_freq: int = 100, + tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"], + custom_tb_log_func: Callable[[Union[str, 'NmTensor']], None] = None, + log_epoch: bool = True, + ): """A tensorboard callback that logs tensors using a tensorboard writer object. It's default option is to log the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log argument. In order to log complex tensorboard entities, the custom_tb_log_func must be passed it. By default, @@ -259,14 +267,14 @@ def on_step_end(self, state): class WandBLogger(NeMoCallback): def __init__( - self, - step_freq:int=100, - tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], - wandb_name:str=None, - wandb_project:str=None, - args=None, - log_epoch:bool=True - ): + self, + step_freq: int = 100, + tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"], + wandb_name: str = None, + wandb_project: str = None, + args=None, + log_epoch: bool = True, + ): """A [Weights & Biases](https://docs.wandb.com/) callback that logs tensors to W&B. It's default option is to log the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log argument. By default, it always logs the current epoch and the time taken per epoch. @@ -335,12 +343,12 @@ def _wandb_log(self, tensors_logged): class CheckpointCallback(NeMoCallback): def __init__( self, - folder:str, - load_from_folder:str=None, - step_freq:int=-1, - epoch_freq:int=-1, - checkpoints_to_keep:int=4, - force_load:bool=False, + folder: str, + load_from_folder: str = None, + step_freq: int = -1, + epoch_freq: int = -1, + checkpoints_to_keep: int = 4, + force_load: bool = False, ): """A callback that does checkpointing of module weights and trainer (incl. optimizer) status. @@ -486,7 +494,6 @@ def on_epoch_end(self, state): self.__save_to(self._folder, state) - class ActionCallback(ABC): """Abstract interface for callbacks. 
""" From 7bb53cdfcde685f74368c9b1d735195251b94fe7 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 17:00:23 -0700 Subject: [PATCH 27/40] add deprecation warnings Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 7 ++++++- nemo/core/callbacks.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 86d8ce6aaba7..49848ebe0c42 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -115,7 +115,12 @@ def epoch(self, epoch): self._epoch = epoch @property - @deprecated + @deprecated(version="0.12", explanation="epoch_num has been deprecated in favour of epoch.") + def epoch_num(self): + return self._epoch + + @epoch_num.setter + @deprecated(version="0.12", explanation="epoch_num has been deprecated in favour of epoch.") def epoch_num(self): return self._epoch diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 99a76d5be872..511da94199a3 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -28,6 +28,7 @@ import nemo from nemo.utils import get_checkpoint_from_dir, logging from nemo.utils.app_state import AppState +from nemo.utils.decorators import deprecated try: import wandb @@ -560,6 +561,7 @@ class ModuleSaverCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__( self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, ): @@ -618,6 +620,7 @@ class SimpleLossLoggerCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__( self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, ): @@ -835,6 +838,7 @@ class ValueSetterCallback(ActionCallback): Policy = _Policy Method = _Method + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): super().__init__() @@ -880,6 +884,7 @@ def on_iteration_start(self): class UnfreezeCallback(ActionCallback): + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__(self, modules, start_epoch=0): super().__init__() @@ -897,6 +902,7 @@ class OldWandbCallback(ActionCallback): Log metrics to [Weights & Biases](https://docs.wandb.com/) """ + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__( self, train_tensors=[], wandb_name=None, wandb_project=None, args=None, update_freq=25, ): From d615efa060a3eacbb32948292446d80c97a075a7 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 17:01:19 -0700 Subject: [PATCH 28/40] changelog Signed-off-by: Jason --- CHANGELOG.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a749af6e06b..5ffaf33f5b89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,6 +83,7 @@ To release a new version, please update the changelog as followed: ### Changed - Syncs across workers at each step to check for NaN or inf loss. Terminates all workers if stop\_on\_nan\_loss is set (as before), lets Apex deal with it if apex.amp optimization level is O1 or higher, and skips the step across workers otherwise. 
([PR #637](https://github.com/NVIDIA/NeMo/pull/637)) - @redoctopus +- Updated the callback system. Old callbacks will be deprecated in version 0.12. ([PR #615](https://github.com/NVIDIA/NeMo/pull/615)) - @blisc ### Dependencies Update @@ -123,7 +124,7 @@ files, along with unit tests, examples and tutorials ([PR #375](https://github.com/NVIDIA/NeMo/pull/375)) - @titu1994 ### Changed -- Refactoring of `nemo_nlp` collections: +- Refactoring of `nemo_nlp` collections: ([PR #368](https://github.com/NVIDIA/NeMo/pull/368)) - @VahidooX, @yzhang123, @ekmb - renaming and restructuring of files, folder, and functions in `nemo_nlp` - losses cleaned up. LossAggregatorNM moved to nemo/backends/pytorch/common/losses @@ -138,7 +139,7 @@ files, along with unit tests, examples and tutorials ([PR #284](https://github.com/NVIDIA/NeMo/pull/284)) - @stasbel - NeMo is not longer using pep8 code style rules. Code style rules are now enforced with `isort` and `black` incorporated into CI checks. ([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel -- Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). +- Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). ([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia - Updated nemo's use of the logging library. from nemo import logging is now the reccomended way of using the nemo logger. neural_factory.logger and all other instances of logger are now deprecated and planned for removal in the next version. Please see PR 267 for complete change information. 
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc @@ -147,7 +148,7 @@ files, along with unit tests, examples and tutorials - Added TRADE (dialogue state tracking model) on MultiWOZ dataset ([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX -- Question answering: +- Question answering: ([PR #390](https://github.com/NVIDIA/NeMo/pull/390)) - @yzhang123 - Changed question answering task to use Roberta and Albert as alternative backends to Bert - Added inference mode that does not require ground truth labels @@ -158,7 +159,7 @@ files, along with unit tests, examples and tutorials ### Deprecated ### Fixed -- Critical fix of the training action on CPU +- Critical fix of the training action on CPU ([PR #308](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia - Fixed issue in Tacotron 2 prenet ([PR #444](https://github.com/NVIDIA/NeMo/pull/444)) - @blisc From 21f4cf10bba1082bf174cc585c0751840c99f04f Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 17:09:51 -0700 Subject: [PATCH 29/40] rename oldwandbcallback Signed-off-by: Jason --- nemo/core/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 511da94199a3..97784d2b66e3 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -897,7 +897,7 @@ def on_iteration_start(self): m.unfreeze() -class OldWandbCallback(ActionCallback): +class WandbCallback(ActionCallback): """ Log metrics to [Weights & Biases](https://docs.wandb.com/) """ From 1c99f548b93366528c68ef6af93e1b4d74c59d82 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 22 May 2020 15:11:54 -0700 Subject: [PATCH 30/40] test Signed-off-by: Jason --- nemo/core/callbacks.py | 16 ++ nemo/utils/nemo_logging.py | 33 ++++ tests/unit/core/test_nemo_callbacks.py | 209 +++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100755 tests/unit/core/test_nemo_callbacks.py diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 97784d2b66e3..4b9826e9b6c1 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -15,6 +15,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+# __all__ = [ +# "NeMoCallback", +# "SimpleLogger", +# "TensorboardLogger", +# "WandBLogger", +# "CheckpointCallback", +# "on_train_start", +# "on_train_end", +# "on_epoch_start", +# "on_epoch_end", +# "on_batch_start", +# "on_batch_end", +# "on_step_start", +# "on_step_end", +# ] + import datetime import glob import os diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 8a2bd06040d6..ee5cb0f6ee4d 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -212,6 +212,39 @@ def patch_stderr_handler(self, stream): else: raise RuntimeError("Impossible to patch logging handlers if handler does not exist") + @contextmanager + def patch_stdout_handler(self, stream): + """ Useful for unittests + """ + if self._logger is not None: + try: + old_stream = self._handlers["stream_stdout"].stream + if old_stream is None: + raise ValueError + + # Port backwards set_stream() from python 3.7 + self._handlers["stream_stdout"].acquire() + try: + self._handlers["stream_stdout"].flush() + self._handlers["stream_stdout"].stream = stream + finally: + self._handlers["stream_stdout"].release() + + yield stream + except (KeyError, ValueError): + raise RuntimeError("Impossible to patch logging handlers if handler does not exist") + finally: + # Port backwards set_stream() from python 3.7 + self._handlers["stream_stdout"].acquire() + try: + self._handlers["stream_stdout"].flush() + self._handlers["stream_stdout"].stream = old_stream + finally: + self._handlers["stream_stdout"].release() + + else: + raise RuntimeError("Impossible to patch logging handlers if handler does not exist") + @contextmanager def temp_verbosity(self, verbosity_level): """Sets the a temporary threshold for what messages will be logged.""" diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py new file mode 100755 index 000000000000..2242ece9775e --- /dev/null +++ b/tests/unit/core/test_nemo_callbacks.py @@ -0,0 +1,209 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import os +import shutil +from io import StringIO + +import pytest +from tensorboard.backend.event_processing import event_file_inspector as efi +from torch.utils.tensorboard import SummaryWriter + +from nemo.backends.pytorch.tutorials import MSELoss, RealFunctionDataLayer, TaylorNet +from nemo.core.callbacks import * +from nemo.utils import logging + +@pytest.mark.usefixtures("neural_factory") +class TestNeMoCallbacks(): + @pytest.fixture() + def clean_up(self): + yield + self.nf.reset_trainer() + + @pytest.fixture() + def create_tensorboard_file(self): + os.makedirs("temp") + summary_writter = SummaryWriter("temp") + yield summary_writter + shutil.rmtree("temp") + + @pytest.mark.unit + def test_SimpleLogger(self, clean_up): + data_source = RealFunctionDataLayer(n=100, batch_size=1) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + # Mock up both std and stderr streams. + with logging.patch_stdout_handler(StringIO()) as std_out: + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=[SimpleLogger(step_freq=1)], + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + output_lines = std_out.getvalue().splitlines() + assert len(output_lines) == 4 + for line in output_lines: + assert "loss" in line + + @pytest.mark.unit + def test_rename_and_log(self, clean_up): + data_source = RealFunctionDataLayer(n=100, batch_size=1) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + y_pred.rename("y_pred") + + # Mock up both std and stderr streams. + with logging.patch_stdout_handler(StringIO()) as std_out: + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=[SimpleLogger(step_freq=1, tensors_to_log=['y_pred'])], + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + output_lines = std_out.getvalue().splitlines() + assert len(output_lines) == 4 + for line in output_lines: + assert "y_pred" in line + + @pytest.mark.unit + def test_TensorboardLogger(self, clean_up, create_tensorboard_file): + data_source = RealFunctionDataLayer(n=100, batch_size=1) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + tb_logger = TensorboardLogger(create_tensorboard_file, step_freq=1) + callbacks = [tb_logger] + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + # efi.inspect("temp", tag="loss") + inspection_units = efi.get_inspection_units("temp", "", "loss") + + # Make sure there is only 1 tensorboard file + assert len(inspection_units) == 1 + + # Assert that there the loss scalars has been logged 4 times + assert len(inspection_units[0].field_to_obs['scalars']) == 4 + + @pytest.mark.unit + def test_epoch_decorators(self, clean_up): + data_source = RealFunctionDataLayer(n=24, batch_size=12) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. 
+ x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + epoch_start_counter = [0] + epoch_end_counter = [0] + @on_epoch_start + def count_epoch_starts(state, counter=epoch_start_counter): + counter[0] += 1 + + @on_epoch_end + def count_epoch_ends(state, counter=epoch_end_counter): + counter[0] -= 1 + + callbacks = [count_epoch_starts, count_epoch_ends] + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + assert epoch_start_counter[0] == 2 + assert epoch_end_counter[0] == -2 + + @pytest.mark.unit + def test_step_batch_decorators(self, clean_up): + """Showcase the difference between step and batch""" + data_source = RealFunctionDataLayer(n=24, batch_size=12) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + epoch_step_counter = [0] + epoch_batch_counter = [0] + @on_step_end + def count_steps(state, counter=epoch_step_counter): + counter[0] += 1 + + @on_batch_end + def count_batches(state, counter=epoch_batch_counter): + counter[0] += 1 + + callbacks = [count_steps, count_batches] + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + # when grad accumlation steps (aka iter_per_step or batches_per_step) = 1, num_steps == num_batches + assert epoch_step_counter[0] == 4 + assert epoch_batch_counter[0] == 4 + + epoch_step_counter[0] = 0 + epoch_batch_counter[0] = 0 + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd", + reset=True, + batches_per_step=2 + ) + + # when grad accumlation steps != 1, num_steps != num_batches + assert epoch_step_counter[0] == 4 + assert epoch_batch_counter[0] == 8 From b976ec0a2b1839052534341d83a743603b3241be Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 22 May 2020 15:12:10 -0700 Subject: [PATCH 31/40] style Signed-off-by: Jason --- tests/unit/core/test_nemo_callbacks.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py index 2242ece9775e..3152b6fd4ffc 100755 --- a/tests/unit/core/test_nemo_callbacks.py +++ b/tests/unit/core/test_nemo_callbacks.py @@ -28,8 +28,9 @@ from nemo.core.callbacks import * from nemo.utils import logging + @pytest.mark.usefixtures("neural_factory") -class TestNeMoCallbacks(): +class TestNeMoCallbacks: @pytest.fixture() def clean_up(self): yield @@ -59,7 +60,7 @@ def test_SimpleLogger(self, clean_up): tensors_to_optimize=[loss_tensor], callbacks=[SimpleLogger(step_freq=1)], optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) output_lines = std_out.getvalue().splitlines() @@ -86,7 +87,7 @@ def test_rename_and_log(self, clean_up): tensors_to_optimize=[loss_tensor], callbacks=[SimpleLogger(step_freq=1, tensors_to_log=['y_pred'])], optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) output_lines = std_out.getvalue().splitlines() @@ -112,7 +113,7 @@ def test_TensorboardLogger(self, clean_up, create_tensorboard_file): tensors_to_optimize=[loss_tensor], callbacks=callbacks, optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + 
optimizer="sgd", ) # efi.inspect("temp", tag="loss") @@ -137,6 +138,7 @@ def test_epoch_decorators(self, clean_up): epoch_start_counter = [0] epoch_end_counter = [0] + @on_epoch_start def count_epoch_starts(state, counter=epoch_start_counter): counter[0] += 1 @@ -151,7 +153,7 @@ def count_epoch_ends(state, counter=epoch_end_counter): tensors_to_optimize=[loss_tensor], callbacks=callbacks, optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) assert epoch_start_counter[0] == 2 @@ -171,6 +173,7 @@ def test_step_batch_decorators(self, clean_up): epoch_step_counter = [0] epoch_batch_counter = [0] + @on_step_end def count_steps(state, counter=epoch_step_counter): counter[0] += 1 @@ -185,7 +188,7 @@ def count_batches(state, counter=epoch_batch_counter): tensors_to_optimize=[loss_tensor], callbacks=callbacks, optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) # when grad accumlation steps (aka iter_per_step or batches_per_step) = 1, num_steps == num_batches @@ -201,7 +204,7 @@ def count_batches(state, counter=epoch_batch_counter): optimization_params={"max_steps": 4, "lr": 0.01}, optimizer="sgd", reset=True, - batches_per_step=2 + batches_per_step=2, ) # when grad accumlation steps != 1, num_steps != num_batches From 6ec04aa342cc9ee66dbf6d1673b60d6cb2d565a7 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 15:55:10 -0700 Subject: [PATCH 32/40] first commit of changes Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 20 +- nemo/core/actions.py | 299 +++++++++++++ nemo/core/neural_factory.py | 442 -------------------- nemo/core/neural_types/__init__.py | 1 - nemo/core/neural_types/nmtensor_registry.py | 7 +- tests/unit/core/test_nemo_callbacks.py | 19 +- 6 files changed, 313 insertions(+), 475 deletions(-) create mode 100755 nemo/core/actions.py diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 49848ebe0c42..7ae84d1893f3 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,7 +21,8 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves +from nemo.core.neural_factory import OperationMode, Optimization +from nemo.core.actions import Actions, TrainingState, topological_sort_from_leaves from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -1387,23 +1388,6 @@ def save_state_to(self, path): ) self.ddp_module_dict[key] = module - # # Convert batchnorm modules to synced if applicable - # if synced_batchnorm and isinstance(pmodule, torch.nn.Module): - # world_size = dist.get_world_size() - # if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: - # raise ValueError( - # f"Synchronized batch norm group size" - # f" ({synced_batchnorm_groupsize}) must be 0" - # f" or divide total number of GPUs" - # f" ({world_size})." 
- # ) - # process_group = create_syncbn_process_group(synced_batchnorm_groupsize) - # pmodule = convert_syncbn(pmodule, process_group=process_group) - - # self.module_reference_table[key] = ( - # self.module_reference_table[key][0], - # pmodule, - # ) # single GPU/CPU training else: if t_dataset is not None: diff --git a/nemo/core/actions.py b/nemo/core/actions.py new file mode 100755 index 000000000000..6a988b265e06 --- /dev/null +++ b/nemo/core/actions.py @@ -0,0 +1,299 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import List, Optional, Union + + +from nemo.core.neural_types import NmTensor +from nemo.utils.app_state import AppState +from nemo.core.neural_modules import ModuleType +from nemo.core.neural_factory import Optimization + + +def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: 'TrainingState' = None): + """A function that accepts a list of NmTensors that need to be computed and constructs a callchain DAG that starts + from a datalayerNM and can be used to compute the NmTensors. + + args: + leaf_nmtensors (List[NmTensors]): The tensors to be computed + cached_training_state (TrainingState): A dictionary of already computed tensors. + Defaults to None meaning an empty cache. 
+ + returns: + top_sorted_modules: the callchain DAG + """ + + def create_node(producer, producer_args): + if producer_args is None: + return tuple((producer, ())) + return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) + + def is_in_degree_zero(node, processed_nodes, cached_training_state): + """A node has in degree of zero""" + if node[1] == (): + return True + for _, nmtensor in node[1]: + node = create_node(nmtensor.producer, nmtensor.producer_args) + if node not in processed_nodes: + if cached_training_state and cached_training_state.check_tensor_cached(nmtensor.unique_name): + continue + return False + return True + + hooks = leaf_nmtensors if isinstance(leaf_nmtensors, list) else [leaf_nmtensors] + + # ensures that no tensors are processed twice + processed_nmtensors = set() + + indices_to_remove = [] + # Check for duplicates in hook + for i, nmtensor in enumerate(hooks): + if nmtensor in processed_nmtensors: + indices_to_remove.append(i) + else: + processed_nmtensors.add(nmtensor) + + for i in reversed(indices_to_remove): + hooks.pop(i) + + _top_sorted_modules = [] + all_nodes = {} + + # extract all nodes to all_nodes set + hooks_lst = list(hooks) + while len(hooks_lst) > 0: + # take nmtensor from the end of the list + nmtensor = hooks_lst.pop() + producer_args = nmtensor.producer_args + + node = create_node(nmtensor.producer, producer_args) + # Store nmtensor as an output of its producer + # first make sure all keys are present per output port + # and nm is inside all_nodes + if node not in all_nodes: + all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} + # second, populate output port with current nmtensor + # where applicable + all_nodes[node][nmtensor.name] = nmtensor + processed_nmtensors.add(nmtensor) + + new_tensors = set() + if producer_args is not None and producer_args != {}: + for _, new_nmtensor in producer_args.items(): + if new_nmtensor not in processed_nmtensors: + new_tensors.add(new_nmtensor) + + if cached_training_state: + for _, input_nmtensor in producer_args.items(): + if cached_training_state.check_tensor_cached(input_nmtensor.unique_name): + new_tensors.remove(input_nmtensor) + + for new_nmtensor in new_tensors: + # put in the start of list + hooks_lst.insert(0, new_nmtensor) + + all_node_with_output = [] + # Iterate over all_nodes to create new nodes that include its output + # now all nodes have (module, input tensors, output tensors) + for node in all_nodes: + all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) + + processed_nodes = [] + while len(all_node_with_output) > 0: + for node in all_node_with_output.copy(): + # if node's in_degree is zero it can be added to + # _top_sorted_modules + # this will also reduce in_degree of its children + if is_in_degree_zero(node, processed_nodes, cached_training_state): + _top_sorted_modules.append(node) + processed_nodes.append((node[0], node[1])) + all_node_with_output.remove(node) + + # Create top_sorted_modules aka callchain + top_sorted_modules = [] + for i, mod in enumerate(_top_sorted_modules): + top_sorted_modules.append((mod[0], dict(mod[1]), mod[2])) + # Ensure that there is only one dataset in callchain + if i > 0 and mod[0].type == ModuleType.datalayer: + raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") + + if cached_training_state and mod[0].type == ModuleType.datalayer: + raise ValueError("Could not compute tensor from current cached training state.") + + return top_sorted_modules + + +class TrainingState: + 
def __init__(self, action: 'Actions'): + """A class used to wrap the current training state of an Actions.train() function. This class holds a mapping + of tensor.unique_name -> it's backend tensor (eg Pytorch Tensor) or None if the tensor has been been computed + on the current step. + + args: + action (Actions): The Actions object this state is associated with. + """ + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action + + def tensor_list(self): + """Returns a list the unique names of all tensors. + """ + return self.tensor_dict.keys() + + def clear_dict(self): + """Clears the dictionary by setting all values to None. Used in-between training batches to clear it's state. + """ + for name in self.tensor_dict: + self.tensor_dict[name] = None + + def set_tensor(self, tensor: NmTensor, value: 'torch.Tensor'): + """Sets the value of tensor + + args: + tensor (NmTensor) + value (torch.Tensor) + """ + self.tensor_dict[tensor.unique_name] = value + + def check_tensor_cached(self, unique_name: str): + """Checks to see the tensor value has been computed in the current step yet. + + args: + unique_name (str): The NmTensor.unique_name that we want to check for. + + returns: + (bool) whether the tensor with unique_name has been computed yet. + """ + if self.tensor_dict[unique_name] is None: + return False + return True + + def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): + """Returns the value associated with a tensor. And optionally, computes the value of the tensor if not already + set. + + args: + name (str, NmTensor): The user-defined name for a tensor or the NmTensor itself. + compute (bool): If True and the tensor has not already been computed, there will be an attempt to create a + call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return + None if the tensor has not been computed yet. + Defaults to True. + + returns: + (torch.tensor or None) representing the computed value of the requested name. Returns None if compute is + False and the tensor has not been computed yet. + """ + if isinstance(name, NmTensor): + unique_name = name.unique_name + else: + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None and compute: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + callchain.insert(0, ()) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + + +class Actions(ABC): + """Basic actions allowed on graphs of Neural Modules""" + + def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxprO0): + self._local_rank = local_rank + self._global_rank = global_rank + self._optim_level = optimization_level + + @property + def local_rank(self): + """Local rank during distributed execution. None if single GPU/CPU + + Returns: + (int) rank or worker or None if not in distributed model + """ + return self._local_rank + + @property + def global_rank(self): + """Global rank during distributed execution. 
None if single GPU/CPU + + Returns: + (int) rank or worker or None if not in distributed model + """ + return self._global_rank + + @abstractmethod + def train( + self, + tensors_to_optimize: List[NmTensor], + callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], + lr_policy=None, + batches_per_step=None, + stop_on_nan_loss=False, + ): + """This action executes training and (optionally) evaluation. + + Args: + tensors_to_optimize: which tensors to optimize. Typically this is + single loss tesnor. + callbacks: list of callback objects + lr_policy: function which should take (initial_lr, step, epoch) and + return learning rate + batches_per_step: number of mini-batches to process before one + optimizer step. (default: None, same as 1). Use this + to simulate larger batch sizes on hardware which could not fit + larger batch in memory otherwise. Effectively, this will make + "algorithmic" batch size per GPU/worker = batches_per_step* + batch_size + stop_on_nan_loss: (default: False) If set to True, the training + will stop if loss=nan or inf. If set to False, the training + will continue. + + Returns: + None + """ + pass + + @abstractmethod + def infer(self, tensors: List[NmTensor]): + """This action executes inference. Nothing is optimized. + Args: + tensors: which tensors to evaluate. + + Returns: + None + """ + pass + + @abstractmethod + def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): + """ + Creates an optimizer object to be use in the train() method. + + Args: + optimizer: Specifies which optimizer to use. + things_to_optimize: A list of neural modules or tensors to be + optimized. + optimizer_params: Specifies the parameters of the optimizer + + Returns: + Optimizer + """ + pass diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 8dba04e4acd3..87de6e7ac3ac 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -20,13 +20,11 @@ 'OperationMode', 'Optimization', 'DeviceType', - 'Actions', 'NeuralModuleFactory', 'DeploymentFormat', ] import random -from abc import ABC, abstractmethod from enum import Enum from typing import List, Optional, Union @@ -36,202 +34,9 @@ from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils import ExpManager, logging -from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated -class TrainingState: - def __init__(self, action: 'Actions'): - """A class used to wrap the current training state of an Actions.train() function. This class holds a mapping - of tensor.unique_name -> it's backend tensor (eg Pytorch Tensor) or None if the tensor has been been computed - on the current step. - - args: - action (Actions): The Actions object this state is associated with. - """ - tensor_naming_registery = AppState().tensor_names - self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) - self._action = action - - def tensor_list(self): - """Returns a list the unique names of all tensors. - """ - return self.tensor_dict.keys() - - def clear_dict(self): - """Clears the dictionary by setting all values to None. Used in-between training batches to clear it's state. 
- """ - for name in self.tensor_dict: - self.tensor_dict[name] = None - - def set_tensor(self, tensor: NmTensor, value: 'torch.Tensor'): - """Sets the value of tensor - - args: - tensor (NmTensor) - value (torch.Tensor) - """ - self.tensor_dict[tensor.unique_name] = value - - def check_tensor_cached(self, unique_name: str): - """Checks to see the tensor value has been computed in the current step yet. - - args: - unique_name (str): The NmTensor.unique_name that we want to check for. - - returns: - (bool) whether the tensor with unique_name has been computed yet. - """ - if self.tensor_dict[unique_name] is None: - return False - return True - - def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): - """Returns the value associated with a tensor. And optionally, computes the value of the tensor if not already - set. - - args: - name (str, NmTensor): The user-defined name for a tensor or the NmTensor itself. - compute (bool): If True and the tensor has not already been computed, there will be an attempt to create a - call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return - None if the tensor has not been computed yet. - Defaults to True. - - returns: - (torch.tensor or None) representing the computed value of the requested name. Returns None if compute is - False and the tensor has not been computed yet. - """ - if isinstance(name, NmTensor): - unique_name = name.unique_name - else: - unique_name = AppState().tensor_names[name] - tensor_value = self.tensor_dict[unique_name] - if tensor_value is None and compute: - nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] - callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) - callchain.insert(0, ()) - self._action.nm_graph_forward_pass(callchain, self.tensor_dict) - tensor_value = self.tensor_dict[unique_name] - return tensor_value - - -def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: TrainingState = None): - """A function that accepts a list of NmTensors that need to be computed and constructs a callchain DAG that starts - from a datalayerNM and can be used to compute the NmTensors. - - args: - leaf_nmtensors (List[NmTensors]): The tensors to be computed - cached_training_state (TrainingState): A dictionary of already computed tensors. - Defaults to None meaning an empty cache. 
- - returns: - top_sorted_modules: the callchain DAG - """ - from nemo.backends.pytorch.nm import DataLayerNM # TODO: Replace this with a backend agnostic data layer - - def create_node(producer, producer_args): - if producer_args is None: - return tuple((producer, ())) - else: - return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) - - def is_in_degree_zero(node, processed_nodes, cached_training_state): - """A node has in degree of zero""" - if node[1] == (): - return True - for portname, nmtensor in node[1]: - nd = create_node(nmtensor.producer, nmtensor.producer_args) - if nd not in processed_nodes: - if cached_training_state and cached_training_state.check_tensor_cached(nmtensor.unique_name): - continue - return False - return True - - hooks = leaf_nmtensors if isinstance(leaf_nmtensors, list) else [leaf_nmtensors] - - # ensures that no tensors are processed twice - processed_nmtensors = set() - - indices_to_remove = [] - # Check for duplicates in hook - for i, nmtensor in enumerate(hooks): - if nmtensor in processed_nmtensors: - indices_to_remove.append(i) - else: - processed_nmtensors.add(nmtensor) - - for i in reversed(indices_to_remove): - hooks.pop(i) - - _top_sorted_modules = [] - all_nodes = {} - - # extract all nodes to all_nodes set - hooks_lst = list(hooks) - while len(hooks_lst) > 0: - # take nmtensor from the end of the list - nmtensor = hooks_lst.pop() - producer_args = nmtensor.producer_args - - node = create_node(nmtensor.producer, producer_args) - # Store nmtensor as an output of its producer - # first make sure all keys are present per output port - # and nm is inside all_nodes - if node not in all_nodes: - all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} - # second, populate output port with current nmtensor - # where applicable - all_nodes[node][nmtensor.name] = nmtensor - processed_nmtensors.add(nmtensor) - - new_tensors = set() - if producer_args is not None and producer_args != {}: - for _, new_nmtensor in producer_args.items(): - if new_nmtensor not in processed_nmtensors: - new_tensors.add(new_nmtensor) - - # TODO - if cached_training_state: - for name, input_nmtensor in producer_args.items(): - if cached_training_state.check_tensor_cached(input_nmtensor.unique_name): - new_tensors.remove(input_nmtensor) - - for new_nmtensor in new_tensors: - # put in the start of list - hooks_lst.insert(0, new_nmtensor) - - all_node_with_output = [] - # Iterate over all_nodes to create new nodes that include its output - # now all nodes have (module, input tensors, output tensors) - for node in all_nodes: - all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) - - processed_nodes = [] - while len(all_node_with_output) > 0: - for node in all_node_with_output.copy(): - # if node's in_degree is zero it can be added to - # _top_sorted_modules - # this will also reduce in_degree of its children - if is_in_degree_zero(node, processed_nodes, cached_training_state): - _top_sorted_modules.append(node) - processed_nodes.append((node[0], node[1])) - all_node_with_output.remove(node) - - # Create top_sorted_modules aka callchain - top_sorted_modules = [] - for i, m in enumerate(_top_sorted_modules): - top_sorted_modules.append((m[0], dict(m[1]), m[2])) - # Ensure that there is only one dataset in callchain - if i > 0 and isinstance(m[0], DataLayerNM): - raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") - - # TODO - if cached_training_state and isinstance(m[0], DataLayerNM): - raise 
ValueError("Could not compute tensor from current cached training state.") - - return top_sorted_modules - - class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" @@ -275,238 +80,6 @@ class DeviceType(Enum): AllGpu = 3 -class Actions(ABC): - """Basic actions allowed on graphs of Neural Modules""" - - def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxprO0): - self._local_rank = local_rank - self._global_rank = global_rank - self._optim_level = optimization_level - - @property - def local_rank(self): - """Local rank during distributed execution. None if single GPU/CPU - - Returns: - (int) rank or worker or None if not in distributed model - """ - return self._local_rank - - @property - def global_rank(self): - """Global rank during distributed execution. None if single GPU/CPU - - Returns: - (int) rank or worker or None if not in distributed model - """ - return self._global_rank - - @abstractmethod - def train( - self, - tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], - lr_policy=None, - batches_per_step=None, - stop_on_nan_loss=False, - ): - """This action executes training and (optionally) evaluation. - - Args: - tensors_to_optimize: which tensors to optimize. Typically this is - single loss tesnor. - callbacks: list of callback objects - lr_policy: function which should take (initial_lr, step, epoch) and - return learning rate - batches_per_step: number of mini-batches to process before one - optimizer step. (default: None, same as 1). Use this - to simulate larger batch sizes on hardware which could not fit - larger batch in memory otherwise. Effectively, this will make - "algorithmic" batch size per GPU/worker = batches_per_step* - batch_size - stop_on_nan_loss: (default: False) If set to True, the training - will stop if loss=nan or inf. If set to False, the training - will continue. - - Returns: - None - """ - pass - - @abstractmethod - def infer(self, tensors: List[NmTensor]): - """This action executes inference. Nothing is optimized. - Args: - tensors: which tensors to evaluate. - - Returns: - None - """ - pass - - # @abstractmethod - # def save_state_to(self, path: str): - # """ - # Saves current state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # pass - - # @abstractmethod - # def restore_state_from(self, path: str): - # """ - # Restores state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # pass - - @abstractmethod - def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): - """ - Creates an optimizer object to be use in the train() method. - - Args: - optimizer: Specifies which optimizer to use. - things_to_optimize: A list of neural modules or tensors to be - optimized. 
- optimizer_params: Specifies the parameters of the optimizer - - Returns: - Optimizer - """ - pass - - # def _perform_on_step_start(self, callbacks): - # # TODO: Most of these checks can be relaxed since we enforce callbacks - # # to be a list of ActionCallback objects - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_iteration_start() - # elif isinstance(callback, NeMoCallback): - # callback.on_step_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_step_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_iteration_end() - # elif isinstance(callback, NeMoCallback): - # callback.on_step_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_action_start(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_action_start() - # elif isinstance(callback, NeMoCallback): - # callback.on_train_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_action_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_action_end() - # elif isinstance(callback, NeMoCallback): - # callback.on_train_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_epoch_start(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_epoch_start() - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_epoch_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_epoch_end() - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_batch_start(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # continue - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_batch_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if 
isinstance(callback, ActionCallback): - # continue - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _init_callbacks(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.action = self - - # def _update_callbacks( - # self, callbacks=None, registered_tensors=None, final_loss=None, - # ): - # # if self.local_rank is None or self.local_rank == 0: - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback._registered_tensors = registered_tensors - # else: # For now, we can use the old callback function. In the future we should improve this - # self.training_state.tensor_dict["loss"] = final_loss - - -def _str_to_opt_level(opt_str: str) -> Optimization: - number = int(opt_str[1:]) - if number not in Optimization._value2member_map_: - raise ValueError(f"Unknown optimization value {opt_str}") - return Optimization(number) - - class NeuralModuleFactory(object): _DEFAULT = None @@ -716,21 +289,6 @@ def get_module(self, name, collection, params, pretrained=False): NeuralModule instance """ - # TK: "optimization_level" is not passed as parameter anymore. - # if params is not None and "optimization_level" in params: - # if params["optimization_level"] != self._optim_level: - # logging.warning( - # "Module's {0} requested optimization level {1} is" - # "different from the one specified by factory - {2}." - # "Using: {3} for this module".format( - # name, params["optimization_level"], self._optim_level, params["optimization_level"], - # ) - # ) - # else: - # if params is None: - # params = {} - # params["optimization_level"] = self._optim_level - if self._backend == Backend.PyTorch: return self.__get_pytorch_module(name=name, collection=collection, params=params, pretrained=pretrained,) else: diff --git a/nemo/core/neural_types/__init__.py b/nemo/core/neural_types/__init__.py index 0ae947d90137..1fb5bf349076 100644 --- a/nemo/core/neural_types/__init__.py +++ b/nemo/core/neural_types/__init__.py @@ -19,4 +19,3 @@ from nemo.core.neural_types.comparison import * from nemo.core.neural_types.elements import * from nemo.core.neural_types.neural_type import * -from nemo.core.neural_types.nmtensor_registry import NmTensorNameRegistry diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py index f1d9591039a4..c8188c65af7f 100755 --- a/nemo/core/neural_types/nmtensor_registry.py +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -25,13 +25,14 @@ def __init__(self): # Create the nmtensor_naming_dict # which contains a mapping of str to NMTensor.unique_name self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' - self._nmtensor_uniname_dict = {"loss": None} + # Create a set object to track all unique_names + self._nmtensor_uniname_dict = set(["loss"]) @property def unique_names(self): """Returns the set of all NmTensors.unique_names + 'loss' """ - return self._nmtensor_uniname_dict.keys() + return list(self._nmtensor_uniname_dict) def register(self, tensor: 'NmTensor'): """Helper function to register a newly created NmTensor by adding it to self.__nmtensor_uniname_dict. 
@@ -46,7 +47,7 @@ def register(self, tensor: 'NmTensor'): pass # Finally, add object to the set. - self._nmtensor_uniname_dict[tensor.unique_name] = tensor + self._nmtensor_uniname_dict.add(tensor.unique_name) def rename_NmTensor(self, tensor: 'NmTensor', new_name: str): """Helper function that changes the naming dictionary to facilitate user name -> tensor.unique_name lookup. diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py index 3152b6fd4ffc..21e1671eed19 100755 --- a/tests/unit/core/test_nemo_callbacks.py +++ b/tests/unit/core/test_nemo_callbacks.py @@ -1,7 +1,7 @@ # ! /usr/bin/python # -*- coding: utf-8 -*- -# Copyright 2019 NVIDIA. All Rights Reserved. +# Copyright 2020 NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,13 +36,6 @@ def clean_up(self): yield self.nf.reset_trainer() - @pytest.fixture() - def create_tensorboard_file(self): - os.makedirs("temp") - summary_writter = SummaryWriter("temp") - yield summary_writter - shutil.rmtree("temp") - @pytest.mark.unit def test_SimpleLogger(self, clean_up): data_source = RealFunctionDataLayer(n=100, batch_size=1) @@ -96,7 +89,7 @@ def test_rename_and_log(self, clean_up): assert "y_pred" in line @pytest.mark.unit - def test_TensorboardLogger(self, clean_up, create_tensorboard_file): + def test_TensorboardLogger(self, clean_up, tmpdir): data_source = RealFunctionDataLayer(n=100, batch_size=1) trainable_module = TaylorNet(dim=4) loss = MSELoss() @@ -106,7 +99,11 @@ def test_TensorboardLogger(self, clean_up, create_tensorboard_file): y_pred = trainable_module(x=x) loss_tensor = loss(predictions=y_pred, target=y) - tb_logger = TensorboardLogger(create_tensorboard_file, step_freq=1) + logging_dir = tmpdir.mkdir("temp") + + writer = SummaryWriter(logging_dir) + + tb_logger = TensorboardLogger(writer, step_freq=1) callbacks = [tb_logger] self.nf.train( @@ -117,7 +114,7 @@ def test_TensorboardLogger(self, clean_up, create_tensorboard_file): ) # efi.inspect("temp", tag="loss") - inspection_units = efi.get_inspection_units("temp", "", "loss") + inspection_units = efi.get_inspection_units(logging_dir, "", "loss") # Make sure there is only 1 tensorboard file assert len(inspection_units) == 1 From 7009bee78377bb06521f141265a29fc263ec2aa0 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 16:23:37 -0700 Subject: [PATCH 33/40] some fixes Signed-off-by: Jason --- nemo/core/actions.py | 2 +- nemo/utils/app_state.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/core/actions.py b/nemo/core/actions.py index 6a988b265e06..686ad1b0c478 100755 --- a/nemo/core/actions.py +++ b/nemo/core/actions.py @@ -243,7 +243,7 @@ def global_rank(self): def train( self, tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], + callbacks: Optional[List[Union['ActionCallback', 'NeMoCallback']]], lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 8bbf120c0f60..32c46767e5b2 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -22,6 +22,7 @@ from nemo.utils.metaclasses import Singleton from nemo.utils.neural_graph.neural_graph_manager import NeuralGraphManager from nemo.utils.neural_graph.object_registry import ObjectRegistry +from nemo.core.neural_types import NmTensorNameRegistry class AppState(metaclass=Singleton): @@ -48,7 +49,7 @@ def 
__init__(self, device=None): # Create graph manager (registry with some additional functionality). self._neural_graph_manager = NeuralGraphManager() # Create NmTensor registry - self._nmtensor_name_registry = nemo.core.neural_types.NmTensorNameRegistry() + self._nmtensor_name_registry = NmTensorNameRegistry() @property def tensor_names(self): From 9f4566bd4f56620f69a8731b88fef7ad972f608b Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 16:27:55 -0700 Subject: [PATCH 34/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 2 +- nemo/core/actions.py | 5 ++--- nemo/utils/app_state.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7ae84d1893f3..d3e3261d5e55 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -20,9 +20,9 @@ from nemo.backends.pytorch.nm import DataLayerNM, TrainableNM from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor +from nemo.core.actions import Actions, TrainingState, topological_sort_from_leaves from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback from nemo.core.neural_factory import OperationMode, Optimization -from nemo.core.actions import Actions, TrainingState, topological_sort_from_leaves from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated diff --git a/nemo/core/actions.py b/nemo/core/actions.py index 686ad1b0c478..ad0757e04b39 100755 --- a/nemo/core/actions.py +++ b/nemo/core/actions.py @@ -18,11 +18,10 @@ from abc import ABC, abstractmethod from typing import List, Optional, Union - +from nemo.core.neural_factory import Optimization +from nemo.core.neural_modules import ModuleType from nemo.core.neural_types import NmTensor from nemo.utils.app_state import AppState -from nemo.core.neural_modules import ModuleType -from nemo.core.neural_factory import Optimization def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: 'TrainingState' = None): diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 32c46767e5b2..22ffdf8fce2a 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -19,10 +19,10 @@ # Moreover, at that point nemo module doesn't contain "core", so during "python module registration" # nothing from nemo.core, including e.g. types (so we cannot use them for "python 3 type hints"). 
import nemo +from nemo.core.neural_types import NmTensorNameRegistry from nemo.utils.metaclasses import Singleton from nemo.utils.neural_graph.neural_graph_manager import NeuralGraphManager from nemo.utils.neural_graph.object_registry import ObjectRegistry -from nemo.core.neural_types import NmTensorNameRegistry class AppState(metaclass=Singleton): From 307f550414cd29f4ba366bb89dca4e676d024d16 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 17:14:48 -0700 Subject: [PATCH 35/40] move nmtensor_registry Signed-off-by: Jason --- nemo/utils/__init__.py | 4 ++-- nemo/utils/app_state.py | 2 +- nemo/{core/neural_types => utils}/nmtensor_registry.py | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename nemo/{core/neural_types => utils}/nmtensor_registry.py (100%) diff --git a/nemo/utils/__init__.py b/nemo/utils/__init__.py index b9058a854c3c..15872561c92a 100644 --- a/nemo/utils/__init__.py +++ b/nemo/utils/__init__.py @@ -15,8 +15,8 @@ # limitations under the License. # ============================================================================= -from .nemo_logging import Logger as _Logger -from .nemo_logging import LogMode as logging_mode +from nemo.utils.nemo_logging import Logger as _Logger +from nemo.utils.nemo_logging import LogMode as logging_mode logging = _Logger() diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 22ffdf8fce2a..45c134ee9995 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -19,10 +19,10 @@ # Moreover, at that point nemo module doesn't contain "core", so during "python module registration" # nothing from nemo.core, including e.g. types (so we cannot use them for "python 3 type hints"). import nemo -from nemo.core.neural_types import NmTensorNameRegistry from nemo.utils.metaclasses import Singleton from nemo.utils.neural_graph.neural_graph_manager import NeuralGraphManager from nemo.utils.neural_graph.object_registry import ObjectRegistry +from nemo.utils.nmtensor_registry import NmTensorNameRegistry class AppState(metaclass=Singleton): diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/utils/nmtensor_registry.py similarity index 100% rename from nemo/core/neural_types/nmtensor_registry.py rename to nemo/utils/nmtensor_registry.py From 31fc556ddd14b721a9918c397ec36d0fcf6817ac Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 17:18:49 -0700 Subject: [PATCH 36/40] update tests Signed-off-by: Jason --- tests/unit/core/test_nemo_callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py index 21e1671eed19..a2e0bae39f03 100755 --- a/tests/unit/core/test_nemo_callbacks.py +++ b/tests/unit/core/test_nemo_callbacks.py @@ -114,7 +114,7 @@ def test_TensorboardLogger(self, clean_up, tmpdir): ) # efi.inspect("temp", tag="loss") - inspection_units = efi.get_inspection_units(logging_dir, "", "loss") + inspection_units = efi.get_inspection_units(str(logging_dir), "", "loss") # Make sure there is only 1 tensorboard file assert len(inspection_units) == 1 From b9e4441524ca7d8affaf4967a6b6a190bdbdd271 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 17:27:57 -0700 Subject: [PATCH 37/40] clean code for comments Signed-off-by: Jason --- examples/asr/jasper_an4.py | 1 - nemo/backends/pytorch/actions.py | 66 ++------------------------------ nemo/core/callbacks.py | 2 - nemo/utils/nemo_logging.py | 4 -- 4 files changed, 4 insertions(+), 69 deletions(-) diff --git a/examples/asr/jasper_an4.py 
b/examples/asr/jasper_an4.py index 40172008c9da..888d046ef936 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -238,7 +238,6 @@ def main(): # Delete old graph and make a new one del g0 nf.reset_trainer() - # [print(p) for p in nemo.utils.app_state.AppState().modules] loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf) nf.train( diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index d3e3261d5e55..95d3a9d1736b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -30,8 +30,6 @@ # these imports will happen on as-needed basis amp = None -# convert_syncbn = None -# create_syncbn_process_group = None LARC = None FusedLAMB = None FusedAdam = None @@ -63,16 +61,12 @@ def __init__( global amp amp = importlib.import_module('apex.amp') if local_rank is not None: - # global convert_syncbn - # global create_syncbn_process_group global LARC global FusedLAMB global FusedAdam global FusedNovoGrad parallel = importlib.import_module('apex.parallel') apex_optimizer = importlib.import_module('apex.optimizers') - # convert_syncbn = parallel.convert_syncbn_model - # create_syncbn_process_group = parallel.create_syncbn_process_group LARC = parallel.LARC FusedLAMB = apex_optimizer.FusedLAMB FusedAdam = apex_optimizer.FusedAdam @@ -150,12 +144,6 @@ def __get_top_sorted_modules_and_dataloader(self, hook: List[NmTensor]): "distributed mode. Please instantiate NeuralModuleFactory first and pass its instance as " "`factory` parameter to all your Neural Module objects.".format(str(m[0])) ) - # key = m[0].unique_instance_id - # if key not in self.module_reference_table: - # if isinstance(m[0], TrainableNeuralModuleWrapper): - # self.module_reference_table[key] = (m[0], m[0]._pt_module) - # else: - # self.module_reference_table[key] = (m[0], m[0]) return top_sorted_modules, tdataset @@ -349,18 +337,9 @@ def __nm_graph_forward_pass( if in_cache: continue call_args = call_chain[ind][1] - # module = call_chain[ind][0] - # pmodule = self.module_reference_table[m_id][1] m_id = call_chain[ind][0].unique_instance_id pmodule = self.ddp_module_dict[m_id] if self.ddp_initialized else call_chain[ind][0] - # if self._local_rank is not None: - # if isinstance(pmodule, DDP): - # if disable_allreduce: - # pmodule.disable_allreduce() - # else: - # pmodule.enable_allreduce() - if mode == OperationMode.training: # if module.is_trainable(): if isinstance(pmodule, nn.Module): @@ -374,14 +353,8 @@ def __nm_graph_forward_pass( # prepare call signature for `module` call_set = {} for tensor_name, nmtensor in call_args.items(): - # _add_uuid_2_name(nmtensor.name, nmtensor.producer._uuid) key = nmtensor.unique_name call_set[tensor_name] = registered_tensors[key] - # actual PyTorch module call with signature - # if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): - # new_tensors = pmodule(**call_set) - # else: - # new_tensors = pmodule(force_pt=True, **call_set) new_tensors = pmodule(force_pt=True, **call_set) if not isinstance(new_tensors, List): @@ -462,11 +435,6 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): assert dist.is_initialized() is_distributed = True world_size = torch.distributed.get_world_size() - # logging.info( - # "Doing distributed evaluation. 
Rank {0} of {1}".format( - # self.local_rank, world_size - # ) - # ) if dl_nm.dataset is not None: sampler = None @@ -638,11 +606,6 @@ def _infer( assert dist.is_initialized() is_distributed = True world_size = torch.distributed.get_world_size() - # logging.info( - # "Doing distributed evaluation. Rank {0} of {1}".format( - # self.local_rank, world_size - # ) - # ) if dl_nm.dataset is not None: sampler = None if not isinstance(dl_nm.dataset, torch.utils.data.IterableDataset): @@ -729,12 +692,6 @@ def _infer( use_cache=use_cache, ) - # if offload_to_cpu: - # # Take all cuda tensors and save them to value_dict as - # # cpu tensors to save GPU memory - # for name, tensor in registered_e_tensors.items(): - # if isinstance(tensor, torch.Tensor): - # registered_e_tensors[name] = tensor.cpu() if cache: self.append_to_cache(registered_e_tensors, offload_to_cpu) @@ -913,10 +870,10 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, dynamic_axes: defa module.eval() try: - # # Remove NeMo-related things from the module - # # We need to change __call__ method. Note that this will change the - # # whole class, not just this object! Which is why we need to repair it - # # in the finally block + # Remove NeMo-related things from the module + # We need to change __call__ method. Note that this will change the + # whole class, not just this object! Which is why we need to repair it + # in the finally block __orig_call__ = type(module).__call__ type(module).__call__ = torch.nn.Module.__call__ @@ -1313,10 +1270,6 @@ def save_state_to(self, path): dataNM = training_loop[0][2][0][0] placement_gpu = dataNM.placement == DeviceType.AllGpu if placement_gpu: - # if len(training_loop) > 1: - # raise NotImplementedError( - # "Distributed training does nor work with multiple " - # "optimizers") logging.info("Doing distributed training") if t_dataset is not None: train_sampler = None @@ -1341,12 +1294,6 @@ def save_state_to(self, path): else: train_sampler = None - # for train_iter in training_loop: - # call_chain = train_iter[2] - # for i in range(1, len(call_chain) - 1): - # key = call_chain[i][0].unique_instance_id - # pmodule = self.module_reference_table[key][1] - # num_trainable_weights = self.module_reference_table[key][1].num_weights self.ddp_initialized = True module_list = [mod.name for mod in AppState().modules] module_list = sorted(module_list) @@ -1356,11 +1303,6 @@ def save_state_to(self, path): num_trainable_weights = module.num_weights self.ddp_module_dict[key] = module if not isinstance(module, DDP) and isinstance(module, torch.nn.Module) and num_trainable_weights > 0: - # gpf = 1 - # if gradient_predivide: - # gpf = dist.get_world_size() - # pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Old Apex Method - # Per pytorch docs, convert sync bn prior to DDP if synced_batchnorm: world_size = dist.get_world_size() diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 4b9826e9b6c1..d79eb23536db 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -469,7 +469,6 @@ def __restore_from(self, path, state): try: trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) state.restore_state_from(trainer_checkpoints[0]) - # for tr, checkpoint in zip([self.action], trainer_checkpoints): except (ValueError) as e: logging.warning(e) logging.warning( @@ -891,7 +890,6 @@ def on_iteration_start(self): setattr(self.module, self.arg_name, value) if self.tb_writer is not None: class_name = self.module.__class__.__name__ - # name = f'param/{class_name}.{self.arg_name}' 
name = f"param/{class_name}.{self.arg_name}" self.tb_writer.add_scalar(name, value, self.step) else: diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index ee5cb0f6ee4d..7fed7ff0c5c3 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -366,7 +366,3 @@ def critical(self, msg, *args, mode=LogMode.EACH, **kwargs): and not self._logged_once(msg, mode) ): self._logger._log(Logger.CRITICAL, msg, args, **kwargs) - - -# # Necessary to catch the correct caller -# _logging._srcfile = os.path.normcase(inspect.getfile(Logger.__class__)) From c036084e24bc6504b841707785b32aa022f33367 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 28 May 2020 11:40:33 -0700 Subject: [PATCH 38/40] add back str_to_opt_level Signed-off-by: Jason --- nemo/core/neural_factory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 87de6e7ac3ac..40cfee69f838 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -80,6 +80,13 @@ class DeviceType(Enum): AllGpu = 3 +def _str_to_opt_level(opt_str: str) -> Optimization: + number = int(opt_str[1:]) + if number not in Optimization._value2member_map_: + raise ValueError(f"Unknown optimization value {opt_str}") + return Optimization(number) + + class NeuralModuleFactory(object): _DEFAULT = None From 1e429afcd934894134855249bef480de1eb555af Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 28 May 2020 11:52:17 -0700 Subject: [PATCH 39/40] split callbacks into two files; update error messages Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 4 +- nemo/core/callbacks.py | 483 +------------------------------ 2 files changed, 13 insertions(+), 474 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 95d3a9d1736b..76323fa8521b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1236,7 +1236,9 @@ def save_state_to(self, path): if callbacks is not None: for callback in callbacks: if not isinstance(callback, ActionCallback) and not isinstance(callback, NeMoCallback): - raise ValueError("A callback was received that was not a child of ActionCallback") + raise ValueError( + "A callback was received that was not a child of ActionCallback nor a child of NeMoCallback" + ) elif isinstance(callback, SimpleLossLoggerCallback): if logging_callchain: raise ValueError("We only support one logger callback but more than one were found") diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index d79eb23536db..d667b4130529 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -31,20 +31,23 @@ # "on_step_end", # ] -import datetime import glob import os -import sys import time -import warnings -from abc import ABC, abstractmethod -from collections import namedtuple +from abc import ABC from typing import Callable, List, Union -import nemo +from nemo.core.deprecated_callbacks import ( + ActionCallback, + EvaluatorCallback, + ModuleSaverCallback, + SimpleLossLoggerCallback, + UnfreezeCallback, + ValueSetterCallback, + WandbCallback, +) from nemo.utils import get_checkpoint_from_dir, logging from nemo.utils.app_state import AppState -from nemo.utils.decorators import deprecated try: import wandb @@ -508,469 +511,3 @@ def on_epoch_end(self, state): epoch = state["epoch"] if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: self.__save_to(self._folder, state) - - -class ActionCallback(ABC): - """Abstract interface for callbacks. 
- """ - - def __init__(self): - self._registered_tensors = {} - self._action = None - - @property - def step(self): - return self.action.step - - @property - def epoch_num(self): - return self.action.epoch_num - - @property - def registered_tensors(self): - return self._registered_tensors - - @property - def local_rank(self): - return self.action.local_rank - - @property - def global_rank(self): - return self.action.global_rank - - @property - def action(self): - return self._action - - @action.setter - def action(self, action_obj): - self._action = action_obj - - @property - def logger(self): - warnings.warn("This will be deprecated in future releases. Please use nemo.logging instead") - return nemo.logging - - def on_action_start(self): - pass - - def on_action_end(self): - pass - - def on_epoch_start(self): - pass - - def on_epoch_end(self): - pass - - def on_iteration_start(self): - pass - - def on_iteration_end(self): - pass - - -class ModuleSaverCallback(ActionCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__( - self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, - ): - super().__init__() - self._save_modules_list = save_modules_list - self._folder = folder - self._step_freq = step_freq - self._ckpt2keep = checkpoints_to_keep - self._saved_ckpts = [] - - def on_iteration_end(self): - step = self.step - if ( - self._step_freq > 0 - and step % self._step_freq == 0 - and step > 0 - and (self.global_rank is None or self.global_rank == 0) - ): - for m in self._save_modules_list: - class_name = m.__class__.__name__ - uid = m.unique_instance_id - fn = f"{class_name}_{uid}-STEP-{step}.pt" - if self._folder is None: - file_name = fn - else: - file_name = os.path.join(self._folder, fn) - logging.info(f"Saving module {class_name} in {file_name}") - m.save_to(file_name) - logging.info("Saved.") - self._saved_ckpts.append(f'-{self.step}.pt') - if len(self._saved_ckpts) > self._ckpt2keep: - for end in self._saved_ckpts[: -self._ckpt2keep]: - for file in glob.glob(f'{self._folder}/*{end}'): - os.remove(file) - self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] - - def on_action_end(self): - step = self.step - if self.global_rank is None or self.global_rank == 0: - for m in self._save_modules_list: - class_name = m.__class__.__name__ - uid = m.unique_instance_id - fn = f"{class_name}_{uid}-STEP-{step}.pt" - if self._folder is None: - file_name = fn - else: - file_name = os.path.join(self._folder, fn) - logging.info(f"Saving module {class_name} in {file_name}") - m.save_to(file_name) - logging.info("Saved.") - - -class SimpleLossLoggerCallback(ActionCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__( - self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, - ): - - super().__init__() - if not isinstance(tensors, list): - tensors = [tensors] - self._tensors = tensors - self._print_func = print_func - self._get_tb_values = get_tb_values - self._log_to_tb_func = log_to_tb_func - self._step_freq = step_freq - self._swriter = tb_writer - self._start_time = None - self._last_epoch_start = None - self._last_iter_start = None - - @property - def tensors(self): - return 
self._tensors - - def on_action_start(self): - if self.global_rank is None or self.global_rank == 0: - logging.info("Starting .....") - self._start_time = time.time() - - def on_action_end(self): - if self.global_rank is None or self.global_rank == 0: - if self._swriter is not None: - self._swriter.close() - delta = datetime.timedelta(seconds=(time.time() - self._start_time)) - logging.info("Done in %s", delta) - - def on_epoch_start(self): - if self.global_rank is None or self.global_rank == 0: - logging.info(f"Starting epoch {self.epoch_num}") - self._last_epoch_start = time.time() - - def on_epoch_end(self): - if self.global_rank is None or self.global_rank == 0: - step = self.step - - delta = datetime.timedelta(seconds=(time.time() - self._last_epoch_start)) - logging.info(f"Finished epoch {self.epoch_num} in {delta}") - - if self._swriter is not None: - value = self.epoch_num - self._swriter.add_scalar('misc/epoch', value, step) - value = time.time() - self._last_epoch_start - self._swriter.add_scalar('misc/epoch_time', value, step) - - def on_iteration_start(self): - if self.global_rank is None or self.global_rank == 0: - self._last_iter_start = time.time() - - def on_iteration_end(self): - if self.global_rank is None or self.global_rank == 0: - step = self.step - if step % self._step_freq == 0: - tensor_values = [self.registered_tensors[t.unique_name] for t in self.tensors] - logging.info(f"Step: {step}") - if self._print_func: - self._print_func(tensor_values) - sys.stdout.flush() - if self._swriter is not None: - if self._get_tb_values: - tb_objects = self._get_tb_values(tensor_values) - for name, value in tb_objects: - value = value.item() - self._swriter.add_scalar(name, value, step) - if self._log_to_tb_func: - self._log_to_tb_func(self._swriter, tensor_values, step) - run_time = time.time() - self._last_iter_start - self._swriter.add_scalar('misc/step_time', run_time, step) - run_time = time.time() - self._last_iter_start - logging.info(f"Step time: {run_time} seconds") - - -class EvaluatorCallback(ActionCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - def __init__( - self, - eval_tensors, - user_iter_callback, - user_epochs_done_callback, - tb_writer=None, - tb_writer_func=None, - eval_step=1, - eval_epoch=None, - wandb_name=None, - wandb_project=None, - eval_at_start=True, - ): - # TODO: Eval_epoch currently does nothing - if eval_step is None and eval_epoch is None: - raise ValueError("Either eval_step or eval_epoch must be set. " f"But got: {eval_step} and {eval_epoch}") - if (eval_step is not None and eval_step <= 0) or (eval_epoch is not None and eval_epoch <= 0): - raise ValueError(f"Eval_step and eval_epoch must be > 0." 
f"But got: {eval_step} and {eval_epoch}") - super().__init__() - self._eval_tensors = eval_tensors - self._swriter = tb_writer - self._tb_writer_func = tb_writer_func - self._eval_frequency = eval_step - self._eval_at_start = eval_at_start - # will be passed to callbacks below - self._global_var_dict = {} - - # Callbacks - self.user_iter_callback = user_iter_callback - self.user_done_callback = user_epochs_done_callback - - # Weights and biases - self._wandb_project = wandb_project - self._wandb_name = wandb_name - - @property - def eval_tensors(self): - return self._eval_tensors - - @property - def tb_writer_func(self): - return self._tb_writer_func - - @property - def swriter(self): - return self._swriter - - def on_epoch_end(self): - pass - - def on_iteration_end(self): - if self.step == 0 and not self._eval_at_start: - return - if self.step % self._eval_frequency == 0: - if self.global_rank == 0 or self.global_rank is None: - logging.info('Doing Evaluation ' + '.' * 30) - start_time = time.time() - self.action._eval(self._eval_tensors, self, self.step) - elapsed_time = time.time() - start_time - if self.global_rank == 0 or self.global_rank is None: - logging.info(f'Evaluation time: {elapsed_time} seconds') - - def on_action_start(self): - if self.global_rank is None or self.global_rank == 0: - if self._wandb_name is not None or self._wandb_project is not None: - if _WANDB_AVAILABLE and wandb.run is None: - wandb.init(name=self._wandb_name, project=self._wandb_project) - elif _WANDB_AVAILABLE and wandb.run is not None: - logging.info("Re-using wandb session") - else: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - logging.info("Will not log data to weights and biases.") - self._wandb_name = None - self._wandb_project = None - - def on_action_end(self): - step = self.step - if self.global_rank == 0 or self.global_rank is None: - logging.info('Final Evaluation ' + '.' 
* 30) - start_time = time.time() - self.action._eval(self._eval_tensors, self, step) - elapsed_time = time.time() - start_time - if self.global_rank == 0 or self.global_rank is None: - logging.info(f'Evaluation time: {elapsed_time} seconds') - - def clear_global_var_dict(self): - self._global_var_dict = {} - - def wandb_log(self, tensors_logged): - if self._wandb_name is not None and _WANDB_AVAILABLE: - wandb.log(tensors_logged, step=self.step) - - -_Policy = namedtuple('Policy', 'method start end') - - -class _Method(ABC): - """ Classes inherited from _Method are used for - ValueSetterCallback below - """ - - @abstractmethod - def __call__(self, step, total_steps): - pass - - -class _Const(_Method): - def __init__(self, value): - super().__init__() - - self.value = value - - def __call__(self, step, total_steps): - return self.value - - -class _Linear(_Method): - def __init__(self, a, b): - super().__init__() - self.a, self.b = a, b - - def __call__(self, step, total_steps): - return self.a + (step / (total_steps - 1)) * (self.b - self.a) - - -_Method.Const = _Const -_Method.Linear = _Linear - - -class ValueSetterCallback(ActionCallback): - Policy = _Policy - Method = _Method - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): - super().__init__() - - if policies is None: - initial_value = getattr(module, arg_name) - policies = [_Policy(method=Const(initial_value), start=0.0, end=1.0)] - - new_policies = [] - for p in policies: - start, end = p.start, p.end - if isinstance(start, float): - start = int(start * total_steps) - if isinstance(end, float): - end = int(end * total_steps) - new_policies.append(_Policy(p.method, start, end)) - policies = new_policies - assert policies[0].start == 0 - assert policies[-1].end == total_steps - - self.module = module - self.arg_name = arg_name - self.policies = policies - self.total_steps = total_steps - self.tb_writer = tb_writer - - self.cur_i = 0 - - def on_iteration_start(self): - cur_policy = self.policies[self.cur_i] - if self.step < cur_policy.end: - step = self.step - cur_policy.start - total_steps = cur_policy.end - cur_policy.start - value = cur_policy.method(step, total_steps) - setattr(self.module, self.arg_name, value) - if self.tb_writer is not None: - class_name = self.module.__class__.__name__ - name = f"param/{class_name}.{self.arg_name}" - self.tb_writer.add_scalar(name, value, self.step) - else: - self.cur_i += 1 - self.on_iteration_start() - - -class UnfreezeCallback(ActionCallback): - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__(self, modules, start_epoch=0): - super().__init__() - - self.modules = modules - self.start_epoch = start_epoch - - def on_iteration_start(self): - if self.epoch_num == self.start_epoch: - for m in self.modules: - m.unfreeze() - - -class WandbCallback(ActionCallback): - """ - Log metrics to [Weights & Biases](https://docs.wandb.com/) - """ - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__( - self, train_tensors=[], wandb_name=None, wandb_project=None, args=None, update_freq=25, - ): - """ - Args: - train_tensors: list of tensors to evaluate and log based on training batches - wandb_name: wandb experiment name - wandb_project: wandb project name - args: argparse flags - will be logged as hyperparameters - update_freq: frequency with which to log updates - """ - 
super().__init__() - - if not _WANDB_AVAILABLE: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - - self._update_freq = update_freq - self._train_tensors = train_tensors - self._name = wandb_name - self._project = wandb_project - self._args = args - - def on_action_start(self): - if self.global_rank is None or self.global_rank == 0: - if _WANDB_AVAILABLE and wandb.run is None: - wandb.init(name=self._name, project=self._project) - if self._args is not None: - wandb.config.update(self._args) - elif _WANDB_AVAILABLE and wandb.run is not None: - logging.info("Re-using wandb session") - else: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - logging.info("Will not log data to weights and biases.") - self._update_freq = -1 - - def on_iteration_end(self): - # log training metrics - if self.global_rank is None or self.global_rank == 0: - if self.step % self._update_freq == 0 and self._update_freq > 0: - tensors_logged = {t.name: self.registered_tensors[t.unique_name].cpu() for t in self._train_tensors} - # Always log learning rate - tensors_logged['LR'] = self.learning_rate - self.wandb_log(tensors_logged) - - def on_epoch_start(self): - if self.global_rank is None or self.global_rank == 0: - self._last_epoch_start = time.time() - - def on_epoch_end(self): - if self.global_rank is None or self.global_rank == 0: - # always log epoch num and epoch_time - epoch_time = time.time() - self._last_epoch_start - self.wandb_log({"epoch": self.epoch_num, "epoch_time": epoch_time}) - - def wandb_log(self, tensors_logged): - if _WANDB_AVAILABLE: - wandb.log(tensors_logged, step=self.step) From fdae1f35b28608ed941321e9d32ed110d6e43ac9 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 28 May 2020 12:03:49 -0700 Subject: [PATCH 40/40] add deprecated callbacks files Signed-off-by: Jason --- nemo/core/deprecated_callbacks.py | 509 ++++++++++++++++++++++++++++++ 1 file changed, 509 insertions(+) create mode 100755 nemo/core/deprecated_callbacks.py diff --git a/nemo/core/deprecated_callbacks.py b/nemo/core/deprecated_callbacks.py new file mode 100755 index 000000000000..a0c7608f2d58 --- /dev/null +++ b/nemo/core/deprecated_callbacks.py @@ -0,0 +1,509 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + "ActionCallback", + "ModuleSaverCallback", + "SimpleLossLoggerCallback", + "EvaluatorCallback", + "ValueSetterCallback", + "UnfreezeCallback", + "WandbCallback", +] + +import datetime +import glob +import os +import sys +import time +from abc import ABC, abstractmethod +from collections import namedtuple + +from nemo.utils import logging +from nemo.utils.decorators import deprecated + +try: + import wandb + + _WANDB_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + _WANDB_AVAILABLE = False + + +class ActionCallback(ABC): + """Abstract interface for callbacks. 
+ """ + + def __init__(self): + self._registered_tensors = {} + self._action = None + + @property + def step(self): + return self.action.step + + @property + def epoch_num(self): + return self.action.epoch_num + + @property + def registered_tensors(self): + return self._registered_tensors + + @property + def local_rank(self): + return self.action.local_rank + + @property + def global_rank(self): + return self.action.global_rank + + @property + def action(self): + return self._action + + @action.setter + def action(self, action_obj): + self._action = action_obj + + @property + def logger(self): + return logging + + def on_action_start(self): + pass + + def on_action_end(self): + pass + + def on_epoch_start(self): + pass + + def on_epoch_end(self): + pass + + def on_iteration_start(self): + pass + + def on_iteration_end(self): + pass + + +class ModuleSaverCallback(ActionCallback): + """ + For callback documentation: please see + https://nvidia.github.io/NeMo/tutorials/callbacks.html + """ + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__( + self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, + ): + super().__init__() + self._save_modules_list = save_modules_list + self._folder = folder + self._step_freq = step_freq + self._ckpt2keep = checkpoints_to_keep + self._saved_ckpts = [] + + def on_iteration_end(self): + step = self.step + if ( + self._step_freq > 0 + and step % self._step_freq == 0 + and step > 0 + and (self.global_rank is None or self.global_rank == 0) + ): + for m in self._save_modules_list: + class_name = m.__class__.__name__ + uid = m.unique_instance_id + fn = f"{class_name}_{uid}-STEP-{step}.pt" + if self._folder is None: + file_name = fn + else: + file_name = os.path.join(self._folder, fn) + logging.info(f"Saving module {class_name} in {file_name}") + m.save_to(file_name) + logging.info("Saved.") + self._saved_ckpts.append(f'-{self.step}.pt') + if len(self._saved_ckpts) > self._ckpt2keep: + for end in self._saved_ckpts[: -self._ckpt2keep]: + for file in glob.glob(f'{self._folder}/*{end}'): + os.remove(file) + self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] + + def on_action_end(self): + step = self.step + if self.global_rank is None or self.global_rank == 0: + for m in self._save_modules_list: + class_name = m.__class__.__name__ + uid = m.unique_instance_id + fn = f"{class_name}_{uid}-STEP-{step}.pt" + if self._folder is None: + file_name = fn + else: + file_name = os.path.join(self._folder, fn) + logging.info(f"Saving module {class_name} in {file_name}") + m.save_to(file_name) + logging.info("Saved.") + + +class SimpleLossLoggerCallback(ActionCallback): + """ + For callback documentation: please see + https://nvidia.github.io/NeMo/tutorials/callbacks.html + """ + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__( + self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, + ): + + super().__init__() + if not isinstance(tensors, list): + tensors = [tensors] + self._tensors = tensors + self._print_func = print_func + self._get_tb_values = get_tb_values + self._log_to_tb_func = log_to_tb_func + self._step_freq = step_freq + self._swriter = tb_writer + self._start_time = None + self._last_epoch_start = None + self._last_iter_start = None + + @property + def tensors(self): + return self._tensors + + def on_action_start(self): + if self.global_rank is None or self.global_rank == 0: + 
logging.info("Starting .....") + self._start_time = time.time() + + def on_action_end(self): + if self.global_rank is None or self.global_rank == 0: + if self._swriter is not None: + self._swriter.close() + delta = datetime.timedelta(seconds=(time.time() - self._start_time)) + logging.info("Done in %s", delta) + + def on_epoch_start(self): + if self.global_rank is None or self.global_rank == 0: + logging.info(f"Starting epoch {self.epoch_num}") + self._last_epoch_start = time.time() + + def on_epoch_end(self): + if self.global_rank is None or self.global_rank == 0: + step = self.step + + delta = datetime.timedelta(seconds=(time.time() - self._last_epoch_start)) + logging.info(f"Finished epoch {self.epoch_num} in {delta}") + + if self._swriter is not None: + value = self.epoch_num + self._swriter.add_scalar('misc/epoch', value, step) + value = time.time() - self._last_epoch_start + self._swriter.add_scalar('misc/epoch_time', value, step) + + def on_iteration_start(self): + if self.global_rank is None or self.global_rank == 0: + self._last_iter_start = time.time() + + def on_iteration_end(self): + if self.global_rank is None or self.global_rank == 0: + step = self.step + if step % self._step_freq == 0: + tensor_values = [self.registered_tensors[t.unique_name] for t in self.tensors] + logging.info(f"Step: {step}") + if self._print_func: + self._print_func(tensor_values) + sys.stdout.flush() + if self._swriter is not None: + if self._get_tb_values: + tb_objects = self._get_tb_values(tensor_values) + for name, value in tb_objects: + value = value.item() + self._swriter.add_scalar(name, value, step) + if self._log_to_tb_func: + self._log_to_tb_func(self._swriter, tensor_values, step) + run_time = time.time() - self._last_iter_start + self._swriter.add_scalar('misc/step_time', run_time, step) + run_time = time.time() - self._last_iter_start + logging.info(f"Step time: {run_time} seconds") + + +class EvaluatorCallback(ActionCallback): + """ + For callback documentation: please see + https://nvidia.github.io/NeMo/tutorials/callbacks.html + """ + + def __init__( + self, + eval_tensors, + user_iter_callback, + user_epochs_done_callback, + tb_writer=None, + tb_writer_func=None, + eval_step=1, + eval_epoch=None, + wandb_name=None, + wandb_project=None, + eval_at_start=True, + ): + # TODO: Eval_epoch currently does nothing + if eval_step is None and eval_epoch is None: + raise ValueError("Either eval_step or eval_epoch must be set. " f"But got: {eval_step} and {eval_epoch}") + if (eval_step is not None and eval_step <= 0) or (eval_epoch is not None and eval_epoch <= 0): + raise ValueError(f"Eval_step and eval_epoch must be > 0." 
f"But got: {eval_step} and {eval_epoch}") + super().__init__() + self._eval_tensors = eval_tensors + self._swriter = tb_writer + self._tb_writer_func = tb_writer_func + self._eval_frequency = eval_step + self._eval_at_start = eval_at_start + # will be passed to callbacks below + self._global_var_dict = {} + + # Callbacks + self.user_iter_callback = user_iter_callback + self.user_done_callback = user_epochs_done_callback + + # Weights and biases + self._wandb_project = wandb_project + self._wandb_name = wandb_name + + @property + def eval_tensors(self): + return self._eval_tensors + + @property + def tb_writer_func(self): + return self._tb_writer_func + + @property + def swriter(self): + return self._swriter + + def on_epoch_end(self): + pass + + def on_iteration_end(self): + if self.step == 0 and not self._eval_at_start: + return + if self.step % self._eval_frequency == 0: + if self.global_rank == 0 or self.global_rank is None: + logging.info('Doing Evaluation ' + '.' * 30) + start_time = time.time() + self.action._eval(self._eval_tensors, self, self.step) + elapsed_time = time.time() - start_time + if self.global_rank == 0 or self.global_rank is None: + logging.info(f'Evaluation time: {elapsed_time} seconds') + + def on_action_start(self): + if self.global_rank is None or self.global_rank == 0: + if self._wandb_name is not None or self._wandb_project is not None: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._wandb_name, project=self._wandb_project) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._wandb_name = None + self._wandb_project = None + + def on_action_end(self): + step = self.step + if self.global_rank == 0 or self.global_rank is None: + logging.info('Final Evaluation ' + '.' 
* 30) + start_time = time.time() + self.action._eval(self._eval_tensors, self, step) + elapsed_time = time.time() - start_time + if self.global_rank == 0 or self.global_rank is None: + logging.info(f'Evaluation time: {elapsed_time} seconds') + + def clear_global_var_dict(self): + self._global_var_dict = {} + + def wandb_log(self, tensors_logged): + if self._wandb_name is not None and _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=self.step) + + +_Policy = namedtuple('Policy', 'method start end') + + +class _Method(ABC): + """ Classes inherited from _Method are used for + ValueSetterCallback below + """ + + @abstractmethod + def __call__(self, step, total_steps): + pass + + +class _Const(_Method): + def __init__(self, value): + super().__init__() + + self.value = value + + def __call__(self, step, total_steps): + return self.value + + +class _Linear(_Method): + def __init__(self, a, b): + super().__init__() + self.a, self.b = a, b + + def __call__(self, step, total_steps): + return self.a + (step / (total_steps - 1)) * (self.b - self.a) + + +_Method.Const = _Const +_Method.Linear = _Linear + + +class ValueSetterCallback(ActionCallback): + Policy = _Policy + Method = _Method + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): + super().__init__() + + if policies is None: + initial_value = getattr(module, arg_name) + policies = [_Policy(method=Const(initial_value), start=0.0, end=1.0)] + + new_policies = [] + for p in policies: + start, end = p.start, p.end + if isinstance(start, float): + start = int(start * total_steps) + if isinstance(end, float): + end = int(end * total_steps) + new_policies.append(_Policy(p.method, start, end)) + policies = new_policies + assert policies[0].start == 0 + assert policies[-1].end == total_steps + + self.module = module + self.arg_name = arg_name + self.policies = policies + self.total_steps = total_steps + self.tb_writer = tb_writer + + self.cur_i = 0 + + def on_iteration_start(self): + cur_policy = self.policies[self.cur_i] + if self.step < cur_policy.end: + step = self.step - cur_policy.start + total_steps = cur_policy.end - cur_policy.start + value = cur_policy.method(step, total_steps) + setattr(self.module, self.arg_name, value) + if self.tb_writer is not None: + class_name = self.module.__class__.__name__ + name = f"param/{class_name}.{self.arg_name}" + self.tb_writer.add_scalar(name, value, self.step) + else: + self.cur_i += 1 + self.on_iteration_start() + + +class UnfreezeCallback(ActionCallback): + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__(self, modules, start_epoch=0): + super().__init__() + + self.modules = modules + self.start_epoch = start_epoch + + def on_iteration_start(self): + if self.epoch_num == self.start_epoch: + for m in self.modules: + m.unfreeze() + + +class WandbCallback(ActionCallback): + """ + Log metrics to [Weights & Biases](https://docs.wandb.com/) + """ + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__( + self, train_tensors=[], wandb_name=None, wandb_project=None, args=None, update_freq=25, + ): + """ + Args: + train_tensors: list of tensors to evaluate and log based on training batches + wandb_name: wandb experiment name + wandb_project: wandb project name + args: argparse flags - will be logged as hyperparameters + update_freq: frequency with which to log updates + """ + 
super().__init__() + + if not _WANDB_AVAILABLE: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + + self._update_freq = update_freq + self._train_tensors = train_tensors + self._name = wandb_name + self._project = wandb_project + self._args = args + + def on_action_start(self): + if self.global_rank is None or self.global_rank == 0: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._name, project=self._project) + if self._args is not None: + wandb.config.update(self._args) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._update_freq = -1 + + def on_iteration_end(self): + # log training metrics + if self.global_rank is None or self.global_rank == 0: + if self.step % self._update_freq == 0 and self._update_freq > 0: + tensors_logged = {t.name: self.registered_tensors[t.unique_name].cpu() for t in self._train_tensors} + # Always log learning rate + tensors_logged['LR'] = self.learning_rate + self.wandb_log(tensors_logged) + + def on_epoch_start(self): + if self.global_rank is None or self.global_rank == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self): + if self.global_rank is None or self.global_rank == 0: + # always log epoch num and epoch_time + epoch_time = time.time() - self._last_epoch_start + self.wandb_log({"epoch": self.epoch_num, "epoch_time": epoch_time}) + + def wandb_log(self, tensors_logged): + if _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=self.step)
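
For reference, a minimal usage sketch for the deprecated callback interface preserved in the new nemo/core/deprecated_callbacks.py above. This is illustrative only: `loss` stands for a hypothetical NmTensor produced by a training graph (for example the CTC loss built in create_dags), and the constructor arguments simply mirror the SimpleLossLoggerCallback signature shown in the file; nothing here is part of the patch itself.

    from nemo.core.deprecated_callbacks import SimpleLossLoggerCallback
    from nemo.utils import logging

    # `loss` is a placeholder for an NmTensor returned by a training DAG;
    # it is not defined in this sketch.
    train_callback = SimpleLossLoggerCallback(
        tensors=[loss],
        # print_func receives the evaluated tensor values in the same order as `tensors`
        print_func=lambda values: logging.info("Loss: %s", values[0].item()),
        # get_tb_values must return (name, tensor) pairs; .item() is called on each value
        # before it is written to TensorBoard by on_iteration_end
        get_tb_values=lambda values: [["loss", values[0]]],
        step_freq=25,
    )

New code written against this branch is expected to subclass the state-based NeMoCallback kept in nemo/core/callbacks.py instead (for example overriding on_epoch_end(self, state) and reading entries such as state["epoch"], as the retained checkpointing callback does); the ActionCallback classes moved into deprecated_callbacks.py carry @deprecated(version="0.12") markers and are kept only for backward compatibility.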