Merge pull request #615 from blisc/U_callbacks_4
Callback PR Rev 3
okuchaiev authored May 28, 2020
2 parents 5d1527a + fdae1f3 commit 52449a4
Showing 16 changed files with 1,900 additions and 1,098 deletions.
9 changes: 5 additions & 4 deletions CHANGELOG.md
@@ -83,6 +83,7 @@ To release a new version, please update the changelog as follows:

### Changed
- Syncs across workers at each step to check for NaN or inf loss. Terminates all workers if `stop_on_nan_loss` is set (as before), lets Apex deal with it if the apex.amp optimization level is O1 or higher, and skips the step across workers otherwise; a minimal sketch of this check appears below the list. ([PR #637](https://github.com/NVIDIA/NeMo/pull/637)) - @redoctopus
- Updated the callback system. Old callbacks will be deprecated in version 0.12. ([PR #615](https://github.com/NVIDIA/NeMo/pull/615)) - @blisc

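The check described in the first entry above can be pictured roughly as follows. This is an illustrative sketch in plain PyTorch, not the actual NeMo code; the `stop_on_nan_loss` flag and the amp-level handling simply mirror the wording of the entry.

import torch
import torch.distributed as dist


def loss_step_is_usable(loss, stop_on_nan_loss, amp_opt_level="O0"):
    # Flag is 1.0 if this worker's loss is NaN or inf, 0.0 otherwise.
    bad = torch.tensor(float(not torch.isfinite(loss).all()), device=loss.device)
    if dist.is_available() and dist.is_initialized():
        # Sync across workers: if any worker saw a bad loss, every worker sees flag == 1.0.
        dist.all_reduce(bad, op=dist.ReduceOp.MAX)
    if bad.item() == 0.0:
        return True  # loss is finite everywhere; take the optimizer step
    if stop_on_nan_loss:
        raise ValueError("Loss is NaN or inf - terminating all workers")
    if amp_opt_level != "O0":
        return True  # O1 or higher: let apex.amp's loss scaling deal with it
    return False  # otherwise skip this step on all workers
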
### Dependencies Update

@@ -123,7 +124,7 @@ files, along with unit tests, examples and tutorials
([PR #375](https://github.com/NVIDIA/NeMo/pull/375)) - @titu1994

### Changed
- Refactoring of `nemo_nlp` collections:
([PR #368](https://github.com/NVIDIA/NeMo/pull/368)) - @VahidooX, @yzhang123, @ekmb
- renaming and restructuring of files, folder, and functions in `nemo_nlp`
- losses cleaned up. LossAggregatorNM moved to nemo/backends/pytorch/common/losses
@@ -138,7 +139,7 @@ files, along with unit tests, examples and tutorials
([PR #284](https://github.com/NVIDIA/NeMo/pull/284)) - @stasbel
- NeMo is no longer using pep8 code style rules. Code style rules are now enforced with `isort` and `black` incorporated into CI checks.
([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel
- Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of initial parameters (init_params).
([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia
- Updated NeMo's use of the logging library. `from nemo import logging` is now the recommended way of using the NeMo logger. `neural_factory.logger` and all other instances of logger are now deprecated and planned for removal in the next version. Please see PR #267 for complete change information.
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc
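
A minimal usage example of the pattern described in that entry (the message text is arbitrary):

from nemo import logging

logging.info("Starting training")  # preferred over the deprecated neural_factory.logger
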
@@ -147,7 +148,7 @@ files, along with unit tests, examples and tutorials

- Added TRADE (dialogue state tracking model) on MultiWOZ dataset
([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX
- Question answering:
([PR #390](https://github.com/NVIDIA/NeMo/pull/390)) - @yzhang123
- Changed question answering task to use Roberta and Albert as alternative backends to Bert
- Added inference mode that does not require ground truth labels
@@ -158,7 +159,7 @@ files, along with unit tests, examples and tutorials
### Deprecated

### Fixed
- Critical fix of the training action on CPU
([PR #308](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia
- Fixed issue in Tacotron 2 prenet
([PR #444](https://github.com/NVIDIA/NeMo/pull/444)) - @blisc
117 changes: 59 additions & 58 deletions examples/asr/jasper_an4.py
@@ -17,64 +17,68 @@
process_evaluation_epoch,
word_error_rate,
)
+from nemo.core import NeuralGraph
from nemo.utils import logging
from nemo.utils.lr_policies import CosineAnnealing


def create_dags(model_config_file, vocab, args, nf):

-    # Create a data_layer for training.
-    data_layer = nemo_asr.AudioToTextDataLayer.import_from_config(
-        model_config_file,
-        "AudioToTextDataLayer_train",
-        overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size},
-    )
+    with NeuralGraph() as g0:
+        # Create a data_layer for training.
+        data_layer = nemo_asr.AudioToTextDataLayer.import_from_config(
+            model_config_file,
+            "AudioToTextDataLayer_train",
+            overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size},
+        )

-    num_samples = len(data_layer)
-    steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size))
-    total_steps = steps_per_epoch * args.num_epochs
-    logging.info("Train samples=", num_samples, "num_steps=", total_steps)
+        num_samples = len(data_layer)
+        steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size))
+        total_steps = steps_per_epoch * args.num_epochs
+        logging.info("Train samples=", num_samples, "num_steps=", total_steps)

-    # Create a data_layer for evaluation.
-    data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config(
-        model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets},
-    )
+        # Create a data_layer for evaluation.
+        data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config(
+            model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets},
+        )

-    num_samples = len(data_layer_eval)
-    logging.info(f"Eval samples={num_samples}")
+        num_samples = len(data_layer_eval)
+        logging.info(f"Eval samples={num_samples}")

-    # Instantiate data processor.
-    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config(
-        model_config_file, "AudioToMelSpectrogramPreprocessor"
-    )
+        # Instantiate data processor.
+        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config(
+            model_config_file, "AudioToMelSpectrogramPreprocessor"
+        )

-    # Instantiate JASPER encoder-decoder modules.
-    jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder")
-    jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config(
-        model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)}
-    )
+        # Instantiate JASPER encoder-decoder modules.
+        jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder")
+        jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config(
+            model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)}
+        )

-    # Instantiate losses.
-    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
-    greedy_decoder = nemo_asr.GreedyCTCDecoder()
-
-    # Create a training graph.
-    audio, audio_len, transcript, transcript_len = data_layer()
-    processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len)
-    encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len)
-    log_probs = jasper_decoder(encoder_output=encoded)
-    predictions = greedy_decoder(log_probs=log_probs)
-    loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,)
-
-    # Create an evaluation graph.
-    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
-    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e)
-    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e)
-    log_probs_e = jasper_decoder(encoder_output=encoded_e)
-    predictions_e = greedy_decoder(log_probs=log_probs_e)
-    loss_e = ctc_loss(
-        log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
-    )
+        # Instantiate losses.
+        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
+        greedy_decoder = nemo_asr.GreedyCTCDecoder()
+
+        # Create a training graph.
+        audio, audio_len, transcript, transcript_len = data_layer()
+        processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len)
+        encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len)
+        log_probs = jasper_decoder(encoder_output=encoded)
+        predictions = greedy_decoder(log_probs=log_probs)
+        loss = ctc_loss(
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
+        )
+
+        # Create an evaluation graph.
+        audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
+        processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e)
+        encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e)
+        log_probs_e = jasper_decoder(encoder_output=encoded_e)
+        predictions_e = greedy_decoder(log_probs=log_probs_e)
+        loss_e = ctc_loss(
+            log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
+        )
logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights))

# Callbacks to print info to console and Tensorboard.
@@ -99,14 +103,7 @@ def create_dags(model_config_file, vocab, args, nf):
callbacks = [train_callback, checkpointer_callback, eval_callback]

# Return entities required by the actual training.
-    return (
-        loss,
-        eval_tensors,
-        callbacks,
-        total_steps,
-        log_probs_e,
-        encoded_len_e,
-    )
+    return (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e, g0)


def main():
@@ -166,7 +163,7 @@ def main():
# Get vocabulary.
vocab = jasper_params['labels']

-    (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags(
+    (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e, g0) = create_dags(
args.model_config, vocab, args, nf
)

@@ -232,13 +229,17 @@ def main():
folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True,
)

-    # Distributed Data Parallel changes the underlying class so we need
-    # to reinstantiate Encoder and Decoder
    args.num_epochs += 10
    previous_step_count = total_steps
-    loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf)

+    # Distributed Data Parallel and amp change the underlying class, so we need to reinstantiate modules.
+    # Clear the module registry.
+    nemo.utils.app_state.AppState().modules.clear()
+    # Delete the old graph and make a new one.
+    del g0
+    nf.reset_trainer()
+    loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf)

nf.train(
tensors_to_optimize=[loss],
callbacks=callbacks,
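
The comment in the last hunk ("Distributed Data Parallel and amp change the underlying class...") points at why the graph and modules are rebuilt: these wrappers replace a module's class, so references held to the original, unwrapped objects go stale. A minimal CPU-only illustration of the same effect, using torch.nn.DataParallel because it needs no process group (DistributedDataParallel and apex.amp rewrap modules in the same spirit):

import torch.nn as nn

net = nn.Linear(4, 2)
print(type(net).__name__)       # -> Linear

wrapped = nn.DataParallel(net)  # DDP and amp wrap modules in a comparable way
print(type(wrapped).__name__)   # -> DataParallel; old references to `net` bypass the wrapper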