diff --git a/Jenkinsfile b/Jenkinsfile
index e8367874d18d..9fb03e64e33a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -14,9 +14,14 @@ pipeline {
         sh 'python -c "import torch; print(torch.__version__)"'
       }
     }
+    stage('Code formatting checks') {
+      steps {
+        sh 'python setup.py check_style'
+      }
+    }
     stage('PEP8 Checks') {
       steps {
-        sh 'pycodestyle . --exclude=./tests/other/jasper.py,./tests/other/jasper_zero_dl.py,setup.py,./nemo/collections/nlp/utils/metrics/sacrebleu.py,./docs/sources/source/conf.py,./collections/nemo_nlp/build,./tests/test_squad.py,./nemo/package_info.py,./examples/asr/jasper_aishell_infer.py,./examples/asr/jasper_eval.py,./examples/nlp/asr_postprocessor.py,./examples/nlp/sentence_classification_with_bert.py,./examples/nlp/transformer_lm.py'
+        sh 'pycodestyle . --max-line-length=119 --exclude=./tests/other/jasper.py,./tests/other/jasper_zero_dl.py,setup.py,./nemo/collections/nlp/utils/metrics/sacrebleu.py,./docs/sources/source/conf.py,./collections/nemo_nlp/build,./tests/test_squad.py,./nemo/package_info.py,./examples/asr/jasper_aishell_infer.py,./examples/asr/jasper_eval.py,./examples/nlp/asr_postprocessor.py,./examples/nlp/sentence_classification_with_bert.py,./examples/nlp/transformer_lm.py'
       }
     }
diff --git a/docs/docs_zh/sources/source/conf.py b/docs/docs_zh/sources/source/conf.py
index 2ddae2e3a453..faa8502da823 100644
--- a/docs/docs_zh/sources/source/conf.py
+++ b/docs/docs_zh/sources/source/conf.py
@@ -20,6 +20,7 @@
 import os
 import sys
 from unittest.mock import MagicMock
+
 import nemo
 
 sys.path.insert(0, os.path.abspath("."))
@@ -49,13 +50,20 @@ def __getattr__(cls, name):
 
 
 # ---- Mocking up the python modules. -----
-MOCK_MODULES = ['torch', 'torch.nn', 'torch.utils', 'torch.optim',
-                'torch.utils.data', 'torch.utils.data.sampler',
-                'torchvision', 'torchvision.models',
-                'torchtext',
-                'h5py', 'kaldi_io',
-                'transformers'
-                ]
+MOCK_MODULES = [
+    'torch',
+    'torch.nn',
+    'torch.utils',
+    'torch.optim',
+    'torch.utils.data',
+    'torch.utils.data.sampler',
+    'torchvision',
+    'torchvision.models',
+    'torchtext',
+    'h5py',
+    'kaldi_io',
+    'transformers',
+]
 
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
@@ -163,12 +171,7 @@ def __getattr__(cls, name):
 #
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
-html_sidebars = {
-    "**": [
-        "relations.html",  # needs 'show_related': True theme option to display
-        "searchbox.html",
-    ]
-}
+html_sidebars = {"**": ["relations.html", "searchbox.html",]}  # needs 'show_related': True theme option to display
 
 html_theme_options = {
     "canonical_url": "",
@@ -207,10 +210,7 @@ def __getattr__(cls, name):
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 # author, documentclass [howto, manual, or own class]).
-latex_documents = [
-    (master_doc, "nemo.tex", "nemo Documentation", "AI App Design team",
-     "manual")
-]
+latex_documents = [(master_doc, "nemo.tex", "nemo Documentation", "AI App Design team", "manual",)]
 
 # -- Options for manual page output ---------------------------------------
@@ -224,13 +224,5 @@ def __getattr__(cls, name):
 # (source start file, target name, title, author,
 # dir menu entry, description, category)
 texinfo_documents = [
-    (
-        master_doc,
-        "nemo",
-        "nemo Documentation",
-        author,
-        "nemo",
-        "One line description of project.",
-        "Miscellaneous",
-    )
+    (master_doc, "nemo", "nemo Documentation", author, "nemo", "One line description of project.", "Miscellaneous",)
 ]
diff --git a/docs/sources/source/conf.py b/docs/sources/source/conf.py
index 5783ea882493..387a29c91d40 100644
--- a/docs/sources/source/conf.py
+++ b/docs/sources/source/conf.py
@@ -13,14 +13,15 @@
 # All configuration values have a default; values that are commented out
 # infer to show the default.
 
+import os
+import sys
+from unittest.mock import MagicMock
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
 import nemo
-import os
-import sys
-from unittest.mock import MagicMock
 
 sys.path.insert(0, os.path.abspath("."))
 sys.path.insert(0, os.path.abspath("../../../"))
@@ -52,13 +53,20 @@ def __getattr__(cls, name):
 
 
 # ---- Mocking up the python modules. -----
-MOCK_MODULES = ['torch', 'torch.nn', 'torch.utils', 'torch.optim',
-                'torch.utils.data', 'torch.utils.data.sampler',
-                'torchvision', 'torchvision.models',
-                'torchtext',
-                'h5py', 'kaldi_io',
-                'transformers'
-                ]
+MOCK_MODULES = [
+    'torch',
+    'torch.nn',
+    'torch.utils',
+    'torch.optim',
+    'torch.utils.data',
+    'torch.utils.data.sampler',
+    'torchvision',
+    'torchvision.models',
+    'torchtext',
+    'h5py',
+    'kaldi_io',
+    'transformers',
+]
 
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
@@ -84,8 +92,8 @@ def __getattr__(cls, name):
 ]
 
 
-locale_dirs = ['locale/'] # path is example but recommended.
-gettext_compact = False # optional.
+locale_dirs = ['locale/']  # path is example but recommended.
+gettext_compact = False  # optional.
 
 
 # Add any paths that contain templates here, relative to this directory.
@@ -171,12 +179,7 @@ def __getattr__(cls, name):
 #
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
-html_sidebars = {
-    "**": [
-        "relations.html",  # needs 'show_related': True theme option to display
-        "searchbox.html",
-    ]
-}
+html_sidebars = {"**": ["relations.html", "searchbox.html",]}  # needs 'show_related': True theme option to display
 
 html_theme_options = {
     "canonical_url": "",
@@ -215,10 +218,7 @@ def __getattr__(cls, name):
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 # author, documentclass [howto, manual, or own class]).
-latex_documents = [ - (master_doc, "nemo.tex", "nemo Documentation", "AI App Design team", - "manual") -] +latex_documents = [(master_doc, "nemo.tex", "nemo Documentation", "AI App Design team", "manual",)] # -- Options for manual page output --------------------------------------- @@ -232,13 +232,5 @@ def __getattr__(cls, name): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ( - master_doc, - "nemo", - "nemo Documentation", - author, - "nemo", - "One line description of project.", - "Miscellaneous", - ) + (master_doc, "nemo", "nemo Documentation", author, "nemo", "One line description of project.", "Miscellaneous",) ] diff --git a/examples/applications/asr_service/app/__init__.py b/examples/applications/asr_service/app/__init__.py index 5f33432b5923..c2a94bd2126f 100644 --- a/examples/applications/asr_service/app/__init__.py +++ b/examples/applications/asr_service/app/__init__.py @@ -1,10 +1,12 @@ # Copyright (c) 2019 NVIDIA Corporation import os -from flask import Flask + from ruamel.yaml import YAML import nemo import nemo.collections.asr as nemo_asr +from app import routes # noqa +from flask import Flask app = Flask(__name__) # make sure WORK_DIR exists before calling your service @@ -28,34 +30,25 @@ # Instantiate necessary Neural Modules # Note that data layer is missing from here -neural_factory = nemo.core.NeuralModuleFactory( - placement=nemo.core.DeviceType.GPU, - backend=nemo.core.Backend.PyTorch) -data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - factory=neural_factory) +neural_factory = nemo.core.NeuralModuleFactory(placement=nemo.core.DeviceType.GPU, backend=nemo.core.Backend.PyTorch) +data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(factory=neural_factory) jasper_encoder = nemo_asr.JasperEncoder( jasper=jasper_model_definition['JasperEncoder']['jasper'], activation=jasper_model_definition['JasperEncoder']['activation'], - feat_in=jasper_model_definition[ - 'AudioToMelSpectrogramPreprocessor']['features']) + feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'], +) jasper_encoder.restore_from(CHECKPOINT_ENCODER, local_rank=0) -jasper_decoder = nemo_asr.JasperDecoderForCTC( - feat_in=1024, - num_classes=len(labels)) +jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels)) jasper_decoder.restore_from(CHECKPOINT_DECODER, local_rank=0) greedy_decoder = nemo_asr.GreedyCTCDecoder() if ENABLE_NGRAM and os.path.isfile(LM_PATH): beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( - vocab=labels, - beam_width=64, - alpha=2.0, - beta=1.0, - lm_path=LM_PATH, - num_cpus=max(os.cpu_count(), 1)) + vocab=labels, beam_width=64, alpha=2.0, beta=1.0, lm_path=LM_PATH, num_cpus=max(os.cpu_count(), 1), + ) else: print("Beam search is not enabled") -from app import routes # noqa + if __name__ == '__main__': app.run() diff --git a/examples/applications/asr_service/app/routes.py b/examples/applications/asr_service/app/routes.py index 3ad1150d9cff..50392024d4c7 100644 --- a/examples/applications/asr_service/app/routes.py +++ b/examples/applications/asr_service/app/routes.py @@ -3,47 +3,50 @@ import os import time +import nemo +import nemo.collections.asr as nemo_asr +from app import ( + ENABLE_NGRAM, + MODEL_YAML, + WORK_DIR, + app, + data_preprocessor, + greedy_decoder, + jasper_decoder, + jasper_encoder, + neural_factory, +) from flask import request from werkzeug.utils import secure_filename -from app import app, data_preprocessor, 
jasper_encoder, jasper_decoder, \ - greedy_decoder, neural_factory, MODEL_YAML, WORK_DIR, ENABLE_NGRAM try: from app import beam_search_with_lm except ImportError: print("Not using Beam Search Decoder with LM") ENABLE_NGRAM = False -import nemo -import nemo.collections.asr as nemo_asr def wav_to_text(manifest, greedy=True): from ruamel.yaml import YAML + yaml = YAML(typ="safe") with open(MODEL_YAML) as f: jasper_model_definition = yaml.load(f) labels = jasper_model_definition['labels'] # Instantiate necessary neural modules - data_layer = nemo_asr.AudioToTextDataLayer( - shuffle=False, - manifest_filepath=manifest, - labels=labels, batch_size=1) + data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False, manifest_filepath=manifest, labels=labels, batch_size=1) # Define inference DAG audio_signal, audio_signal_len, _, _ = data_layer() - processed_signal, processed_signal_len = data_preprocessor( - input_signal=audio_signal, - length=audio_signal_len) - encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, - length=processed_signal_len) + processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len) + encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=processed_signal_len) log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) if ENABLE_NGRAM: print('Running with beam search') - beam_predictions = beam_search_with_lm( - log_probs=log_probs, log_probs_length=encoded_len) + beam_predictions = beam_search_with_lm(log_probs=log_probs, log_probs_length=encoded_len) eval_tensors = [beam_predictions] if greedy: @@ -52,6 +55,7 @@ def wav_to_text(manifest, greedy=True): tensors = neural_factory.infer(tensors=eval_tensors) if greedy: from nemo.collections.asr.helpers import post_process_predictions + prediction = post_process_predictions(tensors[0], labels) else: prediction = tensors[0][0][0][0][1] @@ -79,8 +83,7 @@ def transcribe_file(): greedy = True if request.form.get('beam'): if not ENABLE_NGRAM: - return ("Error: Beam Search with ngram LM is not enabled " - "on this server") + return "Error: Beam Search with ngram LM is not enabled " "on this server" greedy = False file_path = os.path.join(WORK_DIR, secure_filename(f.filename)) f.save(file_path) @@ -89,7 +92,7 @@ def transcribe_file(): manifest['audio_filepath'] = file_path manifest['duration'] = 18000 manifest['text'] = 'todo' - with open(file_path+".json", 'w') as fout: + with open(file_path + ".json", 'w') as fout: fout.write(json.dumps(manifest)) start_t = time.time() transcription = wav_to_text(file_path + ".json", greedy=greedy) diff --git a/examples/asr/experimental/garnet.py b/examples/asr/experimental/garnet.py index f557e3b6a1ee..9a8bfeb34781 100644 --- a/examples/asr/experimental/garnet.py +++ b/examples/asr/experimental/garnet.py @@ -13,28 +13,28 @@ from tensorboardX import SummaryWriter import nemo -from nemo.core.callbacks import ValueSetterCallback, UnfreezeCallback +import nemo.collections.asr as nemo_asr import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.las.helpers import process_evaluation_batch, process_evaluation_epoch +from nemo.core.callbacks import UnfreezeCallback, ValueSetterCallback from nemo.utils.lr_policies import SquareAnnealing from nemo.utils.misc import Config -import nemo.collections.asr as nemo_asr -from nemo.collections.asr.las.helpers import process_evaluation_batch, \ - process_evaluation_epoch # Special symbols for seq2seq with cross-entropy criterion and 
aux CTC loss SS = namedtuple('SS', 'id char name') _ = 0 sss = [ - SS(_ + 0, '#', 'pad'), SS(_ + 1, '<', 'bos'), SS(_ + 2, '>', 'eos'), # CE + SS(_ + 0, '#', 'pad'), + SS(_ + 1, '<', 'bos'), + SS(_ + 2, '>', 'eos'), # CE # SS(_ + 3, '@', 'ctc_blank') # CTC ] def parse_args(): parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], - description='GarNet', - conflict_handler='resolve') + parents=[nm_argparse.NemoArgParser()], description='GarNet', conflict_handler='resolve', + ) parser.set_defaults( checkpoint_dir=None, optimizer="novograd", @@ -44,15 +44,20 @@ def parse_args(): weight_decay=1e-5, lr=0.02, amp_opt_level="O1", - create_tb_writer=True + create_tb_writer=True, ) # Overwrite default args - parser.add_argument("--num_epochs", type=int, default=None, required=True, - help="number of epochs to train. You should specify" - "either num_epochs or max_steps") - parser.add_argument("--model_config", type=str, required=True, - help="model configuration file: model.yaml") + parser.add_argument( + "--num_epochs", + type=int, + default=None, + required=True, + help="number of epochs to train. You should specify" "either num_epochs or max_steps", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) # Create new args parser.add_argument("--exp_name", default="GarNet", type=str) @@ -82,9 +87,7 @@ def parse_cfg(args): cfg['optimization']['batch_size'] = args.batch_size # Calculating real inference batch_size - inference_batch_size = int( - args.eval_batch_size / cfg['inference']['beam_size'] - ) + inference_batch_size = int(args.eval_batch_size / cfg['inference']['beam_size']) assert inference_batch_size >= 1 cfg['inference']['batch_size'] = inference_batch_size @@ -103,54 +106,44 @@ def create_dag(args, cfg, logger, num_gpus): labels=cfg['target']['labels'], batch_size=cfg['optimization']['batch_size'], eos_id=cfg['target']['eos_id'], - **cfg['AudioToTextDataLayer']['train'] + **cfg['AudioToTextDataLayer']['train'], ) data_evals = [] if args.eval_datasets: for val_path in args.eval_datasets: - data_evals.append(nemo_asr.AudioToTextDataLayer( - manifest_filepath=val_path, - labels=cfg['target']['labels'], - batch_size=cfg['inference']['batch_size'], - eos_id=cfg['target']['eos_id'], - **cfg['AudioToTextDataLayer']['eval'] - )) + data_evals.append( + nemo_asr.AudioToTextDataLayer( + manifest_filepath=val_path, + labels=cfg['target']['labels'], + batch_size=cfg['inference']['batch_size'], + eos_id=cfg['target']['eos_id'], + **cfg['AudioToTextDataLayer']['eval'], + ) + ) else: logger.info("There were no val datasets passed") - data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - **cfg['AudioToMelSpectrogramPreprocessor'] - ) - data_augmentation = nemo_asr.SpectrogramAugmentation( - **cfg['SpectrogramAugmentation'] - ) + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(**cfg['AudioToMelSpectrogramPreprocessor']) + data_augmentation = nemo_asr.SpectrogramAugmentation(**cfg['SpectrogramAugmentation']) encoder = nemo_asr.JasperEncoder( - feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"], - **cfg['JasperEncoder'] + feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"], **cfg['JasperEncoder'], ) - if args.encoder_checkpoint is not None \ - and os.path.exists(args.encoder_checkpoint): + if args.encoder_checkpoint is not None and os.path.exists(args.encoder_checkpoint): if cfg['JasperEncoder']['load']: encoder.restore_from(args.encoder_checkpoint, args.local_rank) - 
logger.info(f'Loaded weights for encoder' - f' from {args.encoder_checkpoint}') + logger.info(f'Loaded weights for encoder' f' from {args.encoder_checkpoint}') if cfg['JasperEncoder']['freeze']: encoder.freeze() logger.info(f'Freeze encoder weights') connector = nemo_asr.JasperRNNConnector( - in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'], - out_channels=cfg['DecoderRNN']['hidden_size'] + in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'], out_channels=cfg['DecoderRNN']['hidden_size'], ) decoder = nemo.backends.pytorch.DecoderRNN( - voc_size=len(cfg['target']['labels']), - bos_id=cfg['target']['bos_id'], - **cfg['DecoderRNN'] + voc_size=len(cfg['target']['labels']), bos_id=cfg['target']['bos_id'], **cfg['DecoderRNN'], ) - if args.decoder_checkpoint is not None \ - and os.path.exists(args.decoder_checkpoint): + if args.decoder_checkpoint is not None and os.path.exists(args.decoder_checkpoint): if cfg['DecoderRNN']['load']: decoder.restore_from(args.decoder_checkpoint, args.local_rank) - logger.info(f'Loaded weights for decoder' - f' from {args.decoder_checkpoint}') + logger.info(f'Loaded weights for decoder' f' from {args.decoder_checkpoint}') if cfg['DecoderRNN']['freeze']: decoder.freeze() logger.info(f'Freeze decoder weights') @@ -165,26 +158,21 @@ def create_dag(args, cfg, logger, num_gpus): total_steps = num_epochs * steps_per_epoch vsc = ValueSetterCallback tf_callback = ValueSetterCallback( - decoder, 'teacher_forcing', - policies=[ - vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0) - ], - total_steps=total_steps + decoder, + 'teacher_forcing', + policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)], + total_steps=total_steps, ) seq_loss = nemo.backends.pytorch.SequenceLoss( pad_id=cfg['target']['pad_id'], smoothing_coef=cfg['optimization']['smoothing_coef'], - sample_wise=cfg['optimization']['sample_wise'] + sample_wise=cfg['optimization']['sample_wise'], ) se_callback = ValueSetterCallback( - seq_loss, 'smoothing_coef', - policies=[ - vsc.Policy( - vsc.Method.Const(seq_loss.smoothing_coef), - start=0.0, end=1.0 - ), - ], - total_steps=total_steps + seq_loss, + 'smoothing_coef', + policies=[vsc.Policy(vsc.Method.Const(seq_loss.smoothing_coef), start=0.0, end=1.0),], + total_steps=total_steps, ) beam_search = nemo.backends.pytorch.BeamSearch( decoder=decoder, @@ -192,75 +180,48 @@ def create_dag(args, cfg, logger, num_gpus): bos_id=cfg['target']['bos_id'], eos_id=cfg['target']['eos_id'], max_len=cfg['target']['max_len'], - beam_size=cfg['inference']['beam_size'] - ) - uf_callback = UnfreezeCallback( - [encoder, decoder], - start_epoch=cfg['optimization']['start_unfreeze'] + beam_size=cfg['inference']['beam_size'], ) + uf_callback = UnfreezeCallback([encoder, decoder], start_epoch=cfg['optimization']['start_unfreeze']) saver_callback = nemo.core.ModuleSaverCallback( - save_modules_list=[encoder, connector, decoder], - folder=args.checkpoint_dir, - step_freq=args.eval_freq + save_modules_list=[encoder, connector, decoder], folder=args.checkpoint_dir, step_freq=args.eval_freq, ) # Creating DAG audios, audio_lens, transcripts, _ = data() - processed_audios, processed_audio_lens = data_preprocessor( - input_signal=audios, - length=audio_lens - ) + processed_audios, processed_audio_lens = data_preprocessor(input_signal=audios, length=audio_lens) augmented_spec = data_augmentation(input_spec=processed_audios) - encoded, _ = encoder( - audio_signal=augmented_spec, - length=processed_audio_lens - ) + encoded, _ = encoder(audio_signal=augmented_spec, 
length=processed_audio_lens) encoded = connector(tensor=encoded) - log_probs, _ = decoder( - targets=transcripts, - encoder_outputs=encoded - ) - train_loss = seq_loss( - log_probs=log_probs, - targets=transcripts - ) + log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded) + train_loss = seq_loss(log_probs=log_probs, targets=transcripts) evals = [] for i, data_eval in enumerate(data_evals): audios, audio_lens, transcripts, _ = data_eval() - processed_audios, processed_audio_lens = data_preprocessor( - input_signal=audios, - length=audio_lens - ) - encoded, _ = encoder( - audio_signal=processed_audios, - length=processed_audio_lens - ) + processed_audios, processed_audio_lens = data_preprocessor(input_signal=audios, length=audio_lens) + encoded, _ = encoder(audio_signal=processed_audios, length=processed_audio_lens) encoded = connector(tensor=encoded) - log_probs, _ = decoder( - targets=transcripts, - encoder_outputs=encoded - ) - loss = seq_loss( - log_probs=log_probs, - targets=transcripts - ) + log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded) + loss = seq_loss(log_probs=log_probs, targets=transcripts) predictions, aw = beam_search(encoder_outputs=encoded) - evals.append((args.eval_datasets[i], - (loss, log_probs, transcripts, predictions, aw))) + evals.append((args.eval_datasets[i], (loss, log_probs, transcripts, predictions, aw),)) # Update config cfg['num_params'] = { 'encoder': encoder.num_weights, 'connector': connector.num_weights, - 'decoder': decoder.num_weights + 'decoder': decoder.num_weights, } cfg['num_params']['total'] = sum(cfg['num_params'].values()) cfg['input']['train'] = {'num_data': num_data} cfg['optimization']['steps_per_epoch'] = steps_per_epoch cfg['optimization']['total_steps'] = total_steps - return (train_loss, evals), cfg, [tf_callback, se_callback, - uf_callback, saver_callback] + return ( + (train_loss, evals), + cfg, + [tf_callback, se_callback, uf_callback, saver_callback], + ) def construct_name(args, cfg): @@ -268,7 +229,7 @@ def construct_name(args, cfg): cfg['model'], args.exp_name, 'bs' + str(cfg['optimization']['batch_size']), - 'epochs' + str(cfg['optimization']['params']['num_epochs']) + 'epochs' + str(cfg['optimization']['params']['num_epochs']), ) if args.work_dir: name = os.path.join(args.work_dir, name) @@ -290,7 +251,8 @@ def main(): create_tb_writer=args.create_tb_writer, files_to_copy=[args.model_config, __file__], cudnn_benchmark=args.cudnn_benchmark, - tensorboard_dir=args.tensorboard_dir) + tensorboard_dir=args.tensorboard_dir, + ) logger = neural_factory.logger tb_writer = neural_factory.tb_writer @@ -307,17 +269,14 @@ def main(): logger.info(f'Using seed {args.random_seed}') # Defining computational graph - (train_loss, evals), cfg, dag_callbacks = create_dag( - args, cfg, logger, neural_factory.world_size) + (train_loss, evals), cfg, dag_callbacks = create_dag(args, cfg, logger, neural_factory.world_size) logger.info('Config:') logger.info(pformat(cfg)) num_data = cfg['input']['train']['num_data'] steps_per_epoch = cfg['optimization']['steps_per_epoch'] total_steps = cfg['optimization']['total_steps'] - logger.info(f'Num data: {num_data}\n' - f'Steps per epoch: {steps_per_epoch}\n' - f'Total steps: {total_steps}') + logger.info(f'Num data: {num_data}\n' f'Steps per epoch: {steps_per_epoch}\n' f'Total steps: {total_steps}') # TODO: Workaround? 
dag_callbacks[0].tb_writer = tb_writer @@ -328,7 +287,7 @@ def main(): tensors=[train_loss], print_func=lambda x: logger.info(f"Loss: {x[0].item()}"), get_tb_values=lambda x: [("loss", x[0])], - tb_writer=tb_writer + tb_writer=tb_writer, ) log_callbacks = [train_callback] target = cfg['target'] @@ -339,20 +298,13 @@ def main(): # TODO: Should be fixed soon, so we don't need to pass exactly list eval_tensors=list(tensors), user_iter_callback=partial( - process_evaluation_batch, - labels=labels, - specials=specials, - tb_writer=tb_writer, - write_attn=False + process_evaluation_batch, labels=labels, specials=specials, tb_writer=tb_writer, write_attn=False, ), user_epochs_done_callback=partial( - process_evaluation_epoch, - tag=os.path.basename(name), - calc_wer=True, - logger=logger + process_evaluation_epoch, tag=os.path.basename(name), calc_wer=True, logger=logger, ), eval_step=args.eval_freq, - tb_writer=tb_writer + tb_writer=tb_writer, ) log_callbacks.append(eval_callback) # noinspection PyTypeChecker @@ -365,14 +317,11 @@ def main(): lr_policy=SquareAnnealing( cfg['optimization']['total_steps'], min_lr=cfg['optimization']['min_lr'], - warmup_steps=( - cfg['optimization']['warmup_epochs'] - * cfg['optimization']['steps_per_epoch'] - ) + warmup_steps=(cfg['optimization']['warmup_epochs'] * cfg['optimization']['steps_per_epoch']), ), optimizer=cfg['optimization']['optimizer'], optimization_params=cfg['optimization']['params'], - batches_per_step=args.iter_per_step + batches_per_step=args.iter_per_step, ) diff --git a/examples/asr/experimental/garnet_rnnlm.py b/examples/asr/experimental/garnet_rnnlm.py index 4d1748f3383f..d2d852300fe8 100644 --- a/examples/asr/experimental/garnet_rnnlm.py +++ b/examples/asr/experimental/garnet_rnnlm.py @@ -13,28 +13,28 @@ from tensorboardX import SummaryWriter import nemo +import nemo.collections.asr as nemo_asr +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.las.helpers import process_evaluation_batch, process_evaluation_epoch from nemo.core.callbacks import ValueSetterCallback from nemo.utils.lr_policies import SquareAnnealing -import nemo.utils.argparse as nm_argparse from nemo.utils.misc import Config -import nemo.collections.asr as nemo_asr -from nemo.collections.asr.las.helpers import process_evaluation_batch, \ - process_evaluation_epoch # Special symbols for seq2seq with cross-entropy criterion and aux CTC loss SS = namedtuple('SS', 'id char name') _ = 0 sss = [ - SS(_ + 0, '#', 'pad'), SS(_ + 1, '<', 'bos'), SS(_ + 2, '>', 'eos'), # CE + SS(_ + 0, '#', 'pad'), + SS(_ + 1, '<', 'bos'), + SS(_ + 2, '>', 'eos'), # CE # SS(_ + 3, '@', 'ctc_blank') # CTC ] def parse_args(): parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], - description='GarNet RnnLM', - conflict_handler='resolve') + parents=[nm_argparse.NemoArgParser()], description='GarNet RnnLM', conflict_handler='resolve', + ) parser.set_defaults( checkpoint_dir=None, optimizer="novograd", @@ -44,17 +44,23 @@ def parse_args(): weight_decay=1e-5, lr=0.02, amp_opt_level="O1", - create_tb_writer=True + create_tb_writer=True, ) # Overwrite default args - parser.add_argument("--num_epochs", type=int, default=None, required=True, - help="number of epochs to train. 
You should specify" - "either num_epochs or max_steps") - parser.add_argument("--model_config", type=str, required=True, - help="model configuration file: model.yaml") - parser.add_argument("--eval_datasets", type=str, required=True, - help="validation dataset path") + parser.add_argument( + "--num_epochs", + type=int, + default=None, + required=True, + help="number of epochs to train. You should specify" "either num_epochs or max_steps", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) + parser.add_argument( + "--eval_datasets", type=str, required=True, help="validation dataset path", + ) # Create new args parser.add_argument("--exp_name", default="GarNet", type=str) @@ -104,12 +110,10 @@ def create_dag(args, cfg, num_gpus): labels=cfg['target']['labels'], eos_id=cfg['target']['eos_id'], batch_size=cfg['inference']['batch_size'], - load_audio=False + load_audio=False, ) decoder = nemo.backends.pytorch.DecoderRNN( - voc_size=len(cfg['target']['labels']), - bos_id=cfg['target']['bos_id'], - **cfg['DecoderRNN'] + voc_size=len(cfg['target']['labels']), bos_id=cfg['target']['bos_id'], **cfg['DecoderRNN'], ) num_data = len(data) batch_size = cfg['optimization']['batch_size'] @@ -118,42 +122,27 @@ def create_dag(args, cfg, num_gpus): total_steps = num_epochs * steps_per_epoch vsc = ValueSetterCallback tf_callback = ValueSetterCallback( - decoder, 'teacher_forcing', - policies=[ - vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0), - ], - total_steps=total_steps + decoder, + 'teacher_forcing', + policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0),], + total_steps=total_steps, ) seq_loss = nemo.backends.pytorch.SequenceLoss( - pad_id=cfg['target']['pad_id'], - smoothing_coef=cfg['optimization']['smoothing_coef'] + pad_id=cfg['target']['pad_id'], smoothing_coef=cfg['optimization']['smoothing_coef'], ) saver_callback = nemo.core.ModuleSaverCallback( - save_modules_list=[decoder], - folder=args.checkpoint_dir, - step_freq=args.checkpoint_save_freq + save_modules_list=[decoder], folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq, ) # Creating DAG texts, _ = data() - log_probs, _ = decoder( - targets=texts - ) - train_loss = seq_loss( - log_probs=log_probs, - targets=texts - ) + log_probs, _ = decoder(targets=texts) + train_loss = seq_loss(log_probs=log_probs, targets=texts) evals = [] _, _, texts, _ = data_eval() - log_probs, _ = decoder( - targets=texts - ) - eval_loss = seq_loss( - log_probs=log_probs, - targets=texts - ) - evals.append((args.eval_datasets, - (eval_loss, log_probs, texts))) + log_probs, _ = decoder(targets=texts) + eval_loss = seq_loss(log_probs=log_probs, targets=texts) + evals.append((args.eval_datasets, (eval_loss, log_probs, texts))) # Update config cfg['num_params'] = {'decoder': decoder.num_weights} @@ -193,7 +182,8 @@ def main(): create_tb_writer=args.create_tb_writer, files_to_copy=[args.model_config, __file__], cudnn_benchmark=args.cudnn_benchmark, - tensorboard_dir=args.tensorboard_dir) + tensorboard_dir=args.tensorboard_dir, + ) logger = neural_factory.logger tb_writer = neural_factory.tb_writer @@ -210,17 +200,14 @@ def main(): logger.info(f'Using seed {args.random_seed}') # Defining computational graph - (train_loss, evals), cfg, dag_callbacks = create_dag( - args, cfg, neural_factory.world_size) + (train_loss, evals), cfg, dag_callbacks = create_dag(args, cfg, neural_factory.world_size) logger.info('Config:') logger.info(pformat(cfg)) num_data = 
cfg['input']['train']['num_data'] steps_per_epoch = cfg['optimization']['steps_per_epoch'] total_steps = cfg['optimization']['total_steps'] - logger.info(f'Num data: {num_data}\n' - f'Steps per epoch: {steps_per_epoch}\n' - f'Total steps: {total_steps}') + logger.info(f'Num data: {num_data}\n' f'Steps per epoch: {steps_per_epoch}\n' f'Total steps: {total_steps}') dag_callbacks[0].tb_writer = tb_writer @@ -229,7 +216,7 @@ def main(): tensors=[train_loss], print_func=lambda x: logger.info(f"Loss: {x[0].item()}"), get_tb_values=lambda x: [("loss", x[0])], - tb_writer=tb_writer + tb_writer=tb_writer, ) log_callbacks = [train_callback] target = cfg['target'] @@ -239,19 +226,10 @@ def main(): eval_callback = nemo.core.EvaluatorCallback( # TODO: Should be fixed soon, so we don't need to pass exactly list eval_tensors=list(tensors), - user_iter_callback=partial( - process_evaluation_batch, - labels=labels, - specials=specials, - write_attn=False - ), - user_epochs_done_callback=partial( - process_evaluation_epoch, - tag=os.path.basename(name), - logger=logger - ), + user_iter_callback=partial(process_evaluation_batch, labels=labels, specials=specials, write_attn=False,), + user_epochs_done_callback=partial(process_evaluation_epoch, tag=os.path.basename(name), logger=logger,), eval_step=args.eval_freq, - tb_writer=tb_writer + tb_writer=tb_writer, ) log_callbacks.append(eval_callback) # noinspection PyTypeChecker @@ -264,14 +242,11 @@ def main(): lr_policy=SquareAnnealing( cfg['optimization']['total_steps'], min_lr=cfg['optimization']['min_lr'], - warmup_steps=( - cfg['optimization']['warmup_epochs'] - * cfg['optimization']['steps_per_epoch'] - ) + warmup_steps=(cfg['optimization']['warmup_epochs'] * cfg['optimization']['steps_per_epoch']), ), optimizer=cfg['optimization']['optimizer'], optimization_params=cfg['optimization']['params'], - batches_per_step=args.iter_per_step + batches_per_step=args.iter_per_step, ) diff --git a/examples/asr/jasper.py b/examples/asr/jasper.py index ebff40818ed5..f2c7d4b82e02 100644 --- a/examples/asr/jasper.py +++ b/examples/asr/jasper.py @@ -1,25 +1,23 @@ # Copyright (c) 2019 NVIDIA Corporation import argparse import copy -from functools import partial import math import os +from functools import partial from ruamel.yaml import YAML import nemo -from nemo.utils.lr_policies import CosineAnnealing -import nemo.utils.argparse as nm_argparse import nemo.collections.asr as nemo_asr -from nemo.collections.asr.helpers import monitor_asr_train_progress, \ - process_evaluation_batch, process_evaluation_epoch +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch +from nemo.utils.lr_policies import CosineAnnealing def parse_args(): parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], - description='Jasper', - conflict_handler='resolve') + parents=[nm_argparse.NemoArgParser()], description='Jasper', conflict_handler='resolve', + ) parser.set_defaults( checkpoint_dir=None, optimizer="novograd", @@ -27,24 +25,28 @@ def parse_args(): eval_batch_size=64, lr=0.02, amp_opt_level="O1", - create_tb_writer=True + create_tb_writer=True, ) # Overwrite default args - parser.add_argument("--max_steps", type=int, default=None, required=False, - help="max number of steps to train") - parser.add_argument("--num_epochs", type=int, default=None, required=False, - help="number of epochs to train") - parser.add_argument("--model_config", type=str, required=True, - 
help="model configuration file: model.yaml") + parser.add_argument( + "--max_steps", type=int, default=None, required=False, help="max number of steps to train", + ) + parser.add_argument( + "--num_epochs", type=int, default=None, required=False, help="number of epochs to train", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) # Create new args parser.add_argument("--exp_name", default="Jasper", type=str) parser.add_argument("--beta1", default=0.95, type=float) parser.add_argument("--beta2", default=0.25, type=float) parser.add_argument("--warmup_steps", default=0, type=int) - parser.add_argument("--load_dir", default=None, type=str, - help="directory with pre-trained checkpoint") + parser.add_argument( + "--load_dir", default=None, type=str, help="directory with pre-trained checkpoint", + ) args = parser.parse_args() @@ -53,24 +55,15 @@ def parse_args(): return args -def construct_name(name, lr, batch_size, max_steps, num_epochs, wd, optimizer, - iter_per_step): +def construct_name(name, lr, batch_size, max_steps, num_epochs, wd, optimizer, iter_per_step): if max_steps is not None: - return ("{0}-lr_{1}-bs_{2}-s_{3}-wd_{4}-opt_{5}-ips_{6}".format( - name, lr, - batch_size, - max_steps, - wd, - optimizer, - iter_per_step)) + return "{0}-lr_{1}-bs_{2}-s_{3}-wd_{4}-opt_{5}-ips_{6}".format( + name, lr, batch_size, max_steps, wd, optimizer, iter_per_step + ) else: - return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format( - name, lr, - batch_size, - num_epochs, - wd, - optimizer, - iter_per_step)) + return "{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format( + name, lr, batch_size, num_epochs, wd, optimizer, iter_per_step + ) def create_all_dags(args, neural_factory): @@ -102,13 +95,12 @@ def create_all_dags(args, neural_factory): ) N = len(data_layer) - steps_per_epoch = math.ceil( - N / (args.batch_size * args.iter_per_step * args.num_gpus)) + steps_per_epoch = math.ceil(N / (args.batch_size * args.iter_per_step * args.num_gpus)) nemo.logging.info('Have {0} examples to train on.'.format(N)) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - sample_rate=sample_rate, - **jasper_params["AudioToMelSpectrogramPreprocessor"]) + sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"], + ) multiply_batch_config = jasper_params.get('MultiplyBatch', None) if multiply_batch_config: @@ -116,8 +108,7 @@ def create_all_dags(args, neural_factory): spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None) if spectr_augment_config: - data_spectr_augmentation = nemo_asr.SpectrogramAugmentation( - **spectr_augment_config) + data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config) eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"]) eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"]) @@ -141,106 +132,80 @@ def create_all_dags(args, neural_factory): nemo.logging.warning("There were no val datasets passed") jasper_encoder = nemo_asr.JasperEncoder( - feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], - **jasper_params["JasperEncoder"]) + feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"], + ) jasper_decoder = nemo_asr.JasperDecoderForCTC( feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab), - factory=neural_factory) + factory=neural_factory, + ) - ctc_loss = nemo_asr.CTCLossNM( - num_classes=len(vocab)) + 
ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) greedy_decoder = nemo_asr.GreedyCTCDecoder() nemo.logging.info('================================') + nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}") + nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}") nemo.logging.info( - f"Number of parameters in encoder: {jasper_encoder.num_weights}") - nemo.logging.info( - f"Number of parameters in decoder: {jasper_decoder.num_weights}") - nemo.logging.info( - f"Total number of parameters in model: " - f"{jasper_decoder.num_weights + jasper_encoder.num_weights}") + f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}" + ) nemo.logging.info('================================') # Train DAG - audio_signal_t, a_sig_length_t, \ - transcript_t, transcript_len_t = data_layer() - processed_signal_t, p_length_t = data_preprocessor( - input_signal=audio_signal_t, - length=a_sig_length_t) + (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer() + processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t) if multiply_batch_config: - processed_signal_t, p_length_t, transcript_t, transcript_len_t = \ - multiply_batch( - in_x=processed_signal_t, in_x_len=p_length_t, - in_y=transcript_t, - in_y_len=transcript_len_t) + (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch( + in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t, + ) if spectr_augment_config: - processed_signal_t = data_spectr_augmentation( - input_spec=processed_signal_t) + processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t) - encoded_t, encoded_len_t = jasper_encoder( - audio_signal=processed_signal_t, - length=p_length_t) + encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t) log_probs_t = jasper_decoder(encoder_output=encoded_t) predictions_t = greedy_decoder(log_probs=log_probs_t) loss_t = ctc_loss( - log_probs=log_probs_t, - targets=transcript_t, - input_length=encoded_len_t, - target_length=transcript_len_t) + log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t, + ) # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( tensors=[loss_t, predictions_t, transcript_t, transcript_len_t], - print_func=partial( - monitor_asr_train_progress, - labels=vocab), + print_func=partial(monitor_asr_train_progress, labels=vocab), get_tb_values=lambda x: [("loss", x[0])], tb_writer=neural_factory.tb_writer, ) chpt_callback = nemo.core.CheckpointCallback( - folder=neural_factory.checkpoint_dir, - load_from_folder=args.load_dir, - step_freq=args.checkpoint_save_freq) + folder=neural_factory.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq, + ) callbacks = [train_callback, chpt_callback] # assemble eval DAGs for i, eval_dl in enumerate(data_layers_eval): - audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \ - eval_dl() - processed_signal_e, p_length_e = data_preprocessor( - input_signal=audio_signal_e, - length=a_sig_length_e) - encoded_e, encoded_len_e = jasper_encoder( - audio_signal=processed_signal_e, - length=p_length_e) + (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl() + processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, 
length=a_sig_length_e) + encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_signal_e, length=p_length_e) log_probs_e = jasper_decoder(encoder_output=encoded_e) predictions_e = greedy_decoder(log_probs=log_probs_e) loss_e = ctc_loss( - log_probs=log_probs_e, - targets=transcript_e, - input_length=encoded_len_e, - target_length=transcript_len_e) + log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + ) # create corresponding eval callback tagname = os.path.basename(args.eval_datasets[i]).split(".")[0] eval_callback = nemo.core.EvaluatorCallback( - eval_tensors=[loss_e, predictions_e, - transcript_e, transcript_len_e], - user_iter_callback=partial( - process_evaluation_batch, - labels=vocab), - user_epochs_done_callback=partial( - process_evaluation_epoch, - tag=tagname), + eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,], + user_iter_callback=partial(process_evaluation_batch, labels=vocab), + user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname), eval_step=args.eval_freq, - tb_writer=neural_factory.tb_writer) + tb_writer=neural_factory.tb_writer, + ) callbacks.append(eval_callback) return loss_t, callbacks, steps_per_epoch @@ -257,7 +222,8 @@ def main(): args.num_epochs, args.weight_decay, args.optimizer, - args.iter_per_step) + args.iter_per_step, + ) log_dir = name if args.work_dir: log_dir = os.path.join(args.work_dir, name) @@ -272,7 +238,8 @@ def main(): create_tb_writer=args.create_tb_writer, files_to_copy=[args.model_config, __file__], cudnn_benchmark=args.cudnn_benchmark, - tensorboard_dir=args.tensorboard_dir) + tensorboard_dir=args.tensorboard_dir, + ) args.num_gpus = neural_factory.world_size checkpoint_dir = neural_factory.checkpoint_dir @@ -280,28 +247,27 @@ def main(): nemo.logging.info('Doing ALL GPU') # build dags - train_loss, callbacks, steps_per_epoch = \ - create_all_dags(args, neural_factory) + train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory) # train model neural_factory.train( tensors_to_optimize=[train_loss], callbacks=callbacks, lr_policy=CosineAnnealing( - args.max_steps if args.max_steps is not None else - args.num_epochs * steps_per_epoch, - warmup_steps=args.warmup_steps), + args.max_steps if args.max_steps is not None else args.num_epochs * steps_per_epoch, + warmup_steps=args.warmup_steps, + ), optimizer=args.optimizer, optimization_params={ "num_epochs": args.num_epochs, "max_steps": args.max_steps, "lr": args.lr, - "betas": ( - args.beta1, - args.beta2), + "betas": (args.beta1, args.beta2), "weight_decay": args.weight_decay, - "grad_norm_clip": None}, - batches_per_step=args.iter_per_step) + "grad_norm_clip": None, + }, + batches_per_step=args.iter_per_step, + ) if __name__ == '__main__': diff --git a/examples/asr/jasper_aishell.py b/examples/asr/jasper_aishell.py index 03f3da1906e7..a6115d6c8f77 100644 --- a/examples/asr/jasper_aishell.py +++ b/examples/asr/jasper_aishell.py @@ -1,24 +1,22 @@ # Copyright (c) 2019 NVIDIA Corporation import argparse import copy -from functools import partial import os +from functools import partial from ruamel.yaml import YAML import nemo -from nemo.utils.lr_policies import SquareAnnealing -import nemo.utils.argparse as nm_argparse import nemo.collections.asr as nemo_asr -from nemo.collections.asr.helpers import monitor_asr_train_progress, \ - process_evaluation_batch, process_evaluation_epoch +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.helpers import 
monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch +from nemo.utils.lr_policies import SquareAnnealing def parse_args(): parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], - description='Jasper Aishell', - conflict_handler='resolve') + parents=[nm_argparse.NemoArgParser()], description='Jasper Aishell', conflict_handler='resolve', + ) parser.set_defaults( model_config="./configs/jasper12x1SEP.yaml", @@ -33,12 +31,11 @@ def parse_args(): warmup_steps=8000, checkpoint_save_freq=1000, train_eval_freq=50, - eval_freq=4000 + eval_freq=4000, ) # Create new args - parser.add_argument("--vocab_file", type=str, required=True, - help="vocabulary file path") + parser.add_argument("--vocab_file", type=str, required=True, help="vocabulary file path") parser.add_argument("--exp_name", default="Jasper Aishell", type=str) parser.add_argument("--beta1", default=0.95, type=float) parser.add_argument("--beta2", default=0.25, type=float) @@ -51,15 +48,10 @@ def parse_args(): return args -def construct_name(name, lr, batch_size, num_epochs, wd, optimizer, - iter_per_step): - return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format( - name, lr, - batch_size, - num_epochs, - wd, - optimizer, - iter_per_step)) +def construct_name(name, lr, batch_size, num_epochs, wd, optimizer, iter_per_step): + return "{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format( + name, lr, batch_size, num_epochs, wd, optimizer, iter_per_step + ) def load_vocab(vocab_file): @@ -107,8 +99,8 @@ def create_all_dags(args, neural_factory): nemo.logging.info('Have {0} examples to train on.'.format(N)) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - sample_rate=sample_rate, - **jasper_params["AudioToMelSpectrogramPreprocessor"]) + sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"], + ) multiply_batch_config = jasper_params.get('MultiplyBatch', None) if multiply_batch_config: @@ -116,8 +108,7 @@ def create_all_dags(args, neural_factory): spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None) if spectr_augment_config: - data_spectr_augmentation = nemo_asr.SpectrogramAugmentation( - **spectr_augment_config) + data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config) eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"]) eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"]) @@ -142,108 +133,81 @@ def create_all_dags(args, neural_factory): nemo.logging.warning("There were no val datasets passed") jasper_encoder = nemo_asr.JasperEncoder( - feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], - **jasper_params["JasperEncoder"]) + feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"], + ) jasper_decoder = nemo_asr.JasperDecoderForCTC( feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab), - factory=neural_factory) + factory=neural_factory, + ) - ctc_loss = nemo_asr.CTCLossNM( - num_classes=len(vocab)) + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) greedy_decoder = nemo_asr.GreedyCTCDecoder() nemo.logging.info('================================') + nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}") + nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}") nemo.logging.info( - f"Number of parameters in encoder: {jasper_encoder.num_weights}") - nemo.logging.info( - f"Number of parameters in decoder: 
{jasper_decoder.num_weights}") - nemo.logging.info( - f"Total number of parameters in model: " - f"{jasper_decoder.num_weights + jasper_encoder.num_weights}") + f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}" + ) nemo.logging.info('================================') # Train DAG - audio_signal_t, a_sig_length_t, \ - transcript_t, transcript_len_t = data_layer() - processed_signal_t, p_length_t = data_preprocessor( - input_signal=audio_signal_t, - length=a_sig_length_t) + (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer() + processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t) if multiply_batch_config: - processed_signal_t, p_length_t, transcript_t, transcript_len_t = \ - multiply_batch( - in_x=processed_signal_t, in_x_len=p_length_t, - in_y=transcript_t, - in_y_len=transcript_len_t) + (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch( + in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t, + ) if spectr_augment_config: - processed_signal_t = data_spectr_augmentation( - input_spec=processed_signal_t) + processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t) - encoded_t, encoded_len_t = jasper_encoder( - audio_signal=processed_signal_t, - length=p_length_t) + encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t) log_probs_t = jasper_decoder(encoder_output=encoded_t) predictions_t = greedy_decoder(log_probs=log_probs_t) loss_t = ctc_loss( - log_probs=log_probs_t, - targets=transcript_t, - input_length=encoded_len_t, - target_length=transcript_len_t) + log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t, + ) # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( tensors=[loss_t, predictions_t, transcript_t, transcript_len_t], - print_func=partial( - monitor_asr_train_progress, - labels=vocab, - eval_metric='CER'), + print_func=partial(monitor_asr_train_progress, labels=vocab, eval_metric='CER'), step_freq=args.train_eval_freq, get_tb_values=lambda x: [("loss", x[0])], tb_writer=neural_factory.tb_writer, ) chpt_callback = nemo.core.CheckpointCallback( - folder=neural_factory.checkpoint_dir, - step_freq=args.checkpoint_save_freq) + folder=neural_factory.checkpoint_dir, step_freq=args.checkpoint_save_freq, + ) callbacks = [train_callback, chpt_callback] # assemble eval DAGs for i, eval_dl in enumerate(data_layers_eval): - audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \ - eval_dl() - processed_signal_e, p_length_e = data_preprocessor( - input_signal=audio_signal_e, - length=a_sig_length_e) - encoded_e, encoded_len_e = jasper_encoder( - audio_signal=processed_signal_e, - length=p_length_e) + (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl() + processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e) + encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_signal_e, length=p_length_e) log_probs_e = jasper_decoder(encoder_output=encoded_e) predictions_e = greedy_decoder(log_probs=log_probs_e) loss_e = ctc_loss( - log_probs=log_probs_e, - targets=transcript_e, - input_length=encoded_len_e, - target_length=transcript_len_e) + log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + ) # 
create corresponding eval callback tagname = os.path.basename(args.eval_datasets[i]).split(".")[0] eval_callback = nemo.core.EvaluatorCallback( - eval_tensors=[loss_e, predictions_e, - transcript_e, transcript_len_e], - user_iter_callback=partial( - process_evaluation_batch, - labels=vocab), - user_epochs_done_callback=partial( - process_evaluation_epoch, - eval_metric='CER', - tag=tagname), + eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,], + user_iter_callback=partial(process_evaluation_batch, labels=vocab), + user_epochs_done_callback=partial(process_evaluation_epoch, eval_metric='CER', tag=tagname), eval_step=args.eval_freq, - tb_writer=neural_factory.tb_writer) + tb_writer=neural_factory.tb_writer, + ) callbacks.append(eval_callback) return loss_t, callbacks, steps_per_epoch @@ -258,7 +222,8 @@ def main(): args.num_epochs, args.weight_decay, args.optimizer, - args.iter_per_step) + args.iter_per_step, + ) log_dir = name if args.work_dir: log_dir = os.path.join(args.work_dir, name) @@ -273,7 +238,8 @@ def main(): create_tb_writer=args.create_tb_writer, files_to_copy=[args.model_config, __file__], cudnn_benchmark=args.cudnn_benchmark, - tensorboard_dir=args.tensorboard_dir) + tensorboard_dir=args.tensorboard_dir, + ) args.num_gpus = neural_factory.world_size checkpoint_dir = neural_factory.checkpoint_dir @@ -281,25 +247,23 @@ def main(): nemo.logging.info('Doing ALL GPU') # build dags - train_loss, callbacks, steps_per_epoch = \ - create_all_dags(args, neural_factory) + train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory) # train model neural_factory.train( tensors_to_optimize=[train_loss], callbacks=callbacks, - lr_policy=SquareAnnealing(args.num_epochs * steps_per_epoch, - warmup_steps=args.warmup_steps), + lr_policy=SquareAnnealing(args.num_epochs * steps_per_epoch, warmup_steps=args.warmup_steps), optimizer=args.optimizer, optimization_params={ "num_epochs": args.num_epochs, "lr": args.lr, - "betas": ( - args.beta1, - args.beta2), + "betas": (args.beta1, args.beta2), "weight_decay": args.weight_decay, - "grad_norm_clip": None}, - batches_per_step=args.iter_per_step) + "grad_norm_clip": None, + }, + batches_per_step=args.iter_per_step, + ) if __name__ == '__main__': diff --git a/examples/asr/jasper_aishell_infer.py b/examples/asr/jasper_aishell_infer.py index e8ab0361849e..919355731493 100644 --- a/examples/asr/jasper_aishell_infer.py +++ b/examples/asr/jasper_aishell_infer.py @@ -8,8 +8,7 @@ import nemo import nemo.collections.asr as nemo_asr -from nemo.collections.asr.helpers import word_error_rate, post_process_predictions, \ - post_process_transcripts +from nemo.collections.asr.helpers import post_process_predictions, post_process_transcripts, word_error_rate def load_vocab(vocab_file): @@ -48,8 +47,8 @@ def main(): if args.local_rank is not None: if args.lm_path: raise NotImplementedError( - "Beam search decoder with LM does not currently support " - "evaluation on multi-gpu.") + "Beam search decoder with LM does not currently support " "evaluation on multi-gpu." 
+ ) device = nemo.core.DeviceType.AllGpu else: device = nemo.core.DeviceType.GPU @@ -59,7 +58,8 @@ def main(): backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=nemo.core.Optimization.mxprO1, - placement=device) + placement=device, + ) if args.local_rank is not None: nemo.logging.info('Doing ALL GPU') @@ -84,21 +84,21 @@ def main(): sample_rate=sample_rate, labels=vocab, batch_size=batch_size, - **eval_dl_params) + **eval_dl_params, + ) n = len(data_layer) nemo.logging.info('Evaluating {0} examples'.format(n)) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - sample_rate=sample_rate, - **jasper_params["AudioToMelSpectrogramPreprocessor"]) + sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"], + ) jasper_encoder = nemo_asr.JasperEncoder( - feat_in=jasper_params[ - "AudioToMelSpectrogramPreprocessor"]["features"], - **jasper_params["JasperEncoder"]) + feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"], + ) jasper_decoder = nemo_asr.JasperDecoderForCTC( - feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], - num_classes=len(vocab)) + feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab), + ) greedy_decoder = nemo_asr.GreedyCTCDecoder() if args.lm_path: @@ -115,48 +115,40 @@ def main(): cutoff_prob=cutoff_prob, cutoff_top_n=cutoff_top_n, lm_path=args.lm_path, - num_cpus=max(os.cpu_count(), 1)) + num_cpus=max(os.cpu_count(), 1), + ) nemo.logging.info('================================') + nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}") + nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}") nemo.logging.info( - f"Number of parameters in encoder: {jasper_encoder.num_weights}") - nemo.logging.info( - f"Number of parameters in decoder: {jasper_decoder.num_weights}") - nemo.logging.info( - f"Total number of parameters in model: " - f"{jasper_decoder.num_weights + jasper_encoder.num_weights}") + f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}" + ) nemo.logging.info('================================') - audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = \ - data_layer() - processed_signal_e1, p_length_e1 = data_preprocessor( - input_signal=audio_signal_e1, - length=a_sig_length_e1) - encoded_e1, encoded_len_e1 = jasper_encoder( - audio_signal=processed_signal_e1, - length=p_length_e1) + (audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1,) = data_layer() + processed_signal_e1, p_length_e1 = data_preprocessor(input_signal=audio_signal_e1, length=a_sig_length_e1) + encoded_e1, encoded_len_e1 = jasper_encoder(audio_signal=processed_signal_e1, length=p_length_e1) log_probs_e1 = jasper_decoder(encoder_output=encoded_e1) predictions_e1 = greedy_decoder(log_probs=log_probs_e1) - eval_tensors = [log_probs_e1, predictions_e1, - transcript_e1, transcript_len_e1, encoded_len_e1] + eval_tensors = [ + log_probs_e1, + predictions_e1, + transcript_e1, + transcript_len_e1, + encoded_len_e1, + ] if args.lm_path: - beam_predictions_e1 = beam_search_with_lm( - log_probs=log_probs_e1, log_probs_length=encoded_len_e1) + beam_predictions_e1 = beam_search_with_lm(log_probs=log_probs_e1, log_probs_length=encoded_len_e1) eval_tensors.append(beam_predictions_e1) - evaluated_tensors = neural_factory.infer( - tensors=eval_tensors, - checkpoint_dir=load_dir, - ) + evaluated_tensors = 

     greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
-    references = post_process_transcripts(
-        evaluated_tensors[2], evaluated_tensors[3], vocab)
-    cer = word_error_rate(hypotheses=greedy_hypotheses,
-                          references=references,
-                          use_cer=True)
+    references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab)
+    cer = word_error_rate(hypotheses=greedy_hypotheses, references=references, use_cer=True)
     nemo.logging.info("Greedy CER {:.2f}%".format(cer * 100))

     if args.lm_path:
@@ -167,8 +159,7 @@ def main():
             for j in i:
                 beam_hypotheses.append(j[0][1])

-        cer = word_error_rate(
-            hypotheses=beam_hypotheses, references=references, use_cer=True)
+        cer = word_error_rate(hypotheses=beam_hypotheses, references=references, use_cer=True)
         nemo.logging.info("Beam CER {:.2f}".format(cer * 100))

     if args.save_logprob:
@@ -176,8 +167,7 @@ def main():
         logprob = []
         for i, batch in enumerate(evaluated_tensors[0]):
             for j in range(batch.shape[0]):
-                logprob.append(
-                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
+                logprob.append(batch[j][: evaluated_tensors[4][i][j], :].cpu().numpy())
         with open(args.save_logprob, 'wb') as f:
             pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py
index 3695c1c28922..4a307f983595 100644
--- a/examples/asr/jasper_an4.py
+++ b/examples/asr/jasper_an4.py
@@ -1,19 +1,24 @@
 # Copyright (c) 2019 NVIDIA Corporation
 import argparse
 import copy
-from functools import partial
 import math
 import os
+from functools import partial

 from ruamel.yaml import YAML

 import nemo
+import nemo.collections.asr as nemo_asr
 import nemo.utils.argparse as nm_argparse
+from nemo.collections.asr.helpers import (
+    monitor_asr_train_progress,
+    post_process_predictions,
+    post_process_transcripts,
+    process_evaluation_batch,
+    process_evaluation_epoch,
+    word_error_rate,
+)
 from nemo.utils.lr_policies import CosineAnnealing
-import nemo.collections.asr as nemo_asr
-from nemo.collections.asr.helpers import monitor_asr_train_progress, \
-    process_evaluation_batch, process_evaluation_epoch, word_error_rate, \
-    post_process_predictions, post_process_transcripts


 def create_dags(jasper_params, args, nf):
@@ -26,15 +31,11 @@ def create_dags(jasper_params, args, nf):
     del train_dl_params["eval"]

     data_layer = nemo_asr.AudioToTextDataLayer(
-        manifest_filepath=args.train_dataset,
-        labels=vocab,
-        batch_size=args.batch_size,
-        **train_dl_params
+        manifest_filepath=args.train_dataset, labels=vocab, batch_size=args.batch_size, **train_dl_params,
     )

     num_samples = len(data_layer)
-    steps_per_epoch = math.ceil(
-        num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
+    steps_per_epoch = math.ceil(num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
     total_steps = steps_per_epoch * args.num_epochs
     print("Train samples=", num_samples, "num_steps=", total_steps)
@@ -52,10 +53,7 @@ def create_dags(jasper_params, args, nf):
     del eval_dl_params["eval"]

     data_layer_eval = nemo_asr.AudioToTextDataLayer(
-        manifest_filepath=args.eval_datasets,
-        labels=vocab,
-        batch_size=args.eval_batch_size,
-        **eval_dl_params
+        manifest_filepath=args.eval_datasets, labels=vocab, batch_size=args.eval_batch_size, **eval_dl_params,
     )

     num_samples = len(data_layer_eval)
@@ -63,9 +61,7 @@ def create_dags(jasper_params, args, nf):

     jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])

-    jasper_decoder = nemo_asr.JasperDecoderForCTC(
-        num_classes=len(vocab),
-        **jasper_params["JasperDecoderForCTC"])
+    jasper_decoder = nemo_asr.JasperDecoderForCTC(num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])

     ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

@@ -73,67 +69,61 @@ def create_dags(jasper_params, args, nf):

     # Training model
     audio, audio_len, transcript, transcript_len = data_layer()
-    processed, processed_len = data_preprocessor(
-        input_signal=audio, length=audio_len)
-    encoded, encoded_len = jasper_encoder(
-        audio_signal=processed, length=processed_len)
+    processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len)
+    encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len)
     log_probs = jasper_decoder(encoder_output=encoded)
     predictions = greedy_decoder(log_probs=log_probs)
-    loss = ctc_loss(log_probs=log_probs, targets=transcript,
-                    input_length=encoded_len, target_length=transcript_len)
+    loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,)

     # Evaluation model
     audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
-    processed_e, processed_len_e = data_preprocessor(
-        input_signal=audio_e, length=audio_len_e)
-    encoded_e, encoded_len_e = jasper_encoder(
-        audio_signal=processed_e, length=processed_len_e)
+    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e)
+    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e)
     log_probs_e = jasper_decoder(encoder_output=encoded_e)
     predictions_e = greedy_decoder(log_probs=log_probs_e)
-    loss_e = ctc_loss(log_probs=log_probs_e, targets=transcript_e,
-                      input_length=encoded_len_e,
-                      target_length=transcript_len_e)
-    nemo.logging.info(
-        "Num of params in encoder: {0}".format(jasper_encoder.num_weights))
+    loss_e = ctc_loss(
+        log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
+    )
+    nemo.logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights))

     # Callbacks to print info to console and Tensorboard
     train_callback = nemo.core.SimpleLossLoggerCallback(
         tensors=[loss, predictions, transcript, transcript_len],
-        print_func=partial(
-            monitor_asr_train_progress,
-            labels=vocab),
+        print_func=partial(monitor_asr_train_progress, labels=vocab),
         get_tb_values=lambda x: [["loss", x[0]]],
         tb_writer=nf.tb_writer,
     )

-    checkpointer_callback = nemo.core.CheckpointCallback(
-        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)
+    checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)

     eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
     eval_callback = nemo.core.EvaluatorCallback(
         eval_tensors=eval_tensors,
-        user_iter_callback=partial(
-            process_evaluation_batch,
-            labels=vocab),
+        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
         user_epochs_done_callback=process_evaluation_epoch,
         eval_step=args.eval_freq,
-        tb_writer=nf.tb_writer)
+        tb_writer=nf.tb_writer,
+    )
     callbacks = [train_callback, checkpointer_callback, eval_callback]
-    return (loss, eval_tensors, callbacks, total_steps, vocab,
-            log_probs_e, encoded_len_e)
+    return (
+        loss,
+        eval_tensors,
+        callbacks,
+        total_steps,
+        vocab,
+        log_probs_e,
+        encoded_len_e,
+    )


 def main():
     parser = argparse.ArgumentParser(
-        parents=[nm_argparse.NemoArgParser()],
-        description='AN4 ASR',
-        conflict_handler='resolve')
+        parents=[nm_argparse.NemoArgParser()], description='AN4 ASR', conflict_handler='resolve',
+    )

     # Overwrite default args
-    parser.add_argument("--train_dataset", type=str,
-                        help="training dataset path")
-    parser.add_argument("--eval_datasets", type=str, nargs=1,
-                        help="validation dataset path")
+    parser.add_argument("--train_dataset", type=str, help="training dataset path")
+    parser.add_argument("--eval_datasets", type=str, nargs=1, help="validation dataset path")

     # Create new args
     parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
@@ -154,7 +144,7 @@ def main():
         weight_decay=0.005,
         checkpoint_save_freq=1000,
         eval_freq=100,
-        amp_opt_level="O1"
+        amp_opt_level="O1",
     )

     args = parser.parse_args()
@@ -170,7 +160,7 @@ def main():
         random_seed=0,
         log_dir=args.work_dir,
         create_tb_writer=True,
-        cudnn_benchmark=args.cudnn_benchmark
+        cudnn_benchmark=args.cudnn_benchmark,
     )
     tb_writer = nf.tb_writer
     checkpoint_dir = nf.checkpoint_dir
@@ -180,14 +170,15 @@ def main():
     with open(args.model_config) as f:
         jasper_params = yaml.load(f)

-    (loss, eval_tensors, callbacks, total_steps, vocab,
-     log_probs_e, encoded_len_e) = create_dags(jasper_params, args, nf)
+    (loss, eval_tensors, callbacks, total_steps, vocab, log_probs_e, encoded_len_e,) = create_dags(
+        jasper_params, args, nf
+    )

     nf.train(
         tensors_to_optimize=[loss],
         callbacks=callbacks,
         optimizer=args.optimizer,
-        lr_policy=CosineAnnealing(total_steps=total_steps, min_lr=args.lr/100),
+        lr_policy=CosineAnnealing(total_steps=total_steps, min_lr=args.lr / 100),
         optimization_params={
             "num_epochs": args.num_epochs,
             "max_steps": args.max_steps,
@@ -195,9 +186,10 @@ def main():
             "momentum": args.momentum,
             "betas": betas,
             "weight_decay": args.weight_decay,
-            "grad_norm_clip": None},
+            "grad_norm_clip": None,
+        },
         batches_per_step=args.iter_per_step,
-        amp_max_loss_scale=256.,
+        amp_max_loss_scale=256.0,
         # synced_batchnorm=(nf.global_rank is not None),
     )

@@ -205,33 +197,23 @@ def main():
         nemo.logging.info("Testing greedy and beam search with LM WER.")
         # Create BeamSearch NM
         if nf.world_size > 1:
-            nemo.logging.warning("Skipping beam search WER as it does not "
-                                 "work if doing distributed training.")
+            nemo.logging.warning("Skipping beam search WER as it does not " "work if doing distributed training.")
         else:
             beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
-                vocab=vocab,
-                beam_width=64,
-                alpha=2.,
-                beta=1.5,
-                lm_path=args.lm,
-                num_cpus=max(os.cpu_count(), 1))
-            beam_predictions = beam_search_with_lm(
-                log_probs=log_probs_e, log_probs_length=encoded_len_e)
+                vocab=vocab, beam_width=64, alpha=2.0, beta=1.5, lm_path=args.lm, num_cpus=max(os.cpu_count(), 1),
+            )
+            beam_predictions = beam_search_with_lm(log_probs=log_probs_e, log_probs_length=encoded_len_e)
             eval_tensors.append(beam_predictions)

         evaluated_tensors = nf.infer(eval_tensors)
         if nf.global_rank in [0, None]:
-            greedy_hypotheses = post_process_predictions(
-                evaluated_tensors[1], vocab)
-            references = post_process_transcripts(
-                evaluated_tensors[2], evaluated_tensors[3], vocab)
-            wer = word_error_rate(
-                hypotheses=greedy_hypotheses, references=references)
+            greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
+            references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab)
+            wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
             nemo.logging.info("Greedy WER: {:.2f}%".format(wer * 100))
             if wer > wer_thr:
                 nf.sync_all_processes(False)
-                raise ValueError(f"Final eval greedy WER {wer*100:.2f}% > :"
-                                 f"than {wer_thr*100:.2f}%")
{wer_thr*100:.2f}%") + raise ValueError(f"Final eval greedy WER {wer*100:.2f}% > :" f"than {wer_thr*100:.2f}%") nf.sync_all_processes() if nf.world_size == 1: @@ -242,60 +224,54 @@ def main(): for j in i: beam_hypotheses.append(j[0][1]) - beam_wer = word_error_rate( - hypotheses=beam_hypotheses, references=references) + beam_wer = word_error_rate(hypotheses=beam_hypotheses, references=references) nemo.logging.info("Beam WER {:.2f}%".format(beam_wer * 100)) - assert beam_wer <= beam_wer_thr, ( - "Final eval beam WER {:.2f}% > than {:.2f}%".format( - beam_wer*100, beam_wer_thr*100)) - assert beam_wer <= wer, ( - "Final eval beam WER > than the greedy WER.") + assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > than {:.2f}%".format( + beam_wer * 100, beam_wer_thr * 100 + ) + assert beam_wer <= wer, "Final eval beam WER > than the greedy WER." # Reload model weights and train for extra 10 epochs checkpointer_callback = nemo.core.CheckpointCallback( - folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, - force_load=True) + folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, + ) # Distributed Data Parallel changes the underlying class so we need # to reinstantiate Encoder and Decoder args.num_epochs += 10 previous_step_count = total_steps - loss, eval_tensors, callbacks, total_steps, vocab, _, _ = create_dags( - jasper_params, args, nf) + loss, eval_tensors, callbacks, total_steps, vocab, _, _ = create_dags(jasper_params, args, nf) nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, optimizer=args.optimizer, - lr_policy=CosineAnnealing( - warmup_steps=previous_step_count, total_steps=total_steps), + lr_policy=CosineAnnealing(warmup_steps=previous_step_count, total_steps=total_steps), optimization_params={ "num_epochs": args.num_epochs, - "lr": args.lr/100, + "lr": args.lr / 100, "momentum": args.momentum, "betas": betas, "weight_decay": args.weight_decay, - "grad_norm_clip": None}, + "grad_norm_clip": None, + }, reset=True, - amp_max_loss_scale=256., + amp_max_loss_scale=256.0, # synced_batchnorm=(nf.global_rank is not None), ) evaluated_tensors = nf.infer(eval_tensors) if nf.global_rank in [0, None]: - greedy_hypotheses = post_process_predictions( - evaluated_tensors[1], vocab) - references = post_process_transcripts( - evaluated_tensors[2], evaluated_tensors[3], vocab) - wer_new = word_error_rate( - hypotheses=greedy_hypotheses, references=references) + greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) + references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) + wer_new = word_error_rate(hypotheses=greedy_hypotheses, references=references) nemo.logging.info("New greedy WER: {:.2f}%".format(wer_new * 100)) if wer_new > wer * 1.1: nf.sync_all_processes(False) raise ValueError( - f"Fine tuning: new WER {wer_new* 100:.2f}% > than the " - f"previous WER {wer * 100:.2f}%") + f"Fine tuning: new WER {wer_new* 100:.2f}% > than the " f"previous WER {wer * 100:.2f}%" + ) nf.sync_all_processes() # Open the log file and ensure that epochs is strictly increasing @@ -306,7 +282,7 @@ def main(): while line: index = line.find("Starting epoch") if index != -1: - epochs.append(int(line[index+len("Starting epoch"):])) + epochs.append(int(line[index + len("Starting epoch") :])) line = log_file.readline() for i, e in enumerate(epochs): if i != e: diff --git a/examples/asr/jasper_eval.py b/examples/asr/jasper_eval.py index fd62f8d92c67..b4b16699d13f 100644 --- a/examples/asr/jasper_eval.py 
+++ b/examples/asr/jasper_eval.py
@@ -6,13 +6,12 @@
 import os
 import pickle

-from ruamel.yaml import YAML
 import numpy as np
+from ruamel.yaml import YAML

 import nemo
 import nemo.collections.asr as nemo_asr
-from nemo.collections.asr.helpers import word_error_rate, post_process_predictions, \
-    post_process_transcripts
+from nemo.collections.asr.helpers import post_process_predictions, post_process_transcripts, word_error_rate


 def main():
@@ -25,32 +24,35 @@ def main():
     parser.add_argument("--save_logprob", default=None, type=str)
     parser.add_argument("--lm_path", default=None, type=str)
     parser.add_argument(
-        '--alpha', default=2., type=float,
-        help='value of LM weight',
-        required=False)
+        '--alpha', default=2.0, type=float, help='value of LM weight', required=False,
+    )
     parser.add_argument(
-        '--alpha_max', type=float,
+        '--alpha_max',
+        type=float,
         help='maximum value of LM weight (for a grid search in \'eval\' mode)',
-        required=False)
+        required=False,
+    )
     parser.add_argument(
-        '--alpha_step', type=float,
-        help='step for LM weight\'s tuning in \'eval\' mode',
-        required=False, default=0.1)
+        '--alpha_step', type=float, help='step for LM weight\'s tuning in \'eval\' mode', required=False, default=0.1,
+    )
     parser.add_argument(
-        '--beta', default=1.5, type=float,
-        help='value of word count weight',
-        required=False)
+        '--beta', default=1.5, type=float, help='value of word count weight', required=False,
+    )
     parser.add_argument(
-        '--beta_max', type=float,
+        '--beta_max',
+        type=float,
         help='maximum value of word count weight (for a grid search in \
         \'eval\' mode',
-        required=False)
+        required=False,
+    )
     parser.add_argument(
-        '--beta_step', type=float,
+        '--beta_step',
+        type=float,
         help='step for word count weight\'s tuning in \'eval\' mode',
-        required=False, default=0.1)
-    parser.add_argument(
-        "--beam_width", default=128, type=int)
+        required=False,
+        default=0.1,
+    )
+    parser.add_argument("--beam_width", default=128, type=int)

     args = parser.parse_args()
     batch_size = args.batch_size
@@ -59,8 +61,8 @@ def main():
     if args.local_rank is not None:
         if args.lm_path:
             raise NotImplementedError(
-                "Beam search decoder with LM does not currently support "
-                "evaluation on multi-gpu.")
+                "Beam search decoder with LM does not currently support " "evaluation on multi-gpu."
+            )
         device = nemo.core.DeviceType.AllGpu
     else:
         device = nemo.core.DeviceType.GPU
@@ -70,7 +72,8 @@ def main():
         backend=nemo.core.Backend.PyTorch,
         local_rank=args.local_rank,
         optimization_level=nemo.core.Optimization.mxprO1,
-        placement=device)
+        placement=device,
+    )

     if args.local_rank is not None:
         nemo.logging.info('Doing ALL GPU')
@@ -92,56 +95,49 @@ def main():
         sample_rate=sample_rate,
         labels=vocab,
         batch_size=batch_size,
-        **eval_dl_params)
+        **eval_dl_params,
+    )

     N = len(data_layer)
     nemo.logging.info('Evaluating {0} examples'.format(N))

     data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
-        sample_rate=sample_rate,
-        **jasper_params["AudioToMelSpectrogramPreprocessor"])
+        sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"],
+    )
     jasper_encoder = nemo_asr.JasperEncoder(
-        feat_in=jasper_params[
-            "AudioToMelSpectrogramPreprocessor"]["features"],
-        **jasper_params["JasperEncoder"])
+        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"],
+    )
     jasper_decoder = nemo_asr.JasperDecoderForCTC(
-        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
-        num_classes=len(vocab))
+        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab),
+    )
     greedy_decoder = nemo_asr.GreedyCTCDecoder()

     nemo.logging.info('================================')
+    nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
+    nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
     nemo.logging.info(
-        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
-    nemo.logging.info(
-        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
-    nemo.logging.info(
-        f"Total number of parameters in model: "
-        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
+        f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}"
+    )
     nemo.logging.info('================================')

-    audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 =\
-        data_layer()
-    processed_signal_e1, p_length_e1 = data_preprocessor(
-        input_signal=audio_signal_e1,
-        length=a_sig_length_e1)
-    encoded_e1, encoded_len_e1 = jasper_encoder(
-        audio_signal=processed_signal_e1,
-        length=p_length_e1)
+    (audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1,) = data_layer()
+    processed_signal_e1, p_length_e1 = data_preprocessor(input_signal=audio_signal_e1, length=a_sig_length_e1)
+    encoded_e1, encoded_len_e1 = jasper_encoder(audio_signal=processed_signal_e1, length=p_length_e1)
     log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
     predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

-    eval_tensors = [log_probs_e1, predictions_e1,
-                    transcript_e1, transcript_len_e1, encoded_len_e1]
+    eval_tensors = [
+        log_probs_e1,
+        predictions_e1,
+        transcript_e1,
+        transcript_len_e1,
+        encoded_len_e1,
+    ]

-    evaluated_tensors = neural_factory.infer(
-        tensors=eval_tensors,
-        checkpoint_dir=load_dir,
-        cache=True
-    )
+    evaluated_tensors = neural_factory.infer(tensors=eval_tensors, checkpoint_dir=load_dir, cache=True)

     greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
-    references = post_process_transcripts(
-        evaluated_tensors[2], evaluated_tensors[3], vocab)
+    references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab)

     wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
     nemo.logging.info("Greedy WER {:.2f}%".format(wer * 100))
@@ -161,23 +157,18 @@ def main():
         for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
             for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                 nemo.logging.info('================================')
-                nemo.logging.info(
-                    f'Infering with (alpha, beta): ({alpha}, {beta})')
+                nemo.logging.info(f'Inferring with (alpha, beta): ({alpha}, {beta})')
                 beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                     vocab=vocab,
                     beam_width=args.beam_width,
                     alpha=alpha,
                     beta=beta,
                     lm_path=args.lm_path,
-                    num_cpus=max(os.cpu_count(), 1))
-                beam_predictions_e1 = beam_search_with_lm(
-                    log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
-
-                evaluated_tensors = neural_factory.infer(
-                    tensors=[beam_predictions_e1],
-                    use_cache=True,
-                    verbose=False
+                    num_cpus=max(os.cpu_count(), 1),
                 )
+                beam_predictions_e1 = beam_search_with_lm(log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
+
+                evaluated_tensors = neural_factory.infer(tensors=[beam_predictions_e1], use_cache=True, verbose=False,)

                 beam_hypotheses = []
                 # Over mini-batch
@@ -186,8 +177,7 @@ def main():
                     for j in i:
                         beam_hypotheses.append(j[0][1])

-                wer = word_error_rate(
-                    hypotheses=beam_hypotheses, references=references)
+                wer = word_error_rate(hypotheses=beam_hypotheses, references=references)
                 nemo.logging.info("Beam WER {:.2f}%".format(wer * 100))
                 beam_wers.append(((alpha, beta), wer * 100))
@@ -196,18 +186,14 @@ def main():
         nemo.logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
         nemo.logging.info('================================')
         best_beam_wer = min(beam_wers, key=lambda x: x[1])
-        nemo.logging.info('Best (alpha, beta): '
-                          f'{best_beam_wer[0]}, '
-                          f'WER: {best_beam_wer[1]:.2f}%')
+        nemo.logging.info('Best (alpha, beta): ' f'{best_beam_wer[0]}, ' f'WER: {best_beam_wer[1]:.2f}%')

     if args.save_logprob:
         # Convert logits to list of numpy arrays
         logprob = []
         for i, batch in enumerate(evaluated_tensors[0]):
             for j in range(batch.shape[0]):
-                logprob.append(
-                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy()
-                )
+                logprob.append(batch[j][: evaluated_tensors[4][i][j], :].cpu().numpy())
         with open(args.save_logprob, 'wb') as f:
             pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/examples/asr/quartznet.py b/examples/asr/quartznet.py
index a984c27009ef..e199f408e89a 100644
--- a/examples/asr/quartznet.py
+++ b/examples/asr/quartznet.py
@@ -1,24 +1,22 @@
 # Copyright (c) 2019 NVIDIA Corporation
 import argparse
 import copy
-from functools import partial
 import os
+from functools import partial

 from ruamel.yaml import YAML

 import nemo
-from nemo.utils.lr_policies import CosineAnnealing
-import nemo.utils.argparse as nm_argparse
 import nemo.collections.asr as nemo_asr
-from nemo.collections.asr.helpers import monitor_asr_train_progress, \
-    process_evaluation_batch, process_evaluation_epoch
+import nemo.utils.argparse as nm_argparse
+from nemo.collections.asr.helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch
+from nemo.utils.lr_policies import CosineAnnealing


 def parse_args():
     parser = argparse.ArgumentParser(
-        parents=[nm_argparse.NemoArgParser()],
-        description='QuartzNet',
-        conflict_handler='resolve')
+        parents=[nm_argparse.NemoArgParser()], description='QuartzNet', conflict_handler='resolve',
+    )
     parser.set_defaults(
         checkpoint_dir=None,
         optimizer="novograd",
@@ -27,15 +25,20 @@ def parse_args():
         lr=0.01,
         weight_decay=0.001,
         amp_opt_level="O0",
-        create_tb_writer=True
+        create_tb_writer=True,
     )

     # Overwrite default args
-    parser.add_argument("--num_epochs", type=int, default=None, required=True,
-                        help="number of epochs to train. You should specify"
-                             "either num_epochs or max_steps")
-    parser.add_argument("--model_config", type=str, required=True,
-                        help="model configuration file: model.yaml")
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        default=None,
+        required=True,
+        help="number of epochs to train. You should specify either num_epochs or max_steps",
+    )
+    parser.add_argument(
+        "--model_config", type=str, required=True, help="model configuration file: model.yaml",
+    )

     # Create new args
     parser.add_argument("--exp_name", default="QuartzNet", type=str)
@@ -43,8 +46,7 @@ def parse_args():
     parser.add_argument("--beta2", default=0.5, type=float)
     parser.add_argument("--warmup_steps", default=1000, type=int)
     parser.add_argument("--load_dir", default=None, type=str)
-    parser.add_argument("--synced_bn", action='store_true',
-                        help="Use synchronized batch norm")
+    parser.add_argument("--synced_bn", action='store_true', help="Use synchronized batch norm")
     parser.add_argument("--synced_bn_groupsize", default=0, type=int)

     args = parser.parse_args()
@@ -55,11 +57,7 @@ def parse_args():


 def construct_name(name, lr, batch_size, num_epochs, wd, optimizer):
-    return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}".format(
-        name, lr,
-        batch_size,
-        num_epochs, wd,
-        optimizer))
+    return "{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}".format(name, lr, batch_size, num_epochs, wd, optimizer)


 def create_all_dags(args, neural_factory):
@@ -97,8 +95,7 @@ def create_all_dags(args, neural_factory):
     )

     N = len(data_layer_train)
-    steps_per_epoch = int(
-        N / (args.batch_size * args.iter_per_step * args.num_gpus))
+    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))

     # create separate data layers for eval
     # we need separate eval dags for separate eval datasets
@@ -128,20 +125,19 @@ def create_all_dags(args, neural_factory):

     # create shared modules
     data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
-        sample_rate=sample_rate,
-        **quartz_params["AudioToMelSpectrogramPreprocessor"])
+        sample_rate=sample_rate, **quartz_params["AudioToMelSpectrogramPreprocessor"],
+    )

     # (QuartzNet uses the Jasper baseline encoder and decoder)
     encoder = nemo_asr.JasperEncoder(
-        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"],
-        **quartz_params["JasperEncoder"])
+        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"], **quartz_params["JasperEncoder"],
+    )

     decoder = nemo_asr.JasperDecoderForCTC(
-        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"],
-        num_classes=len(vocab))
+        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab),
+    )

-    ctc_loss = nemo_asr.CTCLossNM(
-        num_classes=len(vocab))
+    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

     greedy_decoder = nemo_asr.GreedyCTCDecoder()

@@ -154,92 +150,68 @@ def create_all_dags(args, neural_factory):
     spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
     if spectr_augment_config:
-        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
-            **spectr_augment_config)
+        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

     # assemble train DAG
-    audio_signal_t, a_sig_length_t, \
-        transcript_t, transcript_len_t = data_layer_train()
+    (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer_train()

-    processed_signal_t, p_length_t = data_preprocessor(
-        input_signal=audio_signal_t,
-        length=a_sig_length_t)
+    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

     if multiply_batch_config:
-        processed_signal_t, p_length_t, transcript_t, transcript_len_t = \
-            multiply_batch(
-                in_x=processed_signal_t, in_x_len=p_length_t,
-                in_y=transcript_t,
-                in_y_len=transcript_len_t)
+        (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch(
+            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
+        )

     if spectr_augment_config:
-        processed_signal_t = data_spectr_augmentation(
-            input_spec=processed_signal_t)
+        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

-    encoded_t, encoded_len_t = encoder(
-        audio_signal=processed_signal_t,
-        length=p_length_t)
+    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t)
     log_probs_t = decoder(encoder_output=encoded_t)
     predictions_t = greedy_decoder(log_probs=log_probs_t)
     loss_t = ctc_loss(
-        log_probs=log_probs_t,
-        targets=transcript_t,
-        input_length=encoded_len_t,
-        target_length=transcript_len_t)
+        log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t,
+    )

     # create train callbacks
     train_callback = nemo.core.SimpleLossLoggerCallback(
         tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
-        print_func=partial(
-            monitor_asr_train_progress,
-            labels=vocab),
+        print_func=partial(monitor_asr_train_progress, labels=vocab),
         get_tb_values=lambda x: [["loss", x[0]]],
-        tb_writer=neural_factory.tb_writer)
+        tb_writer=neural_factory.tb_writer,
+    )

     callbacks = [train_callback]

     if args.checkpoint_dir or args.load_dir:
         chpt_callback = nemo.core.CheckpointCallback(
-            folder=args.checkpoint_dir,
-            load_from_folder=args.load_dir,
-            step_freq=args.checkpoint_save_freq)
+            folder=args.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq,
+        )
         callbacks.append(chpt_callback)

     # assemble eval DAGs
     for i, eval_dl in enumerate(data_layers_eval):
-        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \
-            eval_dl()
-        processed_signal_e, p_length_e = data_preprocessor(
-            input_signal=audio_signal_e,
-            length=a_sig_length_e)
-        encoded_e, encoded_len_e = encoder(
-            audio_signal=processed_signal_e,
-            length=p_length_e)
+        (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl()
+        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
+        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e)
         log_probs_e = decoder(encoder_output=encoded_e)
         predictions_e = greedy_decoder(log_probs=log_probs_e)
         loss_e = ctc_loss(
-            log_probs=log_probs_e,
-            targets=transcript_e,
-            input_length=encoded_len_e,
-            target_length=transcript_len_e)
+            log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
+        )

         # create corresponding eval callback
         tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
         eval_callback = nemo.core.EvaluatorCallback(
-            eval_tensors=[loss_e, predictions_e,
-                          transcript_e, transcript_len_e],
-            user_iter_callback=partial(
-                process_evaluation_batch,
-                labels=vocab),
-            user_epochs_done_callback=partial(
-                process_evaluation_epoch,
-                tag=tagname),
+            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,],
+            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
+            user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname),
             eval_step=args.eval_freq,
-            tb_writer=neural_factory.tb_writer)
+            tb_writer=neural_factory.tb_writer,
+        )

         callbacks.append(eval_callback)

@@ -250,13 +222,7 @@ def main():

     args = parse_args()
-    name = construct_name(
-        args.exp_name,
-        args.lr,
-        args.batch_size,
-        args.num_epochs,
-        args.weight_decay,
-        args.optimizer)
+    name = construct_name(args.exp_name, args.lr, args.batch_size, args.num_epochs, args.weight_decay, args.optimizer,)
     work_dir = name
     if args.work_dir:
         work_dir = os.path.join(args.work_dir, name)
@@ -271,7 +237,8 @@ def main():
         create_tb_writer=args.create_tb_writer,
         files_to_copy=[args.model_config, __file__],
         cudnn_benchmark=args.cudnn_benchmark,
-        tensorboard_dir=args.tensorboard_dir)
+        tensorboard_dir=args.tensorboard_dir,
+    )

     args.num_gpus = neural_factory.world_size
     args.checkpoint_dir = neural_factory.checkpoint_dir
@@ -280,28 +247,25 @@ def main():
         nemo.logging.info('Doing ALL GPU')

     # build dags
-    train_loss, callbacks, steps_per_epoch = \
-        create_all_dags(args, neural_factory)
+    train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory)

     # train model
     neural_factory.train(
         tensors_to_optimize=[train_loss],
         callbacks=callbacks,
-        lr_policy=CosineAnnealing(
-            args.num_epochs * steps_per_epoch,
-            warmup_steps=args.warmup_steps),
+        lr_policy=CosineAnnealing(args.num_epochs * steps_per_epoch, warmup_steps=args.warmup_steps),
         optimizer=args.optimizer,
         optimization_params={
             "num_epochs": args.num_epochs,
             "lr": args.lr,
-            "betas": (
-                args.beta1,
-                args.beta2),
+            "betas": (args.beta1, args.beta2),
             "weight_decay": args.weight_decay,
-            "grad_norm_clip": None},
+            "grad_norm_clip": None,
+        },
         batches_per_step=args.iter_per_step,
         synced_batchnorm=args.synced_bn,
-        synced_batchnorm_groupsize=args.synced_bn_groupsize)
+        synced_batchnorm_groupsize=args.synced_bn_groupsize,
+    )


 if __name__ == '__main__':
diff --git a/examples/image/gan.py b/examples/image/gan.py
index 94b7cd69d928..6a01e822830e 100644
--- a/examples/image/gan.py
+++ b/examples/image/gan.py
@@ -7,21 +7,21 @@
 from tensorboardX import SummaryWriter

 import nemo
-from nemo.backends.pytorch.torchvision.helpers import eval_iter_callback, \
-    eval_epochs_done_callback, compute_accuracy
 import nemo.collections.simple_gan as nemo_simple_gan
-
+from nemo.backends.pytorch.torchvision.helpers import compute_accuracy, eval_epochs_done_callback, eval_iter_callback

 parser = argparse.ArgumentParser(description='MNIST')
 parser.add_argument("--local_rank", default=None, type=int)
 parser.add_argument("--batch_size", default=128, type=int)
 parser.add_argument("--num_epochs", default=5000, type=int)
 parser.add_argument("--work_dir", default=None, type=str)
-parser.add_argument("--train_dataset",
-                    # set default=os.getcwd() unless your are running test
-                    default="/home/mrjenkins/TestData", type=str)
-parser.add_argument("--amp_opt_level", choices=['O0', 'O1', 'O2', 'O3'],
-                    default='O0')
+parser.add_argument(
+    "--train_dataset",
+    # set default=os.getcwd() unless you are running tests
+    default="/home/mrjenkins/TestData",
+    type=str,
+)
+parser.add_argument("--amp_opt_level", choices=['O0', 'O1', 'O2', 'O3'], default='O0')

 args = parser.parse_args()
@@ -37,22 +37,18 @@
     optimization_level=args.amp_opt_level,
     log_dir=work_dir,
     create_tb_writer=True,
-    files_to_copy=[__file__]
-    )
+    files_to_copy=[__file__],
+)

 mnist_data = nemo_simple_gan.MnistGanDataLayer(
-    batch_size=batch_size,
-    shuffle=True,
-    train=True,
-    root=args.train_dataset)
+    batch_size=batch_size, shuffle=True, train=True, root=args.train_dataset
+)

-generator = nemo_simple_gan.SimpleGenerator(
-    batch_size=batch_size)
+generator = nemo_simple_gan.SimpleGenerator(batch_size=batch_size)
 discriminator = nemo_simple_gan.SimpleDiscriminator()
 neg_disc_loss = nemo_simple_gan.DiscriminatorLoss(neg=True)
 disc_loss = nemo_simple_gan.DiscriminatorLoss()
-disc_grad_penalty = nemo_simple_gan.GradientPenalty(
-    lambda_=3)
+disc_grad_penalty = nemo_simple_gan.GradientPenalty(lambda_=3)
 interpolater = nemo_simple_gan.InterpolateImage()

 # Create generator DAG
@@ -67,13 +63,10 @@
 real_decision = discriminator(image=real_data)
 interpolated_loss = disc_loss(decision=interpolated_decision)
 real_loss = neg_disc_loss(decision=real_decision)
-grad_penalty = disc_grad_penalty(
-    interpolated_image=interpolated_image,
-    interpolated_decision=interpolated_decision)
+grad_penalty = disc_grad_penalty(interpolated_image=interpolated_image, interpolated_decision=interpolated_decision,)

 # Create Eval DAG
-random_data = nemo_simple_gan.RandomDataLayer(
-    batch_size=batch_size)
+random_data = nemo_simple_gan.RandomDataLayer(batch_size=batch_size)
 latents_e = random_data()
 generated_image_e = generator(latents=latents_e)
@@ -85,19 +78,11 @@
 # For single loss and single optimizer, the following steps can be skipped
 # and an optimizer will be created in trainer.train()
 optimizer_G = neural_factory.create_optimizer(
-    optimizer="adam",
-    things_to_optimize=[generator],
-    optimizer_params={
-        "lr": 1e-4,
-        "betas": (0.5, 0.9),
-    })
+    optimizer="adam", things_to_optimize=[generator], optimizer_params={"lr": 1e-4, "betas": (0.5, 0.9),},
+)
 optimizer_D = neural_factory.create_optimizer(
-    optimizer="adam",
-    things_to_optimize=[discriminator],
-    optimizer_params={
-        "lr": 1e-4,
-        "betas": (0.5, 0.9),
-    })
+    optimizer="adam", things_to_optimize=[discriminator], optimizer_params={"lr": 1e-4, "betas": (0.5, 0.9),},
+)


 def save_image(global_vars):
@@ -137,10 +122,10 @@ def get_tb_name_value(tensors):
     print_func=print_losses,
     get_tb_values=get_tb_name_value,
     step_freq=500,
-    tb_writer=neural_factory.tb_writer)
+    tb_writer=neural_factory.tb_writer,
+)

-checkpoint_callback = nemo.core.CheckpointCallback(
-    folder=neural_factory.checkpoint_dir, step_freq=1000)
+checkpoint_callback = nemo.core.CheckpointCallback(folder=neural_factory.checkpoint_dir, step_freq=1000)

 tensors_to_optimize = [
     (optimizer_D, losses_D),
@@ -151,4 +136,5 @@ def get_tb_name_value(tensors):
 neural_factory.train(
     tensors_to_optimize=tensors_to_optimize,
     callbacks=[eval_callback, logger_callback, checkpoint_callback],
-    optimization_params={"num_epochs": args.num_epochs})
+    optimization_params={"num_epochs": args.num_epochs},
+)
diff --git a/examples/image/resnet50.py b/examples/image/resnet50.py
index a1b3ecf1bda1..90abeceed755 100644
--- a/examples/image/resnet50.py
+++ b/examples/image/resnet50.py
@@ -1,14 +1,15 @@
 # Copyright (c) 2019 NVIDIA Corporation
-from tensorboardX import SummaryWriter
-from nemo.utils.lr_policies import SquareAnnealing
-from nemo.backends.pytorch.torchvision.helpers import eval_iter_callback, \
-    eval_epochs_done_callback, compute_accuracy
-import nemo
 import argparse
 import os
 import sys
-sys.path.insert(0, os.path.abspath(
-    os.path.join(os.path.dirname(__file__), '../..')))
+
+from tensorboardX import SummaryWriter
+
+import nemo
+from nemo.backends.pytorch.torchvision.helpers import compute_accuracy, eval_epochs_done_callback, eval_iter_callback
+from nemo.utils.lr_policies import SquareAnnealing
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

 parser = argparse.ArgumentParser(description='ResNet50 on ImageNet')
 parser.add_argument("--local_rank", default=None, type=int)
@@ -50,44 +51,44 @@
     local_rank=args.local_rank,
     # Set this to nemo.core.Optimization.mxprO1
     # if you have Volta or Turing GPU
-    optimization_level=nemo.core.Optimization.mxprO0)
+    optimization_level=nemo.core.Optimization.mxprO0,
+)

-resnet = neural_factory.get_module(name="resnet50",
-                                   params={"placement": device},
-                                   collection="torchvision",
-                                   pretrained=False)
+resnet = neural_factory.get_module(
+    name="resnet50", params={"placement": device}, collection="torchvision", pretrained=False,
+)

 dl_train = neural_factory.get_module(
-    name="ImageFolderDataLayer", collection="torchvision",
-    params={"batch_size": batch_size,
-            "input_size":
-                resnet.inputs["x"].axis2type[2].dim,
-            "shuffle": True,
-            "path": args.data_root + "train",
-            # "path": "/mnt/D1/Data/ImageNet/ImageFolder/train",
-            "placement": device
-            })
-
-L_train = neural_factory.get_module(
-    name="CrossEntropyLoss", collection="toys",
-    params={"placement": device})
+    name="ImageFolderDataLayer",
+    collection="torchvision",
+    params={
+        "batch_size": batch_size,
+        "input_size": resnet.inputs["x"].axis2type[2].dim,
+        "shuffle": True,
+        "path": args.data_root + "train",
+        # "path": "/mnt/D1/Data/ImageNet/ImageFolder/train",
+        "placement": device,
+    },
+)
+
+L_train = neural_factory.get_module(name="CrossEntropyLoss", collection="toys", params={"placement": device})

 dl_eval = neural_factory.get_module(
-    name="ImageFolderDataLayer", collection="torchvision",
-    params={"batch_size": batch_size,
-            "input_size":
-                resnet.inputs["x"].axis2type[2].dim,
-            "shuffle": False,
-            "is_eval": True,
-            "path": args.data_root + "val",
-            # "path": "/mnt/D1/Data/ImageNet/ImageFolder/val",
-            # "path": "/raid/okuchaiev/Data/ImageNet/ImageFolder/val",
-            "placement": device
-            })
-
-L_eval = neural_factory.get_module(
-    name="CrossEntropyLoss", collection="toys",
-    params={"placement": device})
+    name="ImageFolderDataLayer",
+    collection="torchvision",
+    params={
+        "batch_size": batch_size,
+        "input_size": resnet.inputs["x"].axis2type[2].dim,
+        "shuffle": False,
+        "is_eval": True,
+        "path": args.data_root + "val",
+        # "path": "/mnt/D1/Data/ImageNet/ImageFolder/val",
+        # "path": "/raid/okuchaiev/Data/ImageNet/ImageFolder/val",
+        "placement": device,
+    },
+)
+
+L_eval = neural_factory.get_module(name="CrossEntropyLoss", collection="toys", params={"placement": device})

 step_per_epoch = int(len(dl_train) / (batch_size * num_gpus))
@@ -101,15 +102,19 @@
 e_loss = L_eval(predictions=e_outputs, labels=e_labels)

 callback = nemo.core.SimpleLossLoggerCallback(
-    step_freq=50, tb_writer=tb_writer, tensor_list2str=lambda x: str(
-        x[0].item()), tensor_list2str_evl=lambda x: compute_accuracy(x))
+    step_freq=50,
+    tb_writer=tb_writer,
+    tensor_list2str=lambda x: str(x[0].item()),
+    tensor_list2str_evl=lambda x: compute_accuracy(x),
+)

 callback_eval = nemo.core.EvaluatorCallback(
     eval_tensors=[e_loss, e_outputs, e_labels],
     user_iter_callback=eval_iter_callback,
     user_epochs_done_callback=eval_epochs_done_callback,
     eval_step=10000,
-    tb_writer=tb_writer)
+    tb_writer=tb_writer,
+)

 # Instantiate an optimizer to perform `train` action
 optimizer = neural_factory.get_trainer(
@@ -119,9 +124,14 @@
             "lr": learning_rate,
             "max_steps": max_steps,
             "weight_decay": weight_decay,
-            "momentum": momentum}})
-
-optimizer.train(tensors_to_optimize=[train_loss],
-                tensors_to_evaluate=[outputs, labels],
-                callbacks=[callback, callback_eval],
-                lr_policy=SquareAnnealing(num_epochs * step_per_epoch))
+            "momentum": momentum,
+        }
+    }
+)
+
+optimizer.train(
+    tensors_to_optimize=[train_loss],
+    tensors_to_evaluate=[outputs, labels],
+    callbacks=[callback, callback_eval],
+    lr_policy=SquareAnnealing(num_epochs * step_per_epoch),
+)
diff --git a/examples/image/transfer_learning.py b/examples/image/transfer_learning.py
index ffd2f6282b86..2ef2edcc1f54 100644
--- a/examples/image/transfer_learning.py
+++ b/examples/image/transfer_learning.py
@@ -1,19 +1,19 @@
-from tensorboardX import SummaryWriter
-import nemo
-from nemo.backends.pytorch.torchvision.helpers import eval_iter_callback, \
-    eval_epochs_done_callback, compute_accuracy
 import argparse
 import os
 import subprocess
-import zipfile
 import sys
-sys.path.insert(0, os.path.abspath(
-    os.path.join(os.path.dirname(__file__), '../..')))
+import zipfile
+
+from tensorboardX import SummaryWriter
+
+import nemo
+from nemo.backends.pytorch.torchvision.helpers import compute_accuracy, eval_epochs_done_callback, eval_iter_callback
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

 if not os.path.isdir("hymenoptera_data"):
     print("Datafolder not found. Downloading data from the Web")
-    subprocess.run(
-        ["wget", "https://download.pytorch.org/tutorial/hymenoptera_data.zip"])
+    subprocess.run(["wget", "https://download.pytorch.org/tutorial/hymenoptera_data.zip"])
     zip_ref = zipfile.ZipFile('hymenoptera_data.zip', 'r')
     zip_ref.extractall('.')
     zip_ref.close()
@@ -51,41 +51,38 @@
     device = nemo.core.DeviceType.GPU

 # Instantiate Neural Factory and Neural Modules
-neural_factory = nemo.core.NeuralModuleFactory(
-    backend=nemo.core.Backend.PyTorch,
-    placement=device)
+neural_factory = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, placement=device)

 # NOTICE: pretrain=True argument
-resnet = neural_factory.get_module(name="resnet18",
-                                   params={"num_classes": 2},
-                                   collection="torchvision",
-                                   pretrained=True)
+resnet = neural_factory.get_module(
+    name="resnet18", params={"num_classes": 2}, collection="torchvision", pretrained=True,
+)

 dl_train = neural_factory.get_module(
-    name="ImageFolderDataLayer", collection="torchvision",
-    params={"batch_size": batch_size,
-            "input_size":
-                resnet.inputs["x"].axis2type[2].dim,
-            "shuffle": True,
-            "path": "hymenoptera_data/train",
-            })
+    name="ImageFolderDataLayer",
+    collection="torchvision",
+    params={
+        "batch_size": batch_size,
+        "input_size": resnet.inputs["x"].axis2type[2].dim,
+        "shuffle": True,
+        "path": "hymenoptera_data/train",
+    },
+)

 dl_eval = neural_factory.get_module(
-    name="ImageFolderDataLayer", collection="torchvision",
-    params={"batch_size": batch_size,
-            "input_size":
-                resnet.inputs["x"].axis2type[2].dim,
-            "shuffle": False,
-            "path": "hymenoptera_data/val",
-            })
-
-L_train = neural_factory.get_module(
-    name="CrossEntropyLoss", collection="toys",
-    params={})
-
-L_eval = neural_factory.get_module(
-    name="CrossEntropyLoss", collection="toys",
-    params={})
+    name="ImageFolderDataLayer",
+    collection="torchvision",
+    params={
+        "batch_size": batch_size,
+        "input_size": resnet.inputs["x"].axis2type[2].dim,
+        "shuffle": False,
+        "path": "hymenoptera_data/val",
+    },
+)
+
+L_train = neural_factory.get_module(name="CrossEntropyLoss", collection="toys", params={})
+
+L_eval = neural_factory.get_module(name="CrossEntropyLoss", collection="toys", params={})

 # NOTICE: Freeze all Neural Module's weights
 resnet.freeze()
@@ -101,15 +98,19 @@
 e_loss = L_eval(predictions=e_outputs, labels=e_labels)
 callback = nemo.core.SimpleLossLoggerCallback(
-    step_freq=20, tb_writer=tb_writer, tensor_list2str=lambda x: str(
-        x[0].item()), tensor_list2str_evl=lambda x: compute_accuracy(x))
+    step_freq=20,
+    tb_writer=tb_writer,
+    tensor_list2str=lambda x: str(x[0].item()),
+    tensor_list2str_evl=lambda x: compute_accuracy(x),
+)

 callback_eval = nemo.core.EvaluatorCallback(
     eval_tensors=[e_loss, e_outputs, e_labels],
     user_iter_callback=eval_iter_callback,
     user_epochs_done_callback=eval_epochs_done_callback,
     eval_step=30,
-    tb_writer=tb_writer)
+    tb_writer=tb_writer,
+)


 optimizer = neural_factory.get_trainer(
@@ -119,8 +120,11 @@
             "lr": learning_rate,
             "max_steps": max_steps,
             "weight_decay": weight_decay,
-            "momentum": momentum}})
-
-optimizer.train(tensors_to_optimize=[train_loss],
-                tensors_to_evaluate=[outputs, labels],
-                callbacks=[callback, callback_eval])
+            "momentum": momentum,
+        }
+    }
+)
+
+optimizer.train(
+    tensors_to_optimize=[train_loss], tensors_to_evaluate=[outputs, labels], callbacks=[callback, callback_eval],
+)
diff --git a/examples/nlp/asr_postprocessor.py b/examples/nlp/asr_postprocessor.py
index 192a5a65d892..89c5a889ea72 100644
--- a/examples/nlp/asr_postprocessor.py
+++ b/examples/nlp/asr_postprocessor.py
@@ -1,37 +1,35 @@
 # Copyright (c) 2019 NVIDIA Corporation
 import math
+import os

 import torch
-import os

 import nemo
-from nemo.core.callbacks import CheckpointCallback
-from nemo.utils.lr_policies import SquareAnnealing
-
 import nemo.collections.nlp as nemo_nlp
 from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer
-from nemo.collections.nlp.utils.callbacks.translation import \
-    eval_iter_callback, eval_epochs_done_callback_wer
+from nemo.collections.nlp.utils.callbacks.translation import eval_epochs_done_callback_wer, eval_iter_callback
+from nemo.core.callbacks import CheckpointCallback
+from nemo.utils.lr_policies import SquareAnnealing

 parser = nemo.utils.NemoArgParser(description='ASR postprocessor')
-parser.set_defaults(train_dataset="train",
-                    eval_datasets=["valid"],
-                    optimizer="novograd",
-                    amp_opt_level="O1",
-                    num_epochs=1000,
-                    batch_size=4096,
-                    eval_batch_size=1024,
-                    lr=0.001,
-                    weight_decay=0,
-                    max_steps=2000,
-                    iter_per_step=1,
-                    checkpoint_save_freq=10000,
-                    work_dir='outputs/asr_postprocessor',
-                    eval_freq=200)
-
-parser.add_argument("--pretrained_model",
-                    default="bert-base-uncased",
-                    type=str)
+parser.set_defaults(
+    train_dataset="train",
+    eval_datasets=["valid"],
+    optimizer="novograd",
+    amp_opt_level="O1",
+    num_epochs=1000,
+    batch_size=4096,
+    eval_batch_size=1024,
+    lr=0.001,
+    weight_decay=0,
+    max_steps=2000,
+    iter_per_step=1,
+    checkpoint_save_freq=10000,
+    work_dir='outputs/asr_postprocessor',
+    eval_freq=200,
+)
+
+parser.add_argument("--pretrained_model", default="bert-base-uncased", type=str)
 parser.add_argument("--warmup_steps", default=2000, type=int)
 parser.add_argument("--d_model", default=768, type=int)
 parser.add_argument("--d_inner", default=3072, type=int)
@@ -48,33 +46,33 @@
 parser.add_argument("--tgt_lang", default="real", type=str)
 parser.add_argument("--beam_size", default=4, type=int)
 parser.add_argument("--len_pen", default=0.0, type=float)
-parser.add_argument("--restore_from",
-                    dest="restore_from",
-                    type=str,
-                    default="../../scripts/bert-base-uncased_decoder.pt")
+parser.add_argument(
+    "--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt",
+)
 args = parser.parse_args()

-nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch,
-                                   local_rank=args.local_rank,
-                                   optimization_level=args.amp_opt_level,
-                                   log_dir=args.work_dir,
-                                   create_tb_writer=False,
-                                   files_to_copy=[__file__],
-                                   add_time_to_log_dir=False)
+nf = nemo.core.NeuralModuleFactory(
+    backend=nemo.core.Backend.PyTorch,
+    local_rank=args.local_rank,
+    optimization_level=args.amp_opt_level,
+    log_dir=args.work_dir,
+    create_tb_writer=False,
+    files_to_copy=[__file__],
+    add_time_to_log_dir=False,
+)

 tokenizer = NemoBertTokenizer(pretrained_model=args.pretrained_model)
 vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)
 tokens_to_add = vocab_size - tokenizer.vocab_size

 zeros_transform = nemo.backends.pytorch.common.ZerosLikeNM()
-encoder = nemo_nlp.huggingface.BERT(
-    pretrained_model_name=args.pretrained_model,
-    local_rank=args.local_rank)
+encoder = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_model, local_rank=args.local_rank)
 device = encoder.bert.embeddings.word_embeddings.weight.get_device()
 zeros = torch.zeros((tokens_to_add, args.d_model)).to(device=device)
 encoder.bert.embeddings.word_embeddings.weight.data = torch.cat(
-    (encoder.bert.embeddings.word_embeddings.weight.data, zeros))
+    (encoder.bert.embeddings.word_embeddings.weight.data, zeros)
+)

 decoder = nemo_nlp.TransformerDecoderNM(
     d_model=args.d_model,
@@ -88,17 +86,14 @@
     max_seq_length=args.max_seq_length,
     embedding_dropout=args.embedding_dropout,
     learn_positional_encodings=True,
-    hidden_act="gelu")
+    hidden_act="gelu",
+)
 decoder.restore_from(args.restore_from, local_rank=args.local_rank)

-t_log_softmax = nemo_nlp.TokenClassifier(args.d_model,
-                                         num_classes=vocab_size,
-                                         num_layers=1,
-                                         log_softmax=True)
+t_log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True)

-loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
-    pad_id=tokenizer.pad_id(), smoothing=0.1)
+loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), smoothing=0.1)

 beam_search = nemo_nlp.BeamSearchTranslatorNM(
     decoder=decoder,
@@ -108,54 +103,48 @@
     length_penalty=args.len_pen,
     bos_token=tokenizer.bos_id(),
     pad_token=tokenizer.pad_id(),
-    eos_token=tokenizer.eos_id())
+    eos_token=tokenizer.eos_id(),
+)

 # tie all embeddings weights
-t_log_softmax.mlp.layer0.weight = \
-    encoder.bert.embeddings.word_embeddings.weight
-decoder.embedding_layer.token_embedding.weight = \
-    encoder.bert.embeddings.word_embeddings.weight
-decoder.embedding_layer.position_embedding.weight = \
-    encoder.bert.embeddings.position_embeddings.weight
+t_log_softmax.mlp.layer0.weight = encoder.bert.embeddings.word_embeddings.weight
+decoder.embedding_layer.token_embedding.weight = encoder.bert.embeddings.word_embeddings.weight
+decoder.embedding_layer.position_embedding.weight = encoder.bert.embeddings.position_embeddings.weight


 def create_pipeline(dataset, tokens_in_batch, clean=False, training=True):
     dataset_src = os.path.join(args.data_dir, dataset + "." + args.src_lang)
     dataset_tgt = os.path.join(args.data_dir, dataset + "." + args.tgt_lang)
-    data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=tokenizer,
-                                               tokenizer_tgt=tokenizer,
-                                               dataset_src=dataset_src,
-                                               dataset_tgt=dataset_tgt,
-                                               tokens_in_batch=tokens_in_batch,
-                                               clean=clean)
+    data_layer = nemo_nlp.TranslationDataLayer(
+        tokenizer_src=tokenizer,
+        tokenizer_tgt=tokenizer,
+        dataset_src=dataset_src,
+        dataset_tgt=dataset_tgt,
+        tokens_in_batch=tokens_in_batch,
+        clean=clean,
+    )
     src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer()
     input_type_ids = zeros_transform(input_type_ids=src)
-    src_hiddens = encoder(input_ids=src,
-                          token_type_ids=input_type_ids,
-                          attention_mask=src_mask)
-    tgt_hiddens = decoder(input_ids_tgt=tgt,
-                          hidden_states_src=src_hiddens,
-                          input_mask_src=src_mask,
-                          input_mask_tgt=tgt_mask)
+    src_hiddens = encoder(input_ids=src, token_type_ids=input_type_ids, attention_mask=src_mask)
+    tgt_hiddens = decoder(
+        input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask,
+    )
     log_softmax = t_log_softmax(hidden_states=tgt_hiddens)
     loss = loss_fn(logits=log_softmax, target_ids=labels)
     beam_results = None
     if not training:
-        beam_results = beam_search(hidden_states_src=src_hiddens,
-                                   input_mask_src=src_mask)
+        beam_results = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask)
     return loss, [tgt, loss, beam_results, sent_ids]


 # training pipeline
-train_loss, _ = create_pipeline(args.train_dataset, args.batch_size,
-                                clean=False)
+train_loss, _ = create_pipeline(args.train_dataset, args.batch_size, clean=False)

 # evaluation pipelines
 all_eval_losses = {}
 all_eval_tensors = {}
 for eval_dataset in args.eval_datasets:
-    eval_loss, eval_tensors = create_pipeline(
-        eval_dataset, args.eval_batch_size, clean=False, training=False)
+    eval_loss, eval_tensors = create_pipeline(eval_dataset, args.eval_batch_size, clean=False, training=False)
     all_eval_losses[eval_dataset] = eval_loss
     all_eval_tensors[eval_dataset] = eval_tensors
@@ -171,7 +160,8 @@ def print_loss(x):
     step_freq=100,
     print_func=print_loss,
     get_tb_values=lambda x: [["loss", x[0]]],
-    tb_writer=nf.tb_writer)
+    tb_writer=nf.tb_writer,
+)

 callbacks = [callback_train]
@@ -181,24 +171,22 @@ def print_loss(x):
         user_iter_callback=lambda x, y: eval_iter_callback(x, y, tokenizer),
         user_epochs_done_callback=eval_epochs_done_callback_wer,
         eval_step=args.eval_freq,
-        tb_writer=nf.tb_writer)
+        tb_writer=nf.tb_writer,
+    )
     callbacks.append(callback)

-checkpointer_callback = CheckpointCallback(folder=args.work_dir,
-                                           step_freq=args.checkpoint_save_freq)
+checkpointer_callback = CheckpointCallback(folder=args.work_dir, step_freq=args.checkpoint_save_freq)
 callbacks.append(checkpointer_callback)

 # define learning rate decay policy
-lr_policy = SquareAnnealing(total_steps=args.max_steps,
-                            min_lr=1e-5,
-                            warmup_steps=args.warmup_steps)
+lr_policy = SquareAnnealing(total_steps=args.max_steps, min_lr=1e-5, warmup_steps=args.warmup_steps)

 # Create trainer and execute training action
-nf.train(tensors_to_optimize=[train_loss],
-         callbacks=callbacks,
-         optimizer=args.optimizer,
-         lr_policy=lr_policy,
-         optimization_params={"num_epochs": 300,
-                              "lr": args.lr,
-                              "weight_decay": args.weight_decay},
-         batches_per_step=args.iter_per_step)
+nf.train(
+    tensors_to_optimize=[train_loss],
+    callbacks=callbacks,
+    optimizer=args.optimizer,
+    lr_policy=lr_policy,
+    optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay,},
+    batches_per_step=args.iter_per_step,
+)
diff --git a/examples/nlp/bert_pretraining.py b/examples/nlp/bert_pretraining.py
index f8fcee155bfa..37330d18f03c 100644
--- a/examples/nlp/bert_pretraining.py
+++ b/examples/nlp/bert_pretraining.py
@@ -59,18 +59,18 @@
 should finish under 5 days and yield an MRPC score of ACC/F1 85.05/89.35.
 """
 import argparse
-import os
 import math
+import os
+
 import torch
+
 import nemo
-from nemo.utils.lr_policies import get_lr_policy
-from pytorch_transformers import BertConfig
 import nemo.collections.nlp as nemo_nlp
 from nemo.collections.nlp.data.datasets.utils import BERTPretrainingDataDesc
 from nemo.collections.nlp.transformer.utils import gelu
-from nemo.collections.nlp.utils.callbacks.bert_pretraining import \
-    eval_iter_callback, eval_epochs_done_callback
-
+from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_epochs_done_callback, eval_iter_callback
+from nemo.utils.lr_policies import get_lr_policy
+from pytorch_transformers import BertConfig

 parser = argparse.ArgumentParser(description='BERT pretraining')
 parser.add_argument("--local_rank", default=None, type=int)
@@ -84,15 +84,11 @@
 parser.add_argument("--optimizer", default="novograd", type=str)
 parser.add_argument("--beta1", default=0.95, type=float)
 parser.add_argument("--beta2", default=0.25, type=float)
-parser.add_argument("--amp_opt_level",
-                    default="O0",
-                    type=str,
-                    choices=["O0", "O1", "O2"])
+parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"])
 parser.add_argument("--weight_decay", default=0.0, type=float)
-parser.add_argument("--tokenizer",
-                    default="sentence-piece",
-                    type=str,
-                    choices=["sentence-piece", "nemo-bert"])
+parser.add_argument(
+    "--tokenizer", default="sentence-piece", type=str, choices=["sentence-piece", "nemo-bert"],
+)
 parser.add_argument("--max_seq_length", default=128, type=int)
 parser.add_argument("--sample_size", default=1e7, type=int)
 parser.add_argument("--mask_probability", default=0.15, type=float)
@@ -103,39 +99,52 @@
 parser.add_argument("--num_hidden_layers", default=12, type=int)
 parser.add_argument("--num_attention_heads", default=12, type=int)
 parser.add_argument("--hidden_act", default="gelu", type=str)
-parser.add_argument("--max_predictions_per_seq", default=20, type=int,
-                    help="maximum number of masked tokens to predict,\
-                    needed when --preprocessed_data is specified")
+parser.add_argument(
+    "--max_predictions_per_seq",
+    default=20,
+    type=int,
+    help="maximum number of masked tokens to predict, needed when --preprocessed_data is specified",
+)
 parser.add_argument("--data_dir", default="data/lm/wikitext-2", type=str)
-parser.add_argument("--preprocessed_data", action="store_true",
-                    default=False, help="specify if using preprocessed data")
-parser.add_argument("--gradient_predivide", action="store_true",
-                    default=False, help="use gradient predivide")
-parser.add_argument("--only_mlm_loss", action="store_true",
-                    default=False, help="use only masked language model loss")
-parser.add_argument("--max_steps", default=-1,
-                    type=int, help="if specified overrides --num_epochs.\
-                    Used for preprocessed data")
+parser.add_argument(
+    "--preprocessed_data", action="store_true", default=False, help="specify if using preprocessed data",
+)
+parser.add_argument(
+    "--gradient_predivide", action="store_true", default=False, help="use gradient predivide",
+)
+parser.add_argument(
+    "--only_mlm_loss", action="store_true", default=False, help="use only masked language model loss",
+)
+parser.add_argument(
+    "--max_steps",
+    default=-1,
+    type=int,
+    help="if specified overrides --num_epochs. Used for preprocessed data",
--num_epochs.\ + Used for preprocessed data", +) parser.add_argument("--dataset_name", default="wikitext-2", type=str) parser.add_argument("--load_dir", default=None, type=str) -parser.add_argument("--bert_checkpoint", default=None, type=str, - help="specify path to pretrained BERT weights") +parser.add_argument( + "--bert_checkpoint", default=None, type=str, help="specify path to pretrained BERT weights", +) parser.add_argument("--work_dir", default="outputs/bert_lm", type=str) parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=100, type=int) parser.add_argument("--print_step_freq", default=25, type=int) -parser.add_argument("--config_file", default=None, type=str, - help="The BERT model config") +parser.add_argument("--config_file", default=None, type=str, help="The BERT model config") args = parser.parse_args() -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=args.work_dir, - create_tb_writer=True, - files_to_copy=[__file__], - add_time_to_log_dir=True) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=args.work_dir, + create_tb_writer=True, + files_to_copy=[__file__], + add_time_to_log_dir=True, +) if args.config_file is not None: config = BertConfig.from_json_file(args.config_file).to_dict() @@ -149,16 +158,12 @@ if not args.preprocessed_data: special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] - data_desc = BERTPretrainingDataDesc(args.dataset_name, - args.data_dir, - args.vocab_size, - args.sample_size, - special_tokens, - 'train.txt') + data_desc = BERTPretrainingDataDesc( + args.dataset_name, args.data_dir, args.vocab_size, args.sample_size, special_tokens, 'train.txt', + ) if args.tokenizer == "sentence-piece": nemo.logging.info("To use SentencePieceTokenizer.") - tokenizer = nemo_nlp.SentencePieceTokenizer( - model_path=data_desc.tokenizer_model) + tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=data_desc.tokenizer_model) tokenizer.add_special_tokens(special_tokens) elif args.tokenizer == "nemo-bert": nemo.logging.info("To use NemoBertTokenizer.") @@ -166,8 +171,7 @@ # To train on a Chinese dataset, use NemoBertTokenizer tokenizer = nemo_nlp.NemoBertTokenizer(vocab_file=vocab_file) else: - raise ValueError("Please add your tokenizer " - "or use sentence-piece or nemo-bert.") + raise ValueError("Please add your tokenizer " "or use sentence-piece or nemo-bert.") args.vocab_size = tokenizer.vocab_size print(vars(args)) @@ -178,8 +182,8 @@ num_attention_heads=args.num_attention_heads, intermediate_size=args.intermediate_size, max_position_embeddings=args.max_seq_length, - hidden_act=args.hidden_act - ) + hidden_act=args.hidden_act, +) if args.bert_checkpoint is not None: bert_model.restore_from(args.bert_checkpoint) @@ -189,69 +193,51 @@ """ mlm_classifier = nemo_nlp.BertTokenClassifier( - args.hidden_size, - num_classes=args.vocab_size, - activation=args.hidden_act, - log_softmax=True) + args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True, +) mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM() if not args.only_mlm_loss: nsp_classifier = nemo_nlp.SequenceClassifier( - args.hidden_size, - num_classes=2, - num_layers=2, - activation='tanh', - log_softmax=False) + args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False, + ) nsp_loss_fn 
= nemo.backends.pytorch.common.CrossEntropyLoss() bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) # tie weights of MLM softmax layer and embedding layer of the encoder -if (mlm_classifier.mlp.last_linear_layer.weight.shape != - bert_model.bert.embeddings.word_embeddings.weight.shape): - raise ValueError("Final classification layer does not match embedding " - "layer.") -mlm_classifier.mlp.last_linear_layer.weight = \ - bert_model.bert.embeddings.word_embeddings.weight +if mlm_classifier.mlp.last_linear_layer.weight.shape != bert_model.bert.embeddings.word_embeddings.weight.shape: + raise ValueError("Final classification layer does not match embedding " "layer.") +mlm_classifier.mlp.last_linear_layer.weight = bert_model.bert.embeddings.word_embeddings.weight -def create_pipeline(data_file, - batch_size, - preprocessed_data=False, - batches_per_step=1, - **kwargs): +def create_pipeline( + data_file, batch_size, preprocessed_data=False, batches_per_step=1, **kwargs, +): if not preprocessed_data: - max_seq_length, mask_probability, short_seq_prob =\ - kwargs['max_seq_length'], kwargs['mask_probability'],\ - kwargs['short_seq_prob'] + max_seq_length, mask_probability, short_seq_prob = ( + kwargs['max_seq_length'], + kwargs['mask_probability'], + kwargs['short_seq_prob'], + ) data_layer = nemo_nlp.BertPretrainingDataLayer( - tokenizer, - data_file, - max_seq_length, - mask_probability, - short_seq_prob, - batch_size=batch_size) + tokenizer, data_file, max_seq_length, mask_probability, short_seq_prob, batch_size=batch_size, + ) else: - training, max_predictions_per_seq =\ - kwargs['training'], kwargs['max_predictions_per_seq'] + training, max_predictions_per_seq = ( + kwargs['training'], + kwargs['max_predictions_per_seq'], + ) data_layer = nemo_nlp.BertPretrainingPreprocessedDataLayer( - data_file, - max_predictions_per_seq, - batch_size=batch_size, training=training) - - steps_per_epoch = \ - math.ceil(len(data_layer) / ( - batch_size * args.num_gpus * batches_per_step)) - - input_ids, input_type_ids, input_mask, \ - output_ids, output_mask, nsp_labels = data_layer() - hidden_states = bert_model(input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask) + data_file, max_predictions_per_seq, batch_size=batch_size, training=training, + ) + + steps_per_epoch = math.ceil(len(data_layer) / (batch_size * args.num_gpus * batches_per_step)) + + (input_ids, input_type_ids, input_mask, output_ids, output_mask, nsp_labels,) = data_layer() + hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) mlm_logits = mlm_classifier(hidden_states=hidden_states) - mlm_loss = mlm_loss_fn(logits=mlm_logits, - output_ids=output_ids, - output_mask=output_mask) + mlm_loss = mlm_loss_fn(logits=mlm_logits, output_ids=output_ids, output_mask=output_mask) if not args.only_mlm_loss: nsp_logits = nsp_classifier(hidden_states=hidden_states) nsp_loss = nsp_loss_fn(logits=nsp_logits, labels=nsp_labels) @@ -264,22 +250,24 @@ def create_pipeline(data_file, if not args.preprocessed_data: train_loss, mlm_loss, nsp_loss, steps_per_epoch = create_pipeline( - data_file=data_desc.train_file, - preprocessed_data=False, - max_seq_length=args.max_seq_length, - mask_probability=args.mask_probability, - short_seq_prob=args.short_seq_prob, - batch_size=args.batch_size, - batches_per_step=args.batches_per_step) + data_file=data_desc.train_file, + preprocessed_data=False, + max_seq_length=args.max_seq_length, + mask_probability=args.mask_probability, + 
short_seq_prob=args.short_seq_prob, + batch_size=args.batch_size, + batches_per_step=args.batches_per_step, + ) else: max_pred_len = args.max_predictions_per_seq train_loss, mlm_loss, nsp_loss, steps_per_epoch = create_pipeline( - data_file=args.data_dir, - preprocessed_data=True, - max_predictions_per_seq=max_pred_len, - training=True, - batch_size=args.batch_size, - batches_per_step=args.batches_per_step) + data_file=args.data_dir, + preprocessed_data=True, + max_predictions_per_seq=max_pred_len, + training=True, + batch_size=args.batch_size, + batches_per_step=args.batches_per_step, + ) print("steps per epoch", steps_per_epoch) # callback which prints training loss and perplexity once in a while @@ -292,28 +280,28 @@ def create_pipeline(data_file, train_callback = nemo.core.SimpleLossLoggerCallback( tensors=log_tensors, step_freq=args.print_step_freq, - print_func=lambda x: nemo.logging.info( - print_msg.format( - *[y.item() for y in x])), + print_func=lambda x: nemo.logging.info(print_msg.format(*[y.item() for y in x])), get_tb_values=lambda x: [["loss", x[0]]], - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) -ckpt_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - load_from_folder=args.load_dir, - step_freq=args.save_step_freq) +ckpt_callback = nemo.core.CheckpointCallback( + folder=nf.checkpoint_dir, + epoch_freq=args.save_epoch_freq, + load_from_folder=args.load_dir, + step_freq=args.save_step_freq, +) # define learning rate decay policy if args.lr_policy is not None: if args.max_steps < 0: lr_policy_fn = get_lr_policy( - args.lr_policy, - total_steps=args.num_epochs * steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + ) else: - lr_policy_fn = get_lr_policy(args.lr_policy, - total_steps=args.max_steps, - warmup_ratio=args.lr_warmup_proportion) + lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=args.max_steps, warmup_ratio=args.lr_warmup_proportion, + ) else: lr_policy_fn = None @@ -322,20 +310,24 @@ def create_pipeline(data_file, bert_model.config.to_json_file(config_path) # define and launch training algorithm (optimizer) -optimization_params = {"batch_size": args.batch_size, - "lr": args.lr, - "betas": (args.beta1, args.beta2), - "weight_decay": args.weight_decay} +optimization_params = { + "batch_size": args.batch_size, + "lr": args.lr, + "betas": (args.beta1, args.beta2), + "weight_decay": args.weight_decay, +} if args.max_steps < 0: optimization_params['num_epochs'] = args.num_epochs else: optimization_params['max_steps'] = args.max_steps -nf.train(tensors_to_optimize=[train_loss], - lr_policy=lr_policy_fn, - callbacks=[train_callback, ckpt_callback], - optimizer=args.optimizer, - batches_per_step=args.batches_per_step, - gradient_predivide=args.gradient_predivide, - optimization_params=optimization_params) +nf.train( + tensors_to_optimize=[train_loss], + lr_policy=lr_policy_fn, + callbacks=[train_callback, ckpt_callback], + optimizer=args.optimizer, + batches_per_step=args.batches_per_step, + gradient_predivide=args.gradient_predivide, + optimization_params=optimization_params, +) diff --git a/examples/nlp/glue_with_BERT.py b/examples/nlp/glue_with_BERT.py index 12cd720deec8..2513acde300e 100644 --- a/examples/nlp/glue_with_BERT.py +++ b/examples/nlp/glue_with_BERT.py @@ -60,97 +60,126 @@ """ -import os -import sys - import argparse import json +import os +import sys import nemo -from 
nemo.backends.pytorch.common import CrossEntropyLoss -from nemo.backends.pytorch.common import MSELoss - -from nemo.utils.lr_policies import get_lr_policy - import nemo.collections.nlp as nemo_nlp - -from nemo.collections.nlp import NemoBertTokenizer -from nemo.collections.nlp import SentencePieceTokenizer - -from nemo.collections.nlp.utils.callbacks.glue import eval_iter_callback -from nemo.collections.nlp.utils.callbacks.glue import eval_epochs_done_callback - -from nemo.collections.nlp.data.datasets.utils import processors -from nemo.collections.nlp.data.datasets.utils import output_modes +from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss +from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.data.datasets.utils import output_modes, processors +from nemo.collections.nlp.utils.callbacks.glue import eval_epochs_done_callback, eval_iter_callback +from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description="GLUE_with_pretrained_BERT") # Parsing arguments -parser.add_argument("--data_dir", default='COLA', type=str, required=True, - help="The input data dir. Should contain the .tsv \ - files (or other data files) for the task.") -parser.add_argument("--task_name", default="CoLA", type=str, required=True, - choices=['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', - 'qnli', 'rte', 'wnli'], - help="GLUE task name, MNLI includes both matched and \ - mismatched tasks") -parser.add_argument("--dataset_type", default="GLUEDataset", type=str, - help='Type of dataset to create datalayers') -parser.add_argument("--pretrained_bert_model", default="bert-base-cased", - type=str, help="Name of the pre-trained model") -parser.add_argument("--bert_checkpoint", default=None, type=str, - help="Path to model checkpoint") -parser.add_argument("--bert_config", default=None, type=str, - help="Path to bert config file in json format") -parser.add_argument("--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece") -parser.add_argument("--tokenizer", default="nemobert", type=str, - choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - only relevant when using custom pretrained checkpoint.") -parser.add_argument("--max_seq_length", default=128, type=int, - choices=range(1, 513), - help="The maximum total input sequence length after \ +parser.add_argument( + "--data_dir", + default='COLA', + type=str, + required=True, + help="The input data dir. 
Should contain the .tsv \ + files (or other data files) for the task.", +) +parser.add_argument( + "--task_name", + default="CoLA", + type=str, + required=True, + choices=['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli',], + help="GLUE task name, MNLI includes both matched and \ + mismatched tasks", +) +parser.add_argument( + "--dataset_type", default="GLUEDataset", type=str, help='Type of dataset to create datalayers', +) +parser.add_argument( + "--pretrained_bert_model", default="bert-base-cased", type=str, help="Name of the pre-trained model", +) +parser.add_argument( + "--bert_checkpoint", default=None, type=str, help="Path to model checkpoint", +) +parser.add_argument( + "--bert_config", default=None, type=str, help="Path to bert config file in json format", +) +parser.add_argument( + "--tokenizer_model", + default="tokenizer.model", + type=str, + help="Path to pretrained tokenizer model, \ + only used if --tokenizer is sentencepiece", +) +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, \ + only relevant when using custom pretrained checkpoint.", +) +parser.add_argument( + "--max_seq_length", + default=128, + type=int, + choices=range(1, 513), + help="The maximum total input sequence length after \ tokenization. Sequences longer than this will be \ - truncated, sequences shorter will be padded.") -parser.add_argument("--optimizer_kind", default="adam", type=str, - help="Optimizer kind") + truncated, sequences shorter will be padded.", +) +parser.add_argument("--optimizer_kind", default="adam", type=str, help="Optimizer kind") parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) -parser.add_argument("--lr", default=5e-5, type=float, - help="The initial learning rate.") +parser.add_argument("--lr", default=5e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.1, type=float) -parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") -parser.add_argument("--num_epochs", default=3, type=int, - help="Total number of training epochs to perform.") -parser.add_argument("--batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training/evaluation.") -parser.add_argument("--num_gpus", default=1, type=int, - help="Number of GPUs") -parser.add_argument("--amp_opt_level", default="O0", type=str, - choices=["O0", "O1", "O2"], - help="01/02 to enable mixed precision") -parser.add_argument("--local_rank", type=int, default=None, - help="For distributed training: local_rank") -parser.add_argument("--work_dir", default='output_glue', type=str, - help="The output directory where the model predictions \ - and checkpoints will be written.") -parser.add_argument("--save_epoch_freq", default=1, type=int, - help="Frequency of saving checkpoint \ - '-1' - epoch checkpoint won't be saved") -parser.add_argument("--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint \ - '-1' - step checkpoint won't be saved") -parser.add_argument("--loss_step_freq", default=25, type=int, - help="Frequency of printing loss") +parser.add_argument( + "--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.", +) +parser.add_argument( + "--num_epochs", default=3, type=int, help="Total number of training epochs to perform.", +) +parser.add_argument( + "--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.", +)
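Everything in this argument-parser hunk is a mechanical reflow: defaults, types, and help strings carry over unchanged apart from small typo fixes. A minimal sketch of reproducing the same reflow programmatically, assuming the formatter is black with a 119-character line length (both are assumptions; the exact invocation is not recorded in this diff):

import black

snippet = (
    "parser.add_argument('--weight_decay', default=0.0, type=float,\n"
    "                    help='Weight decay if we apply some.')\n"
)

# black joins a call onto one line when it fits within line_length;
# otherwise it explodes it to one argument per line with a trailing comma.
print(black.format_str(snippet, mode=black.Mode(line_length=119)))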
+parser.add_argument("--num_gpus", default=1, type=int, help="Number of GPUs") +parser.add_argument( + "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="O1/O2 to enable mixed precision", +) +parser.add_argument( + "--local_rank", type=int, default=None, help="For distributed training: local_rank", +) +parser.add_argument( + "--work_dir", + default='output_glue', + type=str, + help="The output directory where the model predictions \ + and checkpoints will be written.", +) +parser.add_argument( + "--save_epoch_freq", + default=1, + type=int, + help="Frequency of saving checkpoint \ + '-1' - epoch checkpoint won't be saved", +) +parser.add_argument( + "--save_step_freq", + default=-1, + type=int, + help="Frequency of saving checkpoint \ + '-1' - step checkpoint won't be saved", +) +parser.add_argument("--loss_step_freq", default=25, type=int, help="Frequency of printing loss") args = parser.parse_args() if not os.path.exists(args.data_dir): - raise FileNotFoundError("GLUE datasets not found. Datasets can be " - "obtained at https://gist.github.com/W4ngatang/ \ - 60c2bdb54d156a41194446737ce03e2e") + raise FileNotFoundError( + "GLUE datasets not found. Datasets can be " + "obtained at https://gist.github.com/W4ngatang/ \ + 60c2bdb54d156a41194446737ce03e2e" + ) args.work_dir = f'{args.work_dir}/{args.task_name.upper()}' @@ -170,13 +199,15 @@ output_mode = output_modes[args.task_name] # Instantiate neural factory with supported backend -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=args.work_dir, - create_tb_writer=True, - files_to_copy=[__file__], - add_time_to_log_dir=True) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=args.work_dir, + create_tb_writer=True, + files_to_copy=[__file__], + add_time_to_log_dir=True, +) if args.bert_checkpoint is None: """ Use this if you're using a standard BERT model. @@ -184,8 +215,7 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. Replace BERT-STEP-150000.pt with the path to your checkpoint.
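The docstring above introduces the checkpoint branch that the next hunk reformats. Condensed into one helper, the selection logic looks roughly like this (a sketch reusing only names from this file; the script itself keeps the two branches inline rather than in a function):

import json

import nemo.collections.nlp as nemo_nlp


def build_bert(args):
    """Sketch: stock pretrained BERT, or a self-pretrained checkpoint."""
    if args.bert_checkpoint is None:
        # stock weights, resolved by name (see BERT.list_pretrained_models())
        return nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)
    if args.bert_config is not None:
        # custom architecture described by a JSON config file
        with open(args.bert_config) as json_file:
            config = json.load(json_file)
        model = nemo_nlp.huggingface.BERT(**config)
    else:
        model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)
    # restore your own pretraining run, e.g. BERT-STEP-150000.pt
    model.restore_from(args.bert_checkpoint)
    return model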
@@ -202,8 +232,7 @@ config = json.load(json_file) model = nemo_nlp.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) @@ -214,18 +243,18 @@ pooler = nemo_nlp.SequenceRegression(hidden_size=hidden_size) glue_loss = MSELoss() else: - pooler = nemo_nlp.SequenceClassifier(hidden_size=hidden_size, - num_classes=num_labels, - log_softmax=False) + pooler = nemo_nlp.SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) glue_loss = CrossEntropyLoss() -def create_pipeline(max_seq_length=args.max_seq_length, - batch_size=args.batch_size, - local_rank=args.local_rank, - num_gpus=args.num_gpus, - evaluate=False, - processor=task_processors[0]): +def create_pipeline( + max_seq_length=args.max_seq_length, + batch_size=args.batch_size, + local_rank=args.local_rank, + num_gpus=args.num_gpus, + evaluate=False, + processor=task_processors[0], +): data_layer = 'GlueDataLayerClassification' if output_mode == 'regression': @@ -234,22 +263,21 @@ def create_pipeline(max_seq_length=args.max_seq_length, data_layer = getattr(sys.modules[__name__], data_layer) data_layer = data_layer( - dataset_type=args.dataset_type, - processor=processor, - evaluate=evaluate, - batch_size=batch_size, - num_workers=0, - local_rank=local_rank, - tokenizer=tokenizer, - data_dir=args.data_dir, - max_seq_length=max_seq_length, - token_params=token_params) + dataset_type=args.dataset_type, + processor=processor, + evaluate=evaluate, + batch_size=batch_size, + num_workers=0, + local_rank=local_rank, + tokenizer=tokenizer, + data_dir=args.data_dir, + max_seq_length=max_seq_length, + token_params=token_params, + ) input_ids, input_type_ids, input_mask, labels = data_layer() - hidden_states = model(input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) """ For STS-B (regression task), the pooler_output represents a single @@ -267,37 +295,41 @@ def create_pipeline(max_seq_length=args.max_seq_length, return loss, steps_per_epoch, data_layer, [pooler_output, labels] -token_params = {'bos_token': None, - 'eos_token': '[SEP]', - 'pad_token': '[PAD]', - 'cls_token': '[CLS]'} +token_params = { + 'bos_token': None, + 'eos_token': '[SEP]', + 'pad_token': '[PAD]', + 'cls_token': '[CLS]', +} train_loss, steps_per_epoch, _, _ = create_pipeline() _, _, eval_data_layer, eval_tensors = create_pipeline(evaluate=True) -callbacks_eval = [nemo.core.EvaluatorCallback( - eval_tensors=eval_tensors, - user_iter_callback=lambda x, y: eval_iter_callback(x, y), - user_epochs_done_callback=lambda x: - eval_epochs_done_callback(x, args.work_dir, eval_task_names[0]), - tb_writer=nf.tb_writer, - eval_step=steps_per_epoch)] +callbacks_eval = [ + nemo.core.EvaluatorCallback( + eval_tensors=eval_tensors, + user_iter_callback=lambda x, y: eval_iter_callback(x, y), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[0]), + tb_writer=nf.tb_writer, + eval_step=steps_per_epoch, + ) +] """ MNLI task has two dev sets: matched and mismatched Create additional callback and data layer for MNLI mismatched dev set """ if args.task_name == 'mnli': - _, _, eval_data_layer_mm, eval_tensors_mm = create_pipeline( - evaluate=True, - processor=task_processors[1]) -
callbacks_eval.append(nemo.core.EvaluatorCallback( - eval_tensors=eval_tensors_mm, - user_iter_callback=lambda x, y: eval_iter_callback(x, y), - user_epochs_done_callback=lambda x: - eval_epochs_done_callback(x, args.work_dir, eval_task_names[1]), - tb_writer=nf.tb_writer, - eval_step=steps_per_epoch)) + _, _, eval_data_layer_mm, eval_tensors_mm = create_pipeline(evaluate=True, processor=task_processors[1]) + callbacks_eval.append( + nemo.core.EvaluatorCallback( + eval_tensors=eval_tensors_mm, + user_iter_callback=lambda x, y: eval_iter_callback(x, y), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[1]), + tb_writer=nf.tb_writer, + eval_step=steps_per_epoch, + ) + ) nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") callback_train = nemo.core.SimpleLossLoggerCallback( @@ -305,20 +337,21 @@ def create_pipeline(max_seq_length=args.max_seq_length, print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], step_freq=args.loss_step_freq, - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq) - -lr_policy_fn = get_lr_policy(args.lr_policy, - total_steps=args.num_epochs * steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) - -nf.train(tensors_to_optimize=[train_loss], - callbacks=[callback_train, ckpt_callback] + callbacks_eval, - lr_policy=lr_policy_fn, - optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, - "lr": args.lr}) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +) + +lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, +) + +nf.train( + tensors_to_optimize=[train_loss], + callbacks=[callback_train, ckpt_callback] + callbacks_eval, + lr_policy=lr_policy_fn, + optimizer=args.optimizer_kind, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr}, +) diff --git a/examples/nlp/joint_intent_slot_infer.py b/examples/nlp/joint_intent_slot_infer.py index fe32cfd92fb5..1df4a66de300 100644 --- a/examples/nlp/joint_intent_slot_infer.py +++ b/examples/nlp/joint_intent_slot_infer.py @@ -2,31 +2,24 @@ import os import numpy as np +from sklearn.metrics import classification_report, confusion_matrix from transformers import BertTokenizer -from sklearn.metrics import confusion_matrix, classification_report import nemo import nemo.collections.nlp as nemo_nlp from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc - # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=128, type=int) parser.add_argument("--max_seq_length", default=50, type=int) -parser.add_argument("--pretrained_bert_model", - default="bert-base-uncased", - type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--dataset_name", default='snips-all', type=str) parser.add_argument("--data_dir", default='data/nlu/snips', type=str) -parser.add_argument("--work_dir", - required=True, - help="your checkpoint folder", - type=str) +parser.add_argument("--work_dir", required=True, help="your checkpoint folder", type=str) parser.add_argument("--eval_file_prefix", default='test', type=str) 
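Before moving on: the GLUE training setup reformatted above hands nf.train a WarmupAnnealing policy built from total_steps = num_epochs * steps_per_epoch and warmup_ratio = lr_warmup_proportion. A rough, self-contained sketch of that schedule's shape (linear warmup followed by linear annealing is an assumption about NeMo's implementation, not taken from this diff):

def warmup_annealing_lr(step, total_steps, warmup_ratio, base_lr):
    # linear ramp from 0 up to base_lr over the warmup span
    warmup_steps = int(warmup_ratio * total_steps)
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)
    # then linear decay from base_lr down to 0 at total_steps
    return base_lr * max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

# with total_steps=1000, warmup_ratio=0.1, base_lr=5e-5:
#   step 50 -> 2.5e-05, step 100 -> 5e-05, step 550 -> 2.5e-05, step 1000 -> 0.0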
-parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--do_lower_case", action='store_false') args = parser.parse_args() @@ -34,24 +27,20 @@ if not os.path.exists(args.data_dir): raise ValueError(f'Data not found at {args.data_dir}') -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=None) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=None, +) """ Load the pretrained BERT parameters See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) -data_desc = JointIntentSlotDataDesc(args.data_dir, - args.do_lower_case, - args.dataset_name) +data_desc = JointIntentSlotDataDesc(args.data_dir, args.do_lower_case, args.dataset_name) # Evaluation pipeline nemo.logging.info("Loading eval data...") @@ -64,20 +53,16 @@ shuffle=False, batch_size=args.batch_size, num_workers=0, - local_rank=args.local_rank) + local_rank=args.local_rank, +) classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, - num_intents=data_desc.num_intents, - num_slots=data_desc.num_slots) + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, +) -ids, type_ids, \ - input_mask, loss_mask, subtokens_mask, \ - intents, slots = data_layer() +(ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots,) = data_layer() -hidden_states = pretrained_bert_model(input_ids=ids, - token_type_ids=type_ids, - attention_mask=input_mask) +hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) ########################################################################### @@ -85,10 +70,7 @@ # Instantiate an optimizer to perform `infer` action evaluated_tensors = nf.infer( - tensors=[intent_logits, slot_logits, - loss_mask, subtokens_mask, - intents, slots], - checkpoint_dir=args.work_dir, + tensors=[intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slots,], checkpoint_dir=args.work_dir, ) @@ -100,8 +82,9 @@ def get_preds(logits): return np.argmax(logits, 1) -intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slot_labels =\ - [concatenate(tensors) for tensors in evaluated_tensors] +intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slot_labels = [ + concatenate(tensors) for tensors in evaluated_tensors +] pred_intents = np.argmax(intent_logits, 1) @@ -124,7 +107,6 @@ def get_preds(logits): nemo.logging.info('Slot prediction results') slot_labels_list = np.asarray(slot_labels_list) slot_preds_list = np.asarray(slot_preds_list) -slot_accuracy = sum(slot_labels_list == slot_preds_list) / \ - len(slot_labels_list) +slot_accuracy = sum(slot_labels_list == slot_preds_list) / len(slot_labels_list) nemo.logging.info(f'Slot accuracy: {slot_accuracy}') nemo.logging.info(classification_report(slot_labels_list, 
slot_preds_list)) diff --git a/examples/nlp/joint_intent_slot_infer_b1.py b/examples/nlp/joint_intent_slot_infer_b1.py index 1a9cd3c0b990..8d6f0bedfd84 100644 --- a/examples/nlp/joint_intent_slot_infer_b1.py +++ b/examples/nlp/joint_intent_slot_infer_b1.py @@ -8,91 +8,67 @@ from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc from nemo.collections.nlp.utils.nlp_utils import read_intent_slot_outputs - # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') parser.add_argument("--max_seq_length", default=50, type=int) parser.add_argument("--fc_dropout", default=0.1, type=float) -parser.add_argument("--pretrained_bert_model", - default="bert-base-uncased", - type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--dataset_name", default='snips-all', type=str) -parser.add_argument("--data_dir", - default='data/nlu/snips', - type=str) +parser.add_argument("--data_dir", default='data/nlu/snips', type=str) parser.add_argument("--query", default='please turn on the light', type=str) -parser.add_argument("--work_dir", - required=True, - help="your checkpoint folder", - type=str) -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--work_dir", required=True, help="your checkpoint folder", type=str) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--do_lower_case", action='store_false') args = parser.parse_args() -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - optimization_level=args.amp_opt_level, - log_dir=None) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, +) """ Load the pretrained BERT parameters See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model, factory=nf) +pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model, factory=nf) tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] -data_desc = JointIntentSlotDataDesc(args.data_dir, - args.do_lower_case, - args.dataset_name) +data_desc = JointIntentSlotDataDesc(args.data_dir, args.do_lower_case, args.dataset_name) query = args.query if args.do_lower_case: query = query.lower() data_layer = nemo_nlp.BertJointIntentSlotInferDataLayer( - queries=[query], - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - batch_size=1) + queries=[query], tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +) # Create sentence classification loss on top classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, - num_intents=data_desc.num_intents, - num_slots=data_desc.num_slots, - dropout=args.fc_dropout) + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout, +) ids, type_ids, input_mask, loss_mask, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=ids, - token_type_ids=type_ids, - attention_mask=input_mask) +hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) 
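At this point the inference graph is fully wired: BERT hidden states feed the joint classifier, which emits one intent distribution per query and one slot distribution per subtoken. The post-processing both inference scripts apply afterwards boils down to the following NumPy sketch (the tensor shapes are assumptions chosen for the example):

import numpy as np


def decode(intent_logits, slot_logits, subtokens_mask):
    pred_intents = np.argmax(intent_logits, axis=1)   # (batch,)
    pred_slots = np.argmax(slot_logits, axis=2)       # (batch, seq_len)
    # keep slot predictions only at word-initial subtokens
    slots = [p[m > 0.5].tolist() for p, m in zip(pred_slots, subtokens_mask)]
    return pred_intents, slots


intents, slots = decode(
    np.array([[0.1, 2.0]]),      # 1 query, 2 intent classes
    np.random.randn(1, 4, 3),    # 4 subtokens, 3 slot labels
    np.array([[1, 0, 1, 1]]),    # 3 word-initial subtokens
)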
########################################################################### -evaluated_tensors = nf.infer( - tensors=[intent_logits, slot_logits, subtokens_mask], - checkpoint_dir=args.work_dir) +evaluated_tensors = nf.infer(tensors=[intent_logits, slot_logits, subtokens_mask], checkpoint_dir=args.work_dir,) def concatenate(lists): return np.concatenate([t.cpu() for t in lists]) -intent_logits, slot_logits, subtokens_mask = \ - [concatenate(tensors) for tensors in evaluated_tensors] +intent_logits, slot_logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors] -read_intent_slot_outputs([query], - data_desc.intent_dict_file, - data_desc.slot_dict_file, - intent_logits, - slot_logits, - subtokens_mask) +read_intent_slot_outputs( + [query], data_desc.intent_dict_file, data_desc.slot_dict_file, intent_logits, slot_logits, subtokens_mask, +) diff --git a/examples/nlp/joint_intent_slot_with_bert.py b/examples/nlp/joint_intent_slot_with_bert.py index 8c404755f07a..665f1701b62c 100644 --- a/examples/nlp/joint_intent_slot_with_bert.py +++ b/examples/nlp/joint_intent_slot_with_bert.py @@ -6,17 +6,13 @@ from transformers import BertTokenizer import nemo -from nemo.utils.lr_policies import get_lr_policy - import nemo.collections.nlp as nemo_nlp from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.callbacks.joint_intent_slot import \ - eval_iter_callback, eval_epochs_done_callback - +from nemo.collections.nlp.utils.callbacks.joint_intent_slot import eval_epochs_done_callback, eval_iter_callback +from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser( - description='Joint intent slot filling system with pretrained BERT') +parser = argparse.ArgumentParser(description='Joint intent slot filling system with pretrained BERT') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=128, type=int) parser.add_argument("--max_seq_length", default=50, type=int) @@ -31,9 +27,7 @@ parser.add_argument("--fc_dropout", default=0.1, type=float) parser.add_argument("--ignore_start_end", action='store_false') parser.add_argument("--ignore_extra_tokens", action='store_false') -parser.add_argument("--pretrained_bert_model", - default="bert-base-uncased", - type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--bert_checkpoint", default="", type=str) parser.add_argument("--bert_config", default="", type=str) parser.add_argument("--data_dir", default='data/nlu/atis', type=str) @@ -46,13 +40,13 @@ parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=-1, type=int) parser.add_argument("--optimizer_kind", default="adam", type=str) -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--shuffle_data", action='store_true') parser.add_argument("--intent_loss_weight", default=0.6, type=float) -parser.add_argument("--class_balancing", default="regular", type=str, - choices=["regular", "weighted_loss"]) +parser.add_argument( + "--class_balancing", default="regular", type=str, choices=["regular", "weighted_loss"], +) args = parser.parse_args() @@ -60,13 +54,15 @@ raise ValueError(f'Data not found at {args.data_dir}') work_dir = 
f'{args.work_dir}/{args.dataset_name.upper()}' -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=work_dir, - create_tb_writer=True, - files_to_copy=[__file__], - add_time_to_log_dir=True) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=work_dir, + create_tb_writer=True, + files_to_copy=[__file__], + add_time_to_log_dir=True, +) tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) @@ -75,46 +71,37 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo_nlp.huggingface.BERT( - config_filename=args.bert_config, factory=nf) + pretrained_bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config, factory=nf) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model, factory=nf) + pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model, factory=nf) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] -data_desc = JointIntentSlotDataDesc(args.data_dir, - args.do_lower_case, - args.dataset_name, - args.none_slot_label, - args.pad_label) +data_desc = JointIntentSlotDataDesc( + args.data_dir, args.do_lower_case, args.dataset_name, args.none_slot_label, args.pad_label, +) # Create sentence classification loss on top classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, - num_intents=data_desc.num_intents, - num_slots=data_desc.num_slots, - dropout=args.fc_dropout) + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout, +) if args.class_balancing == 'weighted_loss': # Using weighted loss will enable weighted loss for both intents and slots # Use the intent_loss_weight hyperparameter to adjust intent loss to # prevent overfitting or underfitting. 
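The weighted-loss branch configured just below feeds data_desc.slot_weights and data_desc.intent_weights into JointIntentSlotLoss. A common recipe for such per-class weights is normalized inverse frequency; whether JointIntentSlotDataDesc computes exactly this is an assumption:

from collections import Counter


def inverse_frequency_weights(labels):
    # rare classes get weight > 1, frequent classes < 1; the mean weight is 1
    counts = Counter(labels)
    return {cls: len(labels) / (len(counts) * n) for cls, n in counts.items()}


# inverse_frequency_weights([0, 0, 0, 1]) -> {0: 0.666..., 1: 2.0}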
loss_fn = nemo_nlp.JointIntentSlotLoss( - num_slots=data_desc.num_slots, - slot_classes_loss_weights=data_desc.slot_weights, - intent_classes_loss_weights=data_desc.intent_weights, - intent_loss_weight=args.intent_loss_weight) + num_slots=data_desc.num_slots, + slot_classes_loss_weights=data_desc.slot_weights, + intent_classes_loss_weights=data_desc.intent_weights, + intent_loss_weight=args.intent_loss_weight, + ) else: loss_fn = nemo_nlp.JointIntentSlotLoss(num_slots=data_desc.num_slots) -def create_pipeline(num_samples=-1, - batch_size=32, - num_gpus=1, - local_rank=0, - mode='train'): +def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): nemo.logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv' @@ -132,11 +119,10 @@ def create_pipeline(num_samples=-1, num_workers=0, local_rank=local_rank, ignore_extra_tokens=args.ignore_extra_tokens, - ignore_start_end=args.ignore_start_end - ) + ignore_start_end=args.ignore_start_end, + ) - ids, type_ids, input_mask, loss_mask, \ - subtokens_mask, intents, slots = data_layer() + (ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots,) = data_layer() data_size = len(data_layer) print(f'The length of data layer is {data_size}') @@ -149,23 +135,24 @@ def create_pipeline(num_samples=-1, steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) nemo.logging.info(f"Steps_per_epoch = {steps_per_epoch}") - hidden_states = pretrained_bert_model(input_ids=ids, - token_type_ids=type_ids, - attention_mask=input_mask) + hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) - loss = loss_fn(intent_logits=intent_logits, - slot_logits=slot_logits, - loss_mask=loss_mask, - intents=intents, - slots=slots) + loss = loss_fn( + intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots, + ) if mode == 'train': tensors_to_evaluate = [loss, intent_logits, slot_logits] else: - tensors_to_evaluate = [intent_logits, slot_logits, intents, - slots, subtokens_mask] + tensors_to_evaluate = [ + intent_logits, + slot_logits, + intents, + slots, + subtokens_mask, + ] return tensors_to_evaluate, loss, steps_per_epoch, data_layer @@ -175,13 +162,15 @@ def create_pipeline(num_samples=-1, batch_size=args.batch_size, num_gpus=args.num_gpus, local_rank=args.local_rank, - mode=args.train_file_prefix) -eval_tensors, _, _, data_layer = create_pipeline( + mode=args.train_file_prefix, +) +eval_tensors, _, _, data_layer = create_pipeline( args.num_eval_samples, batch_size=args.batch_size, num_gpus=args.num_gpus, local_rank=args.local_rank, - mode=args.eval_file_prefix) + mode=args.eval_file_prefix, +) # Create callbacks for train and eval modes train_callback = nemo.core.SimpleLossLoggerCallback( @@ -189,31 +178,30 @@ def create_pipeline(num_samples=-1, print_func=lambda x: str(np.round(x[0].item(), 3)), tb_writer=nf.tb_writer, get_tb_values=lambda x: [["loss", x[0]]], - step_freq=steps_per_epoch) + step_freq=steps_per_epoch, +) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, - user_iter_callback=lambda x, y: eval_iter_callback( - x, y, data_layer), - user_epochs_done_callback=lambda x: eval_epochs_done_callback( - x, f'{nf.work_dir}/graphs'), + user_iter_callback=lambda x, y: eval_iter_callback(x, y, data_layer), + user_epochs_done_callback=lambda x: 
eval_epochs_done_callback(x, f'{nf.work_dir}/graphs'), tb_writer=nf.tb_writer, - eval_step=steps_per_epoch) + eval_step=steps_per_epoch, +) # Create callback to save checkpoints ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq) - -lr_policy_fn = get_lr_policy(args.lr_policy, - total_steps=args.num_epochs * steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) - -nf.train(tensors_to_optimize=[train_loss], - callbacks=[train_callback, eval_callback, ckpt_callback], - lr_policy=lr_policy_fn, - optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, - "lr": args.lr, - "weight_decay": args.weight_decay}) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +) + +lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, +) + +nf.train( + tensors_to_optimize=[train_loss], + callbacks=[train_callback, eval_callback, ckpt_callback], + lr_policy=lr_policy_fn, + optimizer=args.optimizer_kind, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay,}, +) diff --git a/examples/nlp/nmt_tutorial.py b/examples/nlp/nmt_tutorial.py index 0eaa2c6b75b3..49775c187ce3 100644 --- a/examples/nlp/nmt_tutorial.py +++ b/examples/nlp/nmt_tutorial.py @@ -6,26 +6,25 @@ import torch import nemo -from nemo.utils.lr_policies import get_lr_policy - import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.translation import \ - eval_iter_callback, eval_epochs_done_callback +from nemo.collections.nlp.utils.callbacks.translation import eval_epochs_done_callback, eval_iter_callback +from nemo.utils.lr_policies import get_lr_policy -parser = nemo.utils.NemoArgParser( - description='Transformer for Neural Machine Translation') -parser.set_defaults(train_dataset="train", - eval_datasets=["valid"], - work_dir="outputs/transformer_nmt", - optimizer="novograd", - batch_size=4096, - eval_batch_size=4096, - lr_policy='CosineAnnealing', - lr=0.005, - weight_decay=0, - max_steps=500, - iter_per_step=1, - eval_freq=1000) +parser = nemo.utils.NemoArgParser(description='Transformer for Neural Machine Translation') +parser.set_defaults( + train_dataset="train", + eval_datasets=["valid"], + work_dir="outputs/transformer_nmt", + optimizer="novograd", + batch_size=4096, + eval_batch_size=4096, + lr_policy='CosineAnnealing', + lr=0.005, + weight_decay=0, + max_steps=500, + iter_per_step=1, + eval_freq=1000, +) parser.add_argument("--data_dir", default="../../tests/data/en_de", type=str) parser.add_argument("--dataset_name", default="wmt16", type=str) parser.add_argument("--src_lang", default="en", type=str) @@ -43,12 +42,10 @@ parser.add_argument("--label_smoothing", default=0.1, type=float) parser.add_argument("--beam_size", default=4, type=int) # pass a YouTokenToMe model to YouTokenToMeTokenizer for en -parser.add_argument("--src_tokenizer_model", - default="bpe8k_yttm.model", type=str) +parser.add_argument("--src_tokenizer_model", default="bpe8k_yttm.model", type=str) # pass a YouTokenToMe model to YouTokenToMeTokenizer for de # if the target is zh, we should pass a vocabulary file, e.g. 
zh_vocab.txt -parser.add_argument("--tgt_tokenizer_model", - default="bpe8k_yttm.model", type=str) +parser.add_argument("--tgt_tokenizer_model", default="bpe8k_yttm.model", type=str) parser.add_argument("--interactive", action="store_true") parser.add_argument("--save_epoch_freq", default=5, type=int) parser.add_argument("--save_step_freq", default=-1, type=int) @@ -57,12 +54,14 @@ args = parser.parse_args() work_dir = f'{args.work_dir}/{args.dataset_name.upper()}' -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=args.work_dir, - create_tb_writer=True, - files_to_copy=[__file__]) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=args.work_dir, + create_tb_writer=True, + files_to_copy=[__file__], +) # tie weight of embedding and log_softmax layers if use the same tokenizer # for the source and the target @@ -77,16 +76,14 @@ We use YouTokenToMe tokenizer trained on joint English & German data for both source and target languages. """ - src_tokenizer = nemo_nlp.YouTokenToMeTokenizer( - model_path=f"{args.data_dir}/{args.src_tokenizer_model}") + src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") src_vocab_size = src_tokenizer.vocab_size if args.src_tokenizer_model == args.tgt_tokenizer_model: tgt_tokenizer = src_tokenizer # source and target use the same tokenizer, set tie_weight to True tie_weight = True else: - tgt_tokenizer = nemo_nlp.YouTokenToMeTokenizer( - model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") + tgt_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") # source and target use different tokenizers, set tie_weight to False tie_weight = False tgt_vocab_size = tgt_tokenizer.vocab_size @@ -95,17 +92,14 @@ We use YouTokenToMeTokenizer for src since the src contains English words and CharTokenizer for tgt since the tgt contains Chinese characters. 
""" - src_tokenizer = nemo_nlp.YouTokenToMeTokenizer( - model_path=f"{args.data_dir}/{args.src_tokenizer_model}") + src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") src_vocab_size = src_tokenizer.vocab_size - tgt_tokenizer = nemo_nlp.CharTokenizer( - vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") + tgt_tokenizer = nemo_nlp.CharTokenizer(vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") tgt_vocab_size = tgt_tokenizer.vocab_size # source and target use different tokenizers, set tie_weight to False tie_weight = False else: - raise ValueError( - f"Unsupported language pair:{args.src_lang}-{args.tgt_lang}.") + raise ValueError(f"Unsupported language pair:{args.src_lang}-{args.tgt_lang}.") # instantiate necessary modules for the whole translation pipeline, namely # data layers, encoder, decoder, output log_softmax, beam_search_translator @@ -120,7 +114,8 @@ vocab_size=src_vocab_size, attn_score_dropout=args.attn_score_dropout, attn_layer_dropout=args.attn_layer_dropout, - max_seq_length=args.max_seq_length) + max_seq_length=args.max_seq_length, +) decoder = nemo_nlp.TransformerDecoderNM( d_model=args.d_model, @@ -132,12 +127,12 @@ vocab_size=tgt_vocab_size, attn_score_dropout=args.attn_score_dropout, attn_layer_dropout=args.attn_layer_dropout, - max_seq_length=args.max_seq_length) + max_seq_length=args.max_seq_length, +) -log_softmax = nemo_nlp.TokenClassifier(args.d_model, - num_classes=tgt_tokenizer.vocab_size, - num_layers=1, - log_softmax=True) +log_softmax = nemo_nlp.TokenClassifier( + args.d_model, num_classes=tgt_tokenizer.vocab_size, num_layers=1, log_softmax=True, +) beam_search = nemo_nlp.BeamSearchTranslatorNM( decoder=decoder, @@ -146,60 +141,49 @@ beam_size=args.beam_size, bos_token=tgt_tokenizer.bos_id(), pad_token=tgt_tokenizer.pad_id(), - eos_token=tgt_tokenizer.eos_id()) + eos_token=tgt_tokenizer.eos_id(), +) loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM( - pad_id=tgt_tokenizer.pad_id(), - label_smoothing=args.label_smoothing) + pad_id=tgt_tokenizer.pad_id(), label_smoothing=args.label_smoothing +) if tie_weight: - log_softmax.mlp.last_linear_layer.weight = \ - encoder.embedding_layer.token_embedding.weight - decoder.embedding_layer.token_embedding.weight = \ - encoder.embedding_layer.token_embedding.weight - - -def create_pipeline(dataset_src, - dataset_tgt, - tokens_in_batch, - clean=False, - training=True): - data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=src_tokenizer, - tokenizer_tgt=tgt_tokenizer, - dataset_src=dataset_src, - dataset_tgt=dataset_tgt, - tokens_in_batch=tokens_in_batch, - clean=clean) + log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight + decoder.embedding_layer.token_embedding.weight = encoder.embedding_layer.token_embedding.weight + + +def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, training=True): + data_layer = nemo_nlp.TranslationDataLayer( + tokenizer_src=src_tokenizer, + tokenizer_tgt=tgt_tokenizer, + dataset_src=dataset_src, + dataset_tgt=dataset_tgt, + tokens_in_batch=tokens_in_batch, + clean=clean, + ) src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) - tgt_hiddens = decoder(input_ids_tgt=tgt, - hidden_states_src=src_hiddens, - input_mask_src=src_mask, - input_mask_tgt=tgt_mask) + tgt_hiddens = decoder( + input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask, + ) logits = 
log_softmax(hidden_states=tgt_hiddens) loss = loss_fn(logits=logits, target_ids=labels) beam_results = None if not training: - beam_results = beam_search(hidden_states_src=src_hiddens, - input_mask_src=src_mask) + beam_results = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask) return loss, [tgt, loss, beam_results, sent_ids] train_dataset_src = f"{args.data_dir}/{args.train_dataset}.{args.src_lang}" train_dataset_tgt = f"{args.data_dir}/{args.train_dataset}.{args.tgt_lang}" -train_loss, _ = create_pipeline(train_dataset_src, - train_dataset_tgt, - args.batch_size, - clean=True) +train_loss, _ = create_pipeline(train_dataset_src, train_dataset_tgt, args.batch_size, clean=True) eval_dataset_src = f"{args.data_dir}/{args.eval_datasets[0]}.{args.src_lang}" eval_dataset_tgt = f"{args.data_dir}/{args.eval_datasets[0]}.{args.tgt_lang}" -eval_loss, eval_tensors = create_pipeline(eval_dataset_src, - eval_dataset_tgt, - args.eval_batch_size, - training=False) +eval_loss, eval_tensors = create_pipeline(eval_dataset_src, eval_dataset_tgt, args.eval_batch_size, training=False) # callback which prints training loss once in a while train_callback = nemo.core.SimpleLossLoggerCallback( @@ -207,31 +191,27 @@ def create_pipeline(dataset_src, step_freq=100, print_func=lambda x: str(x[0].item()), get_tb_values=lambda x: [["loss", x[0]]], - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) # callback which calculates evaluation loss and both common BLEU and SacreBLEU # scores between outputs of beam search and reference translations eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y, tgt_tokenizer), - user_epochs_done_callback=lambda x: eval_epochs_done_callback( - x, validation_dataset=eval_dataset_tgt), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, validation_dataset=eval_dataset_tgt), eval_step=args.eval_freq, - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) # callback which saves checkpoints once in a while -ckpt_dir = nf.checkpoint_dir if not args.interactive \ - else args.restore_checkpoint_from +ckpt_dir = nf.checkpoint_dir if not args.interactive else args.restore_checkpoint_from ckpt_callback = nemo.core.CheckpointCallback( - folder=ckpt_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq, - checkpoints_to_keep=1) + folder=ckpt_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=1, +) # define learning rate decay policy -lr_policy_fn = get_lr_policy(args.lr_policy, - total_steps=args.max_steps, - warmup_steps=args.warmup_steps) +lr_policy_fn = get_lr_policy(args.lr_policy, total_steps=args.max_steps, warmup_steps=args.warmup_steps) if args.max_steps is not None and args.num_epochs is not None: raise ValueError("Please specify either max_steps or num_epochs.") @@ -243,19 +223,21 @@ def create_pipeline(dataset_src, else: stop_training_condition = {"num_epochs": args.num_epochs} - nf.train(tensors_to_optimize=[train_loss], - callbacks=[train_callback, eval_callback, ckpt_callback], - optimizer=args.optimizer, - lr_policy=lr_policy_fn, - optimization_params={**stop_training_condition, - "lr": args.lr, - "weight_decay": args.weight_decay}, - batches_per_step=args.iter_per_step) + nf.train( + tensors_to_optimize=[train_loss], + callbacks=[train_callback, eval_callback, ckpt_callback], + optimizer=args.optimizer, + lr_policy=lr_policy_fn, + optimization_params={**stop_training_condition, "lr": args.lr, "weight_decay": 
args.weight_decay,}, + batches_per_step=args.iter_per_step, + ) else: - nf.train(tensors_to_optimize=[train_loss], - callbacks=[ckpt_callback], - optimizer=args.optimizer, - optimization_params={"num_epochs": 0, "lr": args.lr}) + nf.train( + tensors_to_optimize=[train_loss], + callbacks=[ckpt_callback], + optimizer=args.optimizer, + optimization_params={"num_epochs": 0, "lr": args.lr}, + ) def translate_sentence(text): diff --git a/examples/nlp/punctuation_capitalization.py b/examples/nlp/punctuation_capitalization.py index 80790320b0db..ca2d951d8ed1 100644 --- a/examples/nlp/punctuation_capitalization.py +++ b/examples/nlp/punctuation_capitalization.py @@ -6,18 +6,20 @@ import sys import nemo -from nemo.utils.lr_policies import get_lr_policy - import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, \ - TokenClassifier, TokenClassificationLoss +from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, TokenClassificationLoss, TokenClassifier from nemo.collections.nlp.data.datasets import utils -from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import \ - eval_iter_callback, eval_epochs_done_callback +from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import ( + eval_epochs_done_callback, + eval_iter_callback, +) +from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser(description="Punctuation and \ - capitalization model with pretrained BERT") +parser = argparse.ArgumentParser( + description="Punctuation and \ + capitalization model with pretrained BERT" +) parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=8, type=int) parser.add_argument("--max_seq_length", default=128, type=int) @@ -28,8 +30,7 @@ parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--weight_decay", default=0, type=float) parser.add_argument("--optimizer_kind", default="adam", type=str) -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--data_dir", default="/data", type=str) parser.add_argument("--punct_num_fc_layers", default=3, type=int) parser.add_argument("--fc_dropout", default=0.1, type=float) @@ -37,49 +38,76 @@ parser.add_argument("--ignore_extra_tokens", action='store_false') parser.add_argument("--none_label", default='O', type=str) parser.add_argument("--shuffle_data", action='store_true') -parser.add_argument("--pretrained_bert_model", - default="bert-base-uncased", type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument("--bert_config", default=None, type=str, - help="Path to bert config file in json format") +parser.add_argument( + "--bert_config", default=None, type=str, help="Path to bert config file in json format", +) parser.add_argument("--punct_classifier_checkpoint", default=None, type=str) parser.add_argument("--capit_classifier_checkpoint", default=None, type=str) -parser.add_argument("--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece") -parser.add_argument("--tokenizer", default="nemobert", type=str, - choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - 
only relevant when using custom pretrained checkpoint.")
-parser.add_argument("--work_dir", default='output', type=str,
- help="The output directory where the model prediction\
- and checkpoints will be written.")
-parser.add_argument("--use_cache", action='store_true',
- help="Whether to cache preprocessed data")
-parser.add_argument("--save_epoch_freq", default=1, type=int,
- help="Frequency of saving checkpoint\
- '-1' - step checkpoint won't be saved")
-parser.add_argument("--save_step_freq", default=200, type=int,
- help="Frequency of saving checkpoint \
- '-1' - step checkpoint won't be saved")
-parser.add_argument("--loss_step_freq", default=250, type=int,
- help="Frequency of printing loss")
-parser.add_argument("--use_weighted_loss_punct", action='store_true',
- help="Flag to indicate whether to use weighted loss \
- to mitigate classs unbalancing for the punctuation task")
+parser.add_argument(
+ "--tokenizer_model",
+ default="tokenizer.model",
+ type=str,
+ help="Path to pretrained tokenizer model, \
+ only used if --tokenizer is sentencepiece",
+)
+parser.add_argument(
+ "--tokenizer",
+ default="nemobert",
+ type=str,
+ choices=["nemobert", "sentencepiece"],
+ help="tokenizer to use, \
+ only relevant when using custom pretrained checkpoint.",
+)
+parser.add_argument(
+ "--work_dir",
+ default='output',
+ type=str,
+ help="The output directory where the model prediction\
+ and checkpoints will be written.",
+)
+parser.add_argument(
+ "--use_cache", action='store_true', help="Whether to cache preprocessed data",
+)
+parser.add_argument(
+ "--save_epoch_freq",
+ default=1,
+ type=int,
+ help="Frequency of saving checkpoint\
+ '-1' - step checkpoint won't be saved",
+)
+parser.add_argument(
+ "--save_step_freq",
+ default=200,
+ type=int,
+ help="Frequency of saving checkpoint \
+ '-1' - step checkpoint won't be saved",
+)
+parser.add_argument(
+ "--loss_step_freq", default=250, type=int, help="Frequency of printing loss",
+)
+parser.add_argument(
+ "--use_weighted_loss_punct",
+ action='store_true',
+ help="Flag to indicate whether to use weighted loss \
+ to mitigate class imbalance for the punctuation task",
+)
 args = parser.parse_args()
 if not os.path.exists(args.data_dir):
 raise FileNotFoundError("Dataset not found.")
-nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch,
- local_rank=args.local_rank,
- optimization_level=args.amp_opt_level,
- log_dir=args.work_dir,
- create_tb_writer=True,
- files_to_copy=[__file__],
- add_time_to_log_dir=True)
+nf = nemo.core.NeuralModuleFactory(
+ backend=nemo.core.Backend.PyTorch,
+ local_rank=args.local_rank,
+ optimization_level=args.amp_opt_level,
+ log_dir=args.work_dir,
+ create_tb_writer=True,
+ files_to_copy=[__file__],
+ add_time_to_log_dir=True,
+)
 nemo.logging.info(args)
@@ -91,8 +119,7 @@
 nemo_nlp.huggingface.BERT.list_pretrained_models()
 """
 tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
- model = nemo_nlp.huggingface.BERT(
- pretrained_model_name=args.pretrained_bert_model)
+ model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)
 else:
 """ Use this if you're using a BERT model that you pre-trained yourself. 
""" @@ -108,8 +135,7 @@ config = json.load(json_file) model = nemo_nlp.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) nemo.logging.info(f"Model restored from {args.bert_checkpoint}") @@ -125,23 +151,24 @@ task_loss = None -def create_pipeline(num_samples=-1, - pad_label=args.none_label, - max_seq_length=args.max_seq_length, - batch_size=args.batch_size, - local_rank=args.local_rank, - num_gpus=args.num_gpus, - mode='train', - punct_label_ids=None, - capit_label_ids=None, - ignore_extra_tokens=args.ignore_extra_tokens, - ignore_start_end=args.ignore_start_end, - use_cache=args.use_cache, - dropout=args.fc_dropout, - punct_num_layers=args.punct_num_fc_layers): - - global punct_classifier, punct_loss, \ - capit_classifier, capit_loss, task_loss +def create_pipeline( + num_samples=-1, + pad_label=args.none_label, + max_seq_length=args.max_seq_length, + batch_size=args.batch_size, + local_rank=args.local_rank, + num_gpus=args.num_gpus, + mode='train', + punct_label_ids=None, + capit_label_ids=None, + ignore_extra_tokens=args.ignore_extra_tokens, + ignore_start_end=args.ignore_start_end, + use_cache=args.use_cache, + dropout=args.fc_dropout, + punct_num_layers=args.punct_num_fc_layers, +): + + global punct_classifier, punct_loss, capit_classifier, capit_loss, task_loss nemo.logging.info(f"Loading {mode} data...") shuffle = args.shuffle_data if mode == 'train' else False @@ -150,7 +177,8 @@ def create_pipeline(num_samples=-1, label_file = f'{args.data_dir}/labels_{mode}.txt' if not (os.path.exists(text_file) or (os.path.exists(label_file))): - raise FileNotFoundError(f'{text_file} or {label_file} not found. \ + raise FileNotFoundError( + f'{text_file} or {label_file} not found. \ The data should be splitted into 2 files: text.txt and labels.txt. \ Each line of the text.txt file contains text sequences, where words\ are separated with spaces. The labels.txt file contains \ @@ -158,7 +186,8 @@ def create_pipeline(num_samples=-1, separated with spaces. Each line of the files should follow the \ format: \ [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and \ - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).') + [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' 
+ ) data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer( tokenizer=tokenizer, @@ -174,10 +203,10 @@ def create_pipeline(num_samples=-1, shuffle=shuffle, ignore_extra_tokens=ignore_extra_tokens, ignore_start_end=ignore_start_end, - use_cache=use_cache) + use_cache=use_cache, + ) - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \ - punct_labels, capit_labels = data_layer() + (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels,) = data_layer() if mode == 'train': punct_label_ids = data_layer.dataset.punct_label_ids @@ -191,66 +220,64 @@ def create_pipeline(num_samples=-1, # Initialize punctuation loss punct_classifier = getattr(sys.modules[__name__], punct_classifier) - punct_classifier = punct_classifier(hidden_size=hidden_size, - num_classes=len(punct_label_ids), - dropout=dropout, - num_layers=punct_num_layers, - name='Punctuation') + punct_classifier = punct_classifier( + hidden_size=hidden_size, + num_classes=len(punct_label_ids), + dropout=dropout, + num_layers=punct_num_layers, + name='Punctuation', + ) punct_loss = getattr(sys.modules[__name__], punct_loss) - punct_loss = punct_loss(num_classes=len(punct_label_ids), - class_weights=class_weights) + punct_loss = punct_loss(num_classes=len(punct_label_ids), class_weights=class_weights) # Initialize capitalization loss capit_classifier = getattr(sys.modules[__name__], capit_classifier) - capit_classifier = capit_classifier(hidden_size=hidden_size, - num_classes=len(capit_label_ids), - dropout=dropout, - name='Capitalization') + capit_classifier = capit_classifier( + hidden_size=hidden_size, num_classes=len(capit_label_ids), dropout=dropout, name='Capitalization', + ) capit_loss = getattr(sys.modules[__name__], capit_loss) capit_loss = capit_loss(num_classes=len(capit_label_ids)) task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) - hidden_states = model(input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) punct_logits = punct_classifier(hidden_states=hidden_states) capit_logits = capit_classifier(hidden_states=hidden_states) if mode == 'train': - punct_loss = punct_loss(logits=punct_logits, - labels=punct_labels, - loss_mask=loss_mask) - capit_loss = capit_loss(logits=capit_logits, - labels=capit_labels, - loss_mask=loss_mask) + punct_loss = punct_loss(logits=punct_logits, labels=punct_labels, loss_mask=loss_mask) + capit_loss = capit_loss(logits=capit_logits, labels=capit_labels, loss_mask=loss_mask) task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss) steps_per_epoch = len(data_layer) // (batch_size * num_gpus) losses = [task_loss, punct_loss, capit_loss] logits = [punct_logits, capit_logits] - return (losses, logits, - steps_per_epoch, - punct_label_ids, - capit_label_ids) + return ( + losses, + logits, + steps_per_epoch, + punct_label_ids, + capit_label_ids, + ) else: - tensors_to_evaluate = [punct_logits, - capit_logits, - punct_labels, - capit_labels, - subtokens_mask] + tensors_to_evaluate = [ + punct_logits, + capit_logits, + punct_labels, + capit_labels, + subtokens_mask, + ] return tensors_to_evaluate, data_layer -losses, train_logits, steps_per_epoch, punct_label_ids, capit_label_ids = \ - create_pipeline() +(losses, train_logits, steps_per_epoch, punct_label_ids, capit_label_ids,) = create_pipeline() -eval_tensors, data_layer = create_pipeline(mode='dev', - punct_label_ids=punct_label_ids, - 
capit_label_ids=capit_label_ids) +eval_tensors, data_layer = create_pipeline( + mode='dev', punct_label_ids=punct_label_ids, capit_label_ids=capit_label_ids, +) nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") @@ -259,31 +286,31 @@ def create_pipeline(num_samples=-1, tensors=losses + train_logits, print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y), - user_epochs_done_callback=lambda x: - eval_epochs_done_callback(x, - punct_label_ids, - capit_label_ids, - f'{nf.work_dir}/graphs'), + user_epochs_done_callback=lambda x: eval_epochs_done_callback( + x, punct_label_ids, capit_label_ids, f'{nf.work_dir}/graphs' + ), tb_writer=nf.tb_writer, - eval_step=steps_per_epoch) + eval_step=steps_per_epoch, +) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq) - -lr_policy_fn = get_lr_policy(args.lr_policy, - total_steps=args.num_epochs * steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) - -nf.train(tensors_to_optimize=[losses[0]], - callbacks=[train_callback, eval_callback, ckpt_callback], - lr_policy=lr_policy_fn, - optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, - "lr": args.lr}) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +) + +lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, +) + +nf.train( + tensors_to_optimize=[losses[0]], + callbacks=[train_callback, eval_callback, ckpt_callback], + lr_policy=lr_policy_fn, + optimizer=args.optimizer_kind, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr}, +) diff --git a/examples/nlp/punctuation_capitalization_infer.py b/examples/nlp/punctuation_capitalization_infer.py index 3953e51d1d16..25d08e67ad7d 100644 --- a/examples/nlp/punctuation_capitalization_infer.py +++ b/examples/nlp/punctuation_capitalization_infer.py @@ -14,48 +14,60 @@ parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--fc_dropout", default=0, type=float) parser.add_argument("--punct_num_fc_layers", default=3, type=int) -parser.add_argument("--pretrained_bert_model", - default="bert-base-uncased", type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--none_label", default='O', type=str) -parser.add_argument("--queries", action='append', - default=['we bought four shirts from the ' + - 'nvidia gear store in santa clara', - 'nvidia is a company', - 'can i help you', - 'how are you', - 'how\'s the weather today', - 'okay', - 'we bought four shirts one mug and ten ' + - 'thousand titan rtx graphics cards the more ' + - 'you buy the more you save'], - help="Example: --queries 'san francisco' --queries 'la'") -parser.add_argument("--add_brackets", action='store_false', - help="Whether to take predicted label in brackets or \ - just append to word in the output") -parser.add_argument("--checkpoints_dir", default='output/checkpoints', - type=str) -parser.add_argument("--punct_labels_dict", default='punct_label_ids.csv', - type=str, help='This file is generated during training \ - when the datalayer is created') -parser.add_argument("--capit_labels_dict", 
default='capit_label_ids.csv', - type=str, help='This file is generated during training \ - when the datalayer is created') -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument( + "--queries", + action='append', + default=[ + 'we bought four shirts from the ' + 'nvidia gear store in santa clara', + 'nvidia is a company', + 'can i help you', + 'how are you', + 'how\'s the weather today', + 'okay', + 'we bought four shirts one mug and ten ' + + 'thousand titan rtx graphics cards the more ' + + 'you buy the more you save', + ], + help="Example: --queries 'san francisco' --queries 'la'", +) +parser.add_argument( + "--add_brackets", + action='store_false', + help="Whether to take predicted label in brackets or \ + just append to word in the output", +) +parser.add_argument("--checkpoints_dir", default='output/checkpoints', type=str) +parser.add_argument( + "--punct_labels_dict", + default='punct_label_ids.csv', + type=str, + help='This file is generated during training \ + when the datalayer is created', +) +parser.add_argument( + "--capit_labels_dict", + default='capit_label_ids.csv', + type=str, + help='This file is generated during training \ + when the datalayer is created', +) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) args = parser.parse_args() if not os.path.exists(args.checkpoints_dir): raise ValueError(f'Checkpoints folder not found at {args.checkpoints_dir}') -if not (os.path.exists(args.punct_labels_dict) and - os.path.exists(args.capit_labels_dict)): +if not (os.path.exists(args.punct_labels_dict) and os.path.exists(args.capit_labels_dict)): raise ValueError( f'Dictionary with ids to labels not found at {args.punct_labels_dict} \ - or {args.punct_labels_dict}') + or {args.punct_labels_dict}' + ) -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - optimization_level=args.amp_opt_level, - log_dir=None) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, +) punct_labels_dict = get_vocab(args.punct_labels_dict) @@ -65,34 +77,29 @@ See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] tokenizer = NemoBertTokenizer(args.pretrained_bert_model) data_layer = nemo_nlp.BertTokenClassificationInferDataLayer( - queries=args.queries, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - batch_size=1) - -punct_classifier = \ - nemo_nlp.TokenClassifier(hidden_size=hidden_size, - num_classes=len(punct_labels_dict), - dropout=args.fc_dropout, - num_layers=args.punct_num_fc_layers, - name='Punctuation') - -capit_classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size, - num_classes=len(capit_labels_dict), - dropout=args.fc_dropout, - name='Capitalization') + queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +) + +punct_classifier = nemo_nlp.TokenClassifier( + hidden_size=hidden_size, + num_classes=len(punct_labels_dict), + dropout=args.fc_dropout, + num_layers=args.punct_num_fc_layers, + name='Punctuation', +) + +capit_classifier = nemo_nlp.TokenClassifier( + hidden_size=hidden_size, num_classes=len(capit_labels_dict), 
dropout=args.fc_dropout, name='Capitalization',
+)
 input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = data_layer()
-hidden_states = pretrained_bert_model(input_ids=input_ids,
- token_type_ids=input_type_ids,
- attention_mask=input_mask)
+hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,)
 punct_logits = punct_classifier(hidden_states=hidden_states)
 capit_logits = capit_classifier(hidden_states=hidden_states)
@@ -101,8 +108,7 @@
 # Instantiate an optimizer to perform `infer` action
 evaluated_tensors = nf.infer(
- tensors=[punct_logits, capit_logits, subtokens_mask],
- checkpoint_dir=args.checkpoints_dir,
+ tensors=[punct_logits, capit_logits, subtokens_mask], checkpoint_dir=args.checkpoints_dir,
 )
@@ -114,8 +120,7 @@ def get_preds(logits):
 return np.argmax(logits, 1)
-punct_logits, capit_logits, subtokens_mask = \
- [concatenate(tensors) for tensors in evaluated_tensors]
+punct_logits, capit_logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors]
 punct_preds = np.argmax(punct_logits, axis=2)
 capit_preds = np.argmax(capit_logits, axis=2)
diff --git a/examples/nlp/scripts/create_vocab.py b/examples/nlp/scripts/create_vocab.py
index c531355305e6..2d3160e01896 100644
--- a/examples/nlp/scripts/create_vocab.py
+++ b/examples/nlp/scripts/create_vocab.py
@@ -45,8 +45,7 @@ def main():
 # file before proceeding
 # filepaths = glob.glob(os.path.join(args.dataset_dir, "**", "*.txt"))
 filepaths = glob.glob(os.path.join(args.dataset_dir, "*.txt"))
- print("Found {} files, concatenenating dataset into one file..."
- .format(len(filepaths)))
+ print("Found {} files, concatenating dataset into one file...".format(len(filepaths)))
 with open(MERGED_FILE, "w") as f:
 for filepath in tqdm(filepaths):
@@ -59,15 +58,16 @@ def main():
 print("One of 'dataset_dir' and 'train_path' must be specified")
 return
- SPT.Train("--input={} ".format(train_path) +
- "--model_prefix={} ".format(args.model_prefix) +
- "--vocab_size={} ".format(args.vocab_size -
- args.num_placeholders) +
- "--input_sentence_size={} ".format(args.sample_size) +
- "--shuffle_input_sentence=true " +
- "--hard_vocab_limit=false " +
- "--bos_id=-1 " +
- "--eos_id=-1")
+ SPT.Train(
+ "--input={} ".format(train_path)
+ + "--model_prefix={} ".format(args.model_prefix)
+ + "--vocab_size={} ".format(args.vocab_size - args.num_placeholders)
+ + "--input_sentence_size={} ".format(args.sample_size)
+ + "--shuffle_input_sentence=true "
+ + "--hard_vocab_limit=false "
+ + "--bos_id=-1 "
+ + "--eos_id=-1"
+ )
 # Add BERT control symbols
 vocab = ["[PAD]"]
@@ -88,8 +88,7 @@ def main():
 tokens.append(token)
- vocab.extend(["[unused{}]".format(i)
- for i in range(args.vocab_size - len(tokens))])
+ vocab.extend(["[unused{}]".format(i) for i in range(args.vocab_size - len(tokens))])
 vocab.extend(["[UNK]", "[CLS]", "[SEP]", "[MASK]"])
 vocab.extend(tokens)
diff --git a/examples/nlp/scripts/download_squad.py b/examples/nlp/scripts/download_squad.py
index e99e3e92f7a0..80c4739e7b62 100755
--- a/examples/nlp/scripts/download_squad.py
+++ b/examples/nlp/scripts/download_squad.py
@@ -30,22 +30,14 @@ def __init__(self, save_path):
 os.makedirs(self.save_path + '/v2.0')
 self.download_urls = {
- 'https://rajpurkar.github.io/SQuAD-explorer'
- '/dataset/train-v1.1.json': 'v1.1/train-v1.1.json',
- 'https://rajpurkar.github.io/SQuAD-explorer'
- '/dataset/dev-v1.1.json': 'v1.1/dev-v1.1.json',
+ 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v1.1.json': 
'v1.1/train-v1.1.json', + 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v1.1.json': 'v1.1/dev-v1.1.json', 'https://worksheets.codalab.org/rest/bundles' - '/0xbcd57bee090b421c982906709c8c27e1/contents/blob/': - 'v1.1/evaluate-v1.1.py', - 'https://rajpurkar.github.io/SQuAD-explorer' - '/dataset/train-v2.0.json': - 'v2.0/train-v2.0.json', - 'https://rajpurkar.github.io/SQuAD-explorer' - '/dataset/dev-v2.0.json': - 'v2.0/dev-v2.0.json', + '/0xbcd57bee090b421c982906709c8c27e1/contents/blob/': 'v1.1/evaluate-v1.1.py', + 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v2.0.json': 'v2.0/train-v2.0.json', + 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v2.0.json': 'v2.0/dev-v2.0.json', 'https://worksheets.codalab.org/rest/bundles' - '/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/': - 'v2.0/evaluate-v2.0.py', + '/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/': 'v2.0/evaluate-v2.0.py', } def download(self): @@ -64,10 +56,13 @@ def download(self): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Download Squad') - parser.add_argument('--destDir', type=str, required=False, - help='directory to store data', - default=os.path.split(os.path.abspath(__file__))[0] - + '/../data/lm') + parser.add_argument( + '--destDir', + type=str, + required=False, + help='directory to store data', + default=os.path.split(os.path.abspath(__file__))[0] + '/../data/lm', + ) args = parser.parse_args() squad_dl = SquadDownloader(args.destDir) squad_dl.download() diff --git a/examples/nlp/scripts/process_wiki_zh.py b/examples/nlp/scripts/process_wiki_zh.py index 5e3e8714de3b..a7f195fbb9c0 100755 --- a/examples/nlp/scripts/process_wiki_zh.py +++ b/examples/nlp/scripts/process_wiki_zh.py @@ -15,19 +15,17 @@ # limitations under the License. 
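The SquadDownloader above reduces to a dict mapping source URLs to destination paths plus a loop that fetches each one. A minimal standalone sketch of that pattern, with illustrative names and plain urllib rather than the NeMo API:

import os
import urllib.request

def download_all(url_map, save_path):
    # url_map: {source_url: relative_destination_path}, like download_urls above
    for url, rel_path in url_map.items():
        dest = os.path.join(save_path, rel_path)
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        if not os.path.exists(dest):  # skip files that were already downloaded
            urllib.request.urlretrieve(url, dest)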
# =============================================================================
-import os
+import glob
 import json
+import os
+import re
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
-import re
-import glob
-def create_vocab(lines,
- vocab_file,
- min_frequency=3,
- special_symbols=["[PAD]", "[SEP]", "[CLS]",
- "[MASK]", "[UNK]"]):
+def create_vocab(
+ lines, vocab_file, min_frequency=3, special_symbols=["[PAD]", "[SEP]", "[CLS]", "[MASK]", "[UNK]"],
+):
 """Create vocabulary from lines"""
 # Count word occurency
 vocab = {}
@@ -78,7 +76,7 @@ def read_files(data_dir, regex, max_files=-1):
 executor = ProcessPoolExecutor(max_workers=4)
 tasks = []
- files = glob.glob(data_dir+'/*/wiki*')
+ files = glob.glob(data_dir + '/*/wiki*')
 for f in files[:max_files]:
 tasks.append(executor.submit(partial(read, f, regex)))
 print(f'Preprocessing wiki texts in {data_dir}, please wait...')
@@ -116,10 +114,10 @@ def save(output_dir, lines, train_ratio=0.95):
 def process(data_dir, output_dir=None, min_frequency=3, max_files=-1):
 # Define filter rule
 regex = []
- regex += ['[a-zA-Z0-9]'] # English and numerics
- regex += [r'[\u4e00-\u9fff]'] # CJK char
- regex += [r'[\u3400-\u4DBF]'] # CJK char extend
- regex += [r'[\uf900-\ufaff]'] # CJK compatable
+ regex += ['[a-zA-Z0-9]'] # English and numerics
+ regex += [r'[\u4e00-\u9fff]'] # CJK char
+ regex += [r'[\u3400-\u4DBF]'] # CJK char extend
+ regex += [r'[\uf900-\ufaff]'] # CJK compatible
 regex += ['[\n]']
 regex = "|".join(regex)
@@ -136,17 +134,17 @@ def process(data_dir, output_dir=None, min_frequency=3, max_files=-1):
 if __name__ == "__main__":
 import argparse
- parser = argparse.ArgumentParser(
- description='Process wiki_zh dataset for BERT pretraining')
+
+ parser = argparse.ArgumentParser(description='Process wiki_zh dataset for BERT pretraining')
 # Read data directory from command line argument
 parser.add_argument("--data_dir", default="/raid/data/wiki_zh", type=str)
 parser.add_argument("--output_dir", default="./", type=str)
- parser.add_argument("--min_frequency", default=0, type=int,
- help="Characters occuring less frequently "
- "will be filtered out")
- parser.add_argument("--max_files", default=-1, type=int,
- help="Max number of dirs to process")
+ parser.add_argument(
+ "--min_frequency", default=0, type=int, help="Characters occurring less frequently " "will be filtered out",
+ )
+ parser.add_argument(
+ "--max_files", default=-1, type=int, help="Max number of dirs to process",
+ )
 args = parser.parse_args()
- process(args.data_dir, args.output_dir, args.min_frequency,
- args.max_files)
+ process(args.data_dir, args.output_dir, args.min_frequency, args.max_files)
diff --git a/examples/nlp/sentence_classification_with_bert.py b/examples/nlp/sentence_classification_with_bert.py
index 716e0549fe8c..d064203fba02 100644
--- a/examples/nlp/sentence_classification_with_bert.py
+++ b/examples/nlp/sentence_classification_with_bert.py
@@ -2,21 +2,18 @@
 import math
 import numpy as np
-from transformers import BertTokenizer
-from torch import nn
 import torch
+from torch import nn
+from transformers import BertTokenizer
 import nemo
-from nemo.utils.lr_policies import get_lr_policy
-
 import nemo.collections.nlp as nemo_nlp
 from nemo.collections.nlp.data.datasets.utils import SentenceClassificationDataDesc
-from nemo.collections.nlp.utils.callbacks.sentence_classification import \
- eval_iter_callback, eval_epochs_done_callback
+from nemo.collections.nlp.utils.callbacks.sentence_classification import eval_epochs_done_callback, 
eval_iter_callback +from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser( - description='Sentence classification with pretrained BERT') +parser = argparse.ArgumentParser(description='Sentence classification with pretrained BERT') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=32, type=int) parser.add_argument("--max_seq_length", default=36, type=int) @@ -29,9 +26,7 @@ parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--weight_decay", default=0.01, type=float) parser.add_argument("--fc_dropout", default=0.1, type=float) -parser.add_argument("--pretrained_bert_model", - default="bert-base-uncased", - type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--bert_checkpoint", default="", type=str) parser.add_argument("--bert_config", default="", type=str) parser.add_argument("--data_dir", required=True, type=str) @@ -42,23 +37,25 @@ parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=-1, type=int) parser.add_argument("--optimizer_kind", default="adam", type=str) -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--shuffle_data", action='store_true') -parser.add_argument("--class_balancing", default="None", type=str, - choices=["None", "weighted_loss"]) +parser.add_argument( + "--class_balancing", default="None", type=str, choices=["None", "weighted_loss"], +) args = parser.parse_args() work_dir = f'{args.work_dir}/{args.dataset_name.upper()}' -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=work_dir, - create_tb_writer=True, - files_to_copy=[__file__], - add_time_to_log_dir=True) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=work_dir, + create_tb_writer=True, + files_to_copy=[__file__], + add_time_to_log_dir=True, +) """ Load the pretrained BERT parameters See the list of pretrained models, call: @@ -66,37 +63,29 @@ """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo_nlp.huggingface.BERT( - config_filename=args.bert_config, factory=nf) + pretrained_bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config, factory=nf) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model, factory=nf) + pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model, factory=nf) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) -data_desc = SentenceClassificationDataDesc( - args.dataset_name, args.data_dir, args.do_lower_case) +data_desc = SentenceClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) # Create sentence classification loss on top -classifier = nemo_nlp.SequenceClassifier(hidden_size=hidden_size, - num_classes=data_desc.num_labels, - dropout=args.fc_dropout) +classifier = nemo_nlp.SequenceClassifier( + 
hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout, +) if args.class_balancing == 'weighted_loss': # You may need to increase the number of epochs for convergence. - loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss( - weight=data_desc.class_weights) + loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss(weight=data_desc.class_weights) else: loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() -def create_pipeline(num_samples=-1, - batch_size=32, - num_gpus=1, - local_rank=0, - mode='train'): +def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): nemo.logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' shuffle = args.shuffle_data if mode == 'train' else False @@ -109,7 +98,8 @@ def create_pipeline(num_samples=-1, shuffle=shuffle, batch_size=batch_size, num_workers=0, - local_rank=local_rank) + local_rank=local_rank, + ) ids, type_ids, input_mask, labels = data_layer() data_size = len(data_layer) @@ -122,9 +112,7 @@ def create_pipeline(num_samples=-1, steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) nemo.logging.info(f"Steps_per_epoch = {steps_per_epoch}") - hidden_states = pretrained_bert_model(input_ids=ids, - token_type_ids=type_ids, - attention_mask=input_mask) + hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) loss = loss_fn(logits=logits, labels=labels) @@ -137,18 +125,20 @@ def create_pipeline(num_samples=-1, return tensors_to_evaluate, loss, steps_per_epoch, data_layer -train_tensors, train_loss, steps_per_epoch, _ =\ - create_pipeline(num_samples=args.num_train_samples, - batch_size=args.batch_size, - num_gpus=args.num_gpus, - local_rank=args.local_rank, - mode=args.train_file_prefix) -eval_tensors, _, _, data_layer =\ - create_pipeline(num_samples=args.num_eval_samples, - batch_size=args.batch_size, - num_gpus=args.num_gpus, - local_rank=args.local_rank, - mode=args.eval_file_prefix) +train_tensors, train_loss, steps_per_epoch, _ = create_pipeline( + num_samples=args.num_train_samples, + batch_size=args.batch_size, + num_gpus=args.num_gpus, + local_rank=args.local_rank, + mode=args.train_file_prefix, +) +eval_tensors, _, _, data_layer = create_pipeline( + num_samples=args.num_eval_samples, + batch_size=args.batch_size, + num_gpus=args.num_gpus, + local_rank=args.local_rank, + mode=args.eval_file_prefix, +) # Create callbacks for train and eval modes train_callback = nemo.core.SimpleLossLoggerCallback( @@ -156,31 +146,30 @@ def create_pipeline(num_samples=-1, print_func=lambda x: str(np.round(x[0].item(), 3)), tb_writer=nf.tb_writer, get_tb_values=lambda x: [["loss", x[0]]], - step_freq=steps_per_epoch) + step_freq=steps_per_epoch, +) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, - user_iter_callback=lambda x, y: eval_iter_callback( - x, y, data_layer), - user_epochs_done_callback=lambda x: eval_epochs_done_callback( - x, f'{nf.work_dir}/graphs'), + user_iter_callback=lambda x, y: eval_iter_callback(x, y, data_layer), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, f'{nf.work_dir}/graphs'), tb_writer=nf.tb_writer, - eval_step=steps_per_epoch) + eval_step=steps_per_epoch, +) # Create callback to save checkpoints ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq) - -lr_policy_fn = get_lr_policy(args.lr_policy, - 
total_steps=args.num_epochs * steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) - -nf.train(tensors_to_optimize=[train_loss], - callbacks=[train_callback, eval_callback, ckpt_callback], - lr_policy=lr_policy_fn, - optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, - "lr": args.lr, - "weight_decay": args.weight_decay}) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +) + +lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, +) + +nf.train( + tensors_to_optimize=[train_loss], + callbacks=[train_callback, eval_callback, ckpt_callback], + lr_policy=lr_policy_fn, + optimizer=args.optimizer_kind, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay,}, +) diff --git a/examples/nlp/squad.py b/examples/nlp/squad.py index 248fc5cc1ef2..06ffe5aea307 100755 --- a/examples/nlp/squad.py +++ b/examples/nlp/squad.py @@ -64,161 +64,212 @@ import nemo import nemo.collections.nlp as nemo_nlp +from nemo.collections.nlp.utils.callbacks.squad import eval_epochs_done_callback, eval_iter_callback from nemo.utils.lr_policies import get_lr_policy -from nemo.collections.nlp.utils.callbacks.squad import ( - eval_iter_callback, - eval_epochs_done_callback) def parse_args(): parser = argparse.ArgumentParser(description="Squad_with_pretrained_BERT") - parser.add_argument("--data_dir", type=str, required=True, - help="The input data dir. Should contain " - "train.*.json, dev.*.json files " - "(or other data files) for the task.") - parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", - type=str, help="Name of the pre-trained model") - parser.add_argument("--checkpoint_dir", default=None, type=str, - help="Checkpoint directory for inference.") - parser.add_argument("--bert_checkpoint", default=None, type=str, - help="Path to BERT model checkpoint for finetuning.") - parser.add_argument("--bert_config", default=None, type=str, - help="Path to bert config file in json format") - parser.add_argument("--tokenizer_model", default="tokenizer.model", - type=str, - help="Path to pretrained tokenizer model," - "only used if --tokenizer is sentencepiece") - parser.add_argument("--tokenizer", default="nemobert", type=str, - choices=["nemobert", "sentencepiece"], - help="tokenizer to use, " - "only relevant when using custom " - "pretrained checkpoint.") - parser.add_argument("--optimizer_kind", default="adam", type=str, - help="Optimizer kind") + parser.add_argument( + "--data_dir", + type=str, + required=True, + help="The input data dir. 
Should contain " + "train.*.json, dev.*.json files " + "(or other data files) for the task.", + ) + parser.add_argument( + "--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model", + ) + parser.add_argument( + "--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.", + ) + parser.add_argument( + "--bert_checkpoint", default=None, type=str, help="Path to BERT model checkpoint for finetuning.", + ) + parser.add_argument( + "--bert_config", default=None, type=str, help="Path to bert config file in json format", + ) + parser.add_argument( + "--tokenizer_model", + default="tokenizer.model", + type=str, + help="Path to pretrained tokenizer model," "only used if --tokenizer is sentencepiece", + ) + parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, " "only relevant when using custom " "pretrained checkpoint.", + ) + parser.add_argument("--optimizer_kind", default="adam", type=str, help="Optimizer kind") parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) - parser.add_argument("--lr", default=3e-5, type=float, - help="The initial learning rate.") + parser.add_argument("--lr", default=3e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.0, type=float) - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--num_epochs", default=2, type=int, - help="Total number of training epochs to perform.") - parser.add_argument("--batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training/evaluation.") - parser.add_argument("--do_lower_case", action='store_true', - help="Whether to lower case the input text. " - "True for uncased models, False for cased models.") - parser.add_argument("--evaluation_only", action='store_true', - help="Whether to only do evaluation.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, " - "how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. " - "Questions longer than this will be truncated to " - "this length.") - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after " - "WordPiece tokenization. 
Sequences longer than this "
- "will be truncated, and sequences shorter than this "
- " will be padded.")
- parser.add_argument("--num_gpus", default=1, type=int,
- help="Number of GPUs")
- parser.add_argument("--amp_opt_level", default="O0", type=str,
- choices=["O0", "O1", "O2"],
- help="01/02 to enable mixed precision")
- parser.add_argument("--local_rank", type=int, default=None,
- help="For distributed training: local_rank")
- parser.add_argument("--work_dir", default='output_squad', type=str,
- help="The output directory where the "
- "model predictions and checkpoints "
- "will be written.")
- parser.add_argument("--save_epoch_freq", default=1, type=int,
- help="Frequency of saving checkpoint "
- "'-1' - epoch checkpoint won't be saved")
- parser.add_argument("--save_step_freq", default=-1, type=int,
- help="Frequency of saving checkpoint "
- "'-1' - step checkpoint won't be saved")
- parser.add_argument("--loss_step_freq", default=100, type=int,
- help="Frequency of printing loss")
- parser.add_argument("--eval_step_freq", default=500, type=int,
- help="Frequency of evaluation on dev data")
- parser.add_argument("--version_2_with_negative", action="store_true",
- help="If true, the SQuAD examples contain some that "
- "do not have an answer.")
- parser.add_argument('--null_score_diff_threshold',
- type=float, default=0.0,
- help="If null_score - best_non_null is "
- "greater than the threshold predict null.")
- parser.add_argument("--n_best_size", default=20, type=int,
- help="The total number of n-best predictions to "
- "generate in the nbest_predictions.json output file.")
- parser.add_argument("--batches_per_step", default=1, type=int,
- help="Number of iterations per step.")
- parser.add_argument("--max_answer_length", default=30, type=int,
- help="The maximum length of an answer that can be "
- "generated. This is needed because the start "
- "and end predictions are not conditioned "
- "on one another.")
- parser.add_argument("--output_prediction_file", type=str, required=False,
- default="predictions.json",
- help="File to write predictions to. "
- "Only in evaluation mode.")
+ parser.add_argument(
+ "--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.",
+ )
+ parser.add_argument(
+ "--num_epochs", default=2, type=int, help="Total number of training epochs to perform.",
+ )
+ parser.add_argument(
+ "--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.",
+ )
+ parser.add_argument(
+ "--do_lower_case",
+ action='store_true',
+ help="Whether to lower case the input text. " "True for uncased models, False for cased models.",
+ )
+ parser.add_argument(
+ "--evaluation_only", action='store_true', help="Whether to only do evaluation.",
+ )
+ parser.add_argument(
+ "--doc_stride",
+ default=128,
+ type=int,
+ help="When splitting up a long document into chunks, " "how much stride to take between chunks.",
+ )
+ parser.add_argument(
+ "--max_query_length",
+ default=64,
+ type=int,
+ help="The maximum number of tokens for the question. "
+ "Questions longer than this will be truncated to "
+ "this length.",
+ )
+ parser.add_argument(
+ "--max_seq_length",
+ default=384,
+ type=int,
+ help="The maximum total input sequence length after "
+ "WordPiece tokenization. Sequences longer than this "
+ "will be truncated, and sequences shorter than this "
+ " will be padded.",
+ )
+ parser.add_argument("--num_gpus", default=1, type=int, help="Number of GPUs")
+ parser.add_argument(
+ "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="O1/O2 to enable mixed precision",
+ )
+ parser.add_argument(
+ "--local_rank", type=int, default=None, help="For distributed training: local_rank",
+ )
+ parser.add_argument(
+ "--work_dir",
+ default='output_squad',
+ type=str,
+ help="The output directory where the " "model predictions and checkpoints " "will be written.",
+ )
+ parser.add_argument(
+ "--save_epoch_freq",
+ default=1,
+ type=int,
+ help="Frequency of saving checkpoint " "'-1' - epoch checkpoint won't be saved",
+ )
+ parser.add_argument(
+ "--save_step_freq",
+ default=-1,
+ type=int,
+ help="Frequency of saving checkpoint " "'-1' - step checkpoint won't be saved",
+ )
+ parser.add_argument(
+ "--loss_step_freq", default=100, type=int, help="Frequency of printing loss",
+ )
+ parser.add_argument(
+ "--eval_step_freq", default=500, type=int, help="Frequency of evaluation on dev data",
+ )
+ parser.add_argument(
+ "--version_2_with_negative",
+ action="store_true",
+ help="If true, the SQuAD examples contain some that " "do not have an answer.",
+ )
+ parser.add_argument(
+ '--null_score_diff_threshold',
+ type=float,
+ default=0.0,
+ help="If null_score - best_non_null is " "greater than the threshold predict null.",
+ )
+ parser.add_argument(
+ "--n_best_size",
+ default=20,
+ type=int,
+ help="The total number of n-best predictions to " "generate in the nbest_predictions.json output file.",
+ )
+ parser.add_argument(
+ "--batches_per_step", default=1, type=int, help="Number of iterations per step.",
+ )
+ parser.add_argument(
+ "--max_answer_length",
+ default=30,
+ type=int,
+ help="The maximum length of an answer that can be "
+ "generated. This is needed because the start "
+ "and end predictions are not conditioned "
+ "on one another.",
+ )
+ parser.add_argument(
+ "--output_prediction_file",
+ type=str,
+ required=False,
+ default="predictions.json",
+ help="File to write predictions to. 
" "Only in evaluation mode.", + ) args = parser.parse_args() return args def create_pipeline( - data_dir, - model, - head, - loss_fn, - max_query_length, - max_seq_length, - doc_stride, - batch_size, - version_2_with_negative, - num_gpus=1, - batches_per_step=1, - mode="train"): + data_dir, + model, + head, + loss_fn, + max_query_length, + max_seq_length, + doc_stride, + batch_size, + version_2_with_negative, + num_gpus=1, + batches_per_step=1, + mode="train", +): data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( - mode=mode, - version_2_with_negative=version_2_with_negative, - batch_size=batch_size, - tokenizer=tokenizer, - data_dir=data_dir, - max_query_length=max_query_length, - max_seq_length=max_seq_length, - doc_stride=doc_stride) + mode=mode, + version_2_with_negative=version_2_with_negative, + batch_size=batch_size, + tokenizer=tokenizer, + data_dir=data_dir, + max_query_length=max_query_length, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + ) input_data = data_layer() hidden_states = model( - input_ids=input_data.input_ids, - token_type_ids=input_data.input_type_ids, - attention_mask=input_data.input_mask) + input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask, + ) qa_output = head(hidden_states=hidden_states) loss_output = loss_fn( - logits=qa_output, start_positions=input_data.start_positions, - end_positions=input_data.end_positions) + logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions, + ) - steps_per_epoch = len(data_layer) \ - // (batch_size * num_gpus * batches_per_step) - return loss_output.loss, \ - steps_per_epoch, \ - [loss_output.start_logits, - loss_output.end_logits, - input_data.unique_ids], \ - data_layer + steps_per_epoch = len(data_layer) // (batch_size * num_gpus * batches_per_step) + return ( + loss_output.loss, + steps_per_epoch, + [loss_output.start_logits, loss_output.end_logits, input_data.unique_ids,], + data_layer, + ) if __name__ == "__main__": args = parse_args() if not os.path.exists(args.data_dir): - raise FileNotFoundError("SQUAD datasets not found. Datasets can be " - "obtained using scripts/download_squad.py") + raise FileNotFoundError( + "SQUAD datasets not found. 
Datasets can be " "obtained using scripts/download_squad.py" + ) if not args.version_2_with_negative: args.work_dir = f'{args.work_dir}/squad1.1' @@ -227,21 +278,23 @@ def create_pipeline( # Instantiate neural factory with supported backend nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=args.work_dir, - create_tb_writer=True, - files_to_copy=[__file__], - add_time_to_log_dir=True) + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=args.work_dir, + create_tb_writer=True, + files_to_copy=[__file__], + add_time_to_log_dir=True, + ) if args.tokenizer == "sentencepiece": try: - tokenizer = nemo_nlp.SentencePieceTokenizer( - model_path=args.tokenizer_model) + tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=args.tokenizer_model) except Exception: - raise ValueError("Using --tokenizer=sentencepiece \ - requires valid --tokenizer_model") + raise ValueError( + "Using --tokenizer=sentencepiece \ + requires valid --tokenizer_model" + ) tokenizer.add_special_tokens(["[CLS]", "[SEP]"]) elif args.tokenizer == "nemobert": tokenizer = nemo_nlp.NemoBertTokenizer(args.pretrained_bert_model) @@ -257,37 +310,17 @@ def create_pipeline( To see the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ - model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = model.local_parameters["hidden_size"] - qa_head = nemo_nlp.TokenClassifier( - hidden_size=hidden_size, - num_classes=2, - num_layers=1, - log_softmax=False) + qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False) squad_loss = nemo_nlp.QuestionAnsweringLoss() if args.bert_checkpoint is not None: model.restore_from(args.bert_checkpoint) if not args.evaluation_only: - train_loss, train_steps_per_epoch, _, _ = \ - create_pipeline( - data_dir=args.data_dir, - model=model, - head=qa_head, - loss_fn=squad_loss, - max_query_length=args.max_query_length, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - batch_size=args.batch_size, - version_2_with_negative=args.version_2_with_negative, - num_gpus=args.num_gpus, - batches_per_step=args.batches_per_step, - mode="train") - _, _, eval_output, eval_data_layer = \ - create_pipeline( + train_loss, train_steps_per_epoch, _, _ = create_pipeline( data_dir=args.data_dir, model=model, head=qa_head, @@ -299,7 +332,22 @@ def create_pipeline( version_2_with_negative=args.version_2_with_negative, num_gpus=args.num_gpus, batches_per_step=args.batches_per_step, - mode="dev") + mode="train", + ) + _, _, eval_output, eval_data_layer = create_pipeline( + data_dir=args.data_dir, + model=model, + head=qa_head, + loss_fn=squad_loss, + max_query_length=args.max_query_length, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + batch_size=args.batch_size, + version_2_with_negative=args.version_2_with_negative, + num_gpus=args.num_gpus, + batches_per_step=args.batches_per_step, + mode="dev", + ) if not args.evaluation_only: nemo.logging.info(f"steps_per_epoch = {train_steps_per_epoch}") @@ -308,30 +356,33 @@ def create_pipeline( print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], step_freq=args.loss_step_freq, - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, + 
) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + ) callbacks_eval = nemo.core.EvaluatorCallback( eval_tensors=eval_output, user_iter_callback=lambda x, y: eval_iter_callback(x, y), - user_epochs_done_callback=lambda x: - eval_epochs_done_callback( - x, eval_data_layer=eval_data_layer, - do_lower_case=args.do_lower_case, - n_best_size=args.n_best_size, - max_answer_length=args.max_answer_length, - version_2_with_negative=args.version_2_with_negative, - null_score_diff_threshold=args.null_score_diff_threshold), - tb_writer=nf.tb_writer, - eval_step=args.eval_step_freq) + user_epochs_done_callback=lambda x: eval_epochs_done_callback( + x, + eval_data_layer=eval_data_layer, + do_lower_case=args.do_lower_case, + n_best_size=args.n_best_size, + max_answer_length=args.max_answer_length, + version_2_with_negative=args.version_2_with_negative, + null_score_diff_threshold=args.null_score_diff_threshold, + ), + tb_writer=nf.tb_writer, + eval_step=args.eval_step_freq, + ) lr_policy_fn = get_lr_policy( - args.lr_policy, - total_steps=args.num_epochs * train_steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) + args.lr_policy, + total_steps=args.num_epochs * train_steps_per_epoch, + warmup_ratio=args.lr_warmup_proportion, + ) nf.train( tensors_to_optimize=[train_loss], @@ -339,17 +390,13 @@ def create_pipeline( lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, batches_per_step=args.batches_per_step, - optimization_params={ - "num_epochs": args.num_epochs, - "lr": args.lr}) + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr}, + ) else: if args.checkpoint_dir is not None: load_from_folder = args.checkpoint_dir - evaluated_tensors = nf.infer( - tensors=eval_output, - checkpoint_dir=load_from_folder, - cache=True) + evaluated_tensors = nf.infer(tensors=eval_output, checkpoint_dir=load_from_folder, cache=True) unique_ids = [] start_logits = [] end_logits = [] @@ -368,7 +415,8 @@ def create_pipeline( max_answer_length=args.max_answer_length, version_2_with_negative=args.version_2_with_negative, null_score_diff_threshold=args.null_score_diff_threshold, - do_lower_case=args.do_lower_case) + do_lower_case=args.do_lower_case, + ) nemo.logging.info(f"exact_match: {exact_match}, f1: {f1}") if args.output_prediction_file is not None: with open(args.output_prediction_file, "w") as writer: diff --git a/examples/nlp/token_classification.py b/examples/nlp/token_classification.py index 60c8ba134768..86665339b61b 100644 --- a/examples/nlp/token_classification.py +++ b/examples/nlp/token_classification.py @@ -6,18 +6,17 @@ import sys import nemo -from nemo.utils.lr_policies import get_lr_policy - import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, \ - TokenClassifier, TokenClassificationLoss +from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, TokenClassificationLoss, TokenClassifier from nemo.collections.nlp.data.datasets import utils -from nemo.collections.nlp.utils.callbacks.token_classification import \ - eval_iter_callback, eval_epochs_done_callback +from nemo.collections.nlp.utils.callbacks.token_classification import eval_epochs_done_callback, eval_iter_callback +from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser(description="Token classification\ - with 
pretrained BERT") +parser = argparse.ArgumentParser( + description="Token classification\ + with pretrained BERT" +) parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=8, type=int) parser.add_argument("--max_seq_length", default=128, type=int) @@ -28,8 +27,7 @@ parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--weight_decay", default=0, type=float) parser.add_argument("--optimizer_kind", default="adam", type=str) -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--data_dir", default="/data", type=str) parser.add_argument("--fc_dropout", default=0.5, type=float) parser.add_argument("--num_fc_layers", default=2, type=int) @@ -37,49 +35,76 @@ parser.add_argument("--ignore_extra_tokens", action='store_false') parser.add_argument("--none_label", default='O', type=str) parser.add_argument("--shuffle_data", action='store_false') -parser.add_argument("--pretrained_bert_model", - default="bert-base-cased", type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-cased", type=str) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument("--bert_config", default=None, type=str, - help="Path to bert config file in json format") -parser.add_argument("--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece") -parser.add_argument("--tokenizer", default="nemobert", type=str, - choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - only relevant when using custom pretrained checkpoint.") -parser.add_argument("--work_dir", default='output', type=str, - help="The output directory where the model prediction\ - and checkpoints will be written.") -parser.add_argument("--use_cache", action='store_true', - help="Whether to cache preprocessed data") -parser.add_argument("--save_epoch_freq", default=1, type=int, - help="Frequency of saving checkpoint\ - '-1' - step checkpoint won't be saved") -parser.add_argument("--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint \ - '-1' - step checkpoint won't be saved") -parser.add_argument("--loss_step_freq", default=250, type=int, - help="Frequency of printing loss") -parser.add_argument("--use_weighted_loss", action='store_true', - help="Flag to indicate whether to use weighted loss") +parser.add_argument( + "--bert_config", default=None, type=str, help="Path to bert config file in json format", +) +parser.add_argument( + "--tokenizer_model", + default="tokenizer.model", + type=str, + help="Path to pretrained tokenizer model, \ + only used if --tokenizer is sentencepiece", +) +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, \ + only relevant when using custom pretrained checkpoint.", +) +parser.add_argument( + "--work_dir", + default='output', + type=str, + help="The output directory where the model prediction\ + and checkpoints will be written.", +) +parser.add_argument( + "--use_cache", action='store_true', help="Whether to cache preprocessed data", +) +parser.add_argument( + "--save_epoch_freq", + default=1, + type=int, + help="Frequency of saving checkpoint\ + '-1' - step checkpoint won't be saved", +) +parser.add_argument( + "--save_step_freq", + 
default=-1,
+ type=int,
+ help="Frequency of saving checkpoint \
+ '-1' - step checkpoint won't be saved",
+)
+parser.add_argument(
+ "--loss_step_freq", default=250, type=int, help="Frequency of printing loss",
+)
+parser.add_argument(
+ "--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss",
+)
 args = parser.parse_args()
 if not os.path.exists(args.data_dir):
- raise FileNotFoundError("Dataset not found. For NER, CoNLL-2003 dataset"
- "can be obtained at"
- "https://github.com/kyzhouhzau/BERT"
- "-NER/tree/master/data.")
-
-nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch,
- local_rank=args.local_rank,
- optimization_level=args.amp_opt_level,
- log_dir=args.work_dir,
- create_tb_writer=True,
- files_to_copy=[__file__],
- add_time_to_log_dir=True)
+ raise FileNotFoundError(
+ "Dataset not found. For NER, CoNLL-2003 dataset "
+ "can be obtained at "
+ "https://github.com/kyzhouhzau/BERT"
+ "-NER/tree/master/data."
+ )
+
+nf = nemo.core.NeuralModuleFactory(
+ backend=nemo.core.Backend.PyTorch,
+ local_rank=args.local_rank,
+ optimization_level=args.amp_opt_level,
+ log_dir=args.work_dir,
+ create_tb_writer=True,
+ files_to_copy=[__file__],
+ add_time_to_log_dir=True,
+)
 nemo.logging.info(args)
@@ -91,8 +116,7 @@
 nemo_nlp.huggingface.BERT.list_pretrained_models()
 """
 tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
- model = nemo_nlp.huggingface.BERT(
- pretrained_model_name=args.pretrained_bert_model)
+ model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)
 else:
 """ Use this if you're using a BERT model that you pre-trained yourself. """
@@ -108,8 +132,7 @@
 config = json.load(json_file)
 model = nemo_nlp.huggingface.BERT(**config)
 else:
- model = nemo_nlp.huggingface.BERT(
- pretrained_model_name=args.pretrained_bert_model)
+ model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model)
 model.restore_from(args.bert_checkpoint)
 nemo.logging.info(f"Model restored from {args.bert_checkpoint}")
@@ -121,19 +144,21 @@
 task_loss = "TokenClassificationLoss"
-def create_pipeline(num_samples=-1,
- pad_label=args.none_label,
- max_seq_length=args.max_seq_length,
- batch_size=args.batch_size,
- local_rank=args.local_rank,
- num_gpus=args.num_gpus,
- mode='train',
- label_ids=None,
- ignore_extra_tokens=args.ignore_extra_tokens,
- ignore_start_end=args.ignore_start_end,
- use_cache=args.use_cache,
- dropout=args.fc_dropout,
- num_layers=args.num_fc_layers):
+def create_pipeline(
+ num_samples=-1,
+ pad_label=args.none_label,
+ max_seq_length=args.max_seq_length,
+ batch_size=args.batch_size,
+ local_rank=args.local_rank,
+ num_gpus=args.num_gpus,
+ mode='train',
+ label_ids=None,
+ ignore_extra_tokens=args.ignore_extra_tokens,
+ ignore_start_end=args.ignore_start_end,
+ use_cache=args.use_cache,
+ dropout=args.fc_dropout,
+ num_layers=args.num_fc_layers,
+):
 global classifier, task_loss
@@ -144,7 +169,8 @@ def create_pipeline(num_samples=-1,
 label_file = f'{args.data_dir}/labels_{mode}.txt'
 if not (os.path.exists(text_file) or (os.path.exists(label_file))):
- raise FileNotFoundError(f'{text_file} or {label_file} not found. \
+ raise FileNotFoundError(
+ f'{text_file} or {label_file} not found. \
 The data should be splitted into 2 files: text.txt and labels.txt. \
 Each line of the text.txt file contains text sequences, where words\
 are separated with spaces. The labels.txt file contains \
@@ -152,7 +178,8 @@ def create_pipeline(num_samples=-1,
 separated with spaces. 
Each line of the files should follow the \ format: \ [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and \ - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).') + [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' + ) data_layer = nemo_nlp.BertTokenClassificationDataLayer( tokenizer=tokenizer, @@ -167,10 +194,10 @@ def create_pipeline(num_samples=-1, shuffle=shuffle, ignore_extra_tokens=ignore_extra_tokens, ignore_start_end=ignore_start_end, - use_cache=use_cache) + use_cache=use_cache, + ) - input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \ - labels = data_layer() + (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels,) = data_layer() if mode == 'train': label_ids = data_layer.dataset.label_ids @@ -184,18 +211,14 @@ def create_pipeline(num_samples=-1, nemo.logging.info(f"class_weights: {class_weights}") classifier = getattr(sys.modules[__name__], classifier) - classifier = classifier(hidden_size=hidden_size, - num_classes=len(label_ids), - dropout=dropout, - num_layers=num_layers) + classifier = classifier( + hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers, + ) task_loss = getattr(sys.modules[__name__], task_loss) - task_loss = task_loss(num_classes=len(label_ids), - class_weights=class_weights) + task_loss = task_loss(num_classes=len(label_ids), class_weights=class_weights) - hidden_states = model(input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) logits = classifier(hidden_states=hidden_states) loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask) @@ -211,8 +234,7 @@ def create_pipeline(num_samples=-1, train_tensors, train_loss, steps_per_epoch, label_ids, _ = create_pipeline() -eval_tensors, _, _, _, data_layer = create_pipeline(mode='dev', - label_ids=label_ids) +eval_tensors, _, _, _, data_layer = create_pipeline(mode='dev', label_ids=label_ids) nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") @@ -221,28 +243,29 @@ def create_pipeline(num_samples=-1, tensors=train_tensors, print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y), - user_epochs_done_callback=lambda x: - eval_epochs_done_callback(x, label_ids, f'{nf.work_dir}/graphs'), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids, f'{nf.work_dir}/graphs'), tb_writer=nf.tb_writer, - eval_step=steps_per_epoch) + eval_step=steps_per_epoch, +) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq) - -lr_policy_fn = get_lr_policy(args.lr_policy, - total_steps=args.num_epochs * steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion) - -nf.train(tensors_to_optimize=[train_loss], - callbacks=[train_callback, eval_callback, ckpt_callback], - lr_policy=lr_policy_fn, - optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, - "lr": args.lr}) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +) + +lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, +) + +nf.train( + 
tensors_to_optimize=[train_loss], + callbacks=[train_callback, eval_callback, ckpt_callback], + lr_policy=lr_policy_fn, + optimizer=args.optimizer_kind, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr}, +) diff --git a/examples/nlp/token_classification_infer.py b/examples/nlp/token_classification_infer.py index b9470dd19caa..4205909f41cc 100644 --- a/examples/nlp/token_classification_infer.py +++ b/examples/nlp/token_classification_infer.py @@ -13,23 +13,29 @@ parser = argparse.ArgumentParser(description='NER with pretrained BERT') parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--fc_dropout", default=0, type=float) -parser.add_argument("--pretrained_bert_model", - default="bert-base-cased", type=str) +parser.add_argument("--pretrained_bert_model", default="bert-base-cased", type=str) parser.add_argument("--none_label", default='O', type=str) -parser.add_argument("--queries", action='append', - default=['we bought four shirts from the nvidia gear ' + - 'store in santa clara', 'Nvidia is a company', - 'The Adventures of Tom Sawyer by Mark Twain ' + - 'is an 1876 novel about a young boy growing ' + - 'up along the Mississippi River'], - help="Example: --queries 'San Francisco' --queries 'LA'") -parser.add_argument("--add_brackets", action='store_false', - help="Whether to take predicted label in brackets or \ - just append to word in the output") +parser.add_argument( + "--queries", + action='append', + default=[ + 'we bought four shirts from the nvidia gear ' + 'store in santa clara', + 'Nvidia is a company', + 'The Adventures of Tom Sawyer by Mark Twain ' + + 'is an 1876 novel about a young boy growing ' + + 'up along the Mississippi River', + ], + help="Example: --queries 'San Francisco' --queries 'LA'", +) +parser.add_argument( + "--add_brackets", + action='store_false', + help="Whether to take predicted label in brackets or \ + just append to word in the output", +) parser.add_argument("--work_dir", default='output/checkpoints', type=str) parser.add_argument("--labels_dict", default='label_ids.csv', type=str) -parser.add_argument("--amp_opt_level", default="O0", - type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) args = parser.parse_args() print(args) @@ -37,12 +43,11 @@ if not os.path.exists(args.work_dir): raise ValueError(f'Work directory not found at {args.work_dir}') if not os.path.exists(args.labels_dict): - raise ValueError( - f'Dictionary with ids to labels not found at {args.labels_dict}') + raise ValueError(f'Dictionary with ids to labels not found at {args.labels_dict}') -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - optimization_level=args.amp_opt_level, - log_dir=None) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, +) labels_dict = get_vocab(args.labels_dict) @@ -50,35 +55,25 @@ See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] tokenizer = NemoBertTokenizer(args.pretrained_bert_model) data_layer = nemo_nlp.BertTokenClassificationInferDataLayer( - queries=args.queries, - tokenizer=tokenizer, - 
max_seq_length=args.max_seq_length, - batch_size=1) + queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +) -classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size, - num_classes=len(labels_dict), - dropout=args.fc_dropout) +classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=len(labels_dict), dropout=args.fc_dropout,) input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask) +hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) logits = classifier(hidden_states=hidden_states) ########################################################################### # Instantiate an optimizer to perform `infer` action -evaluated_tensors = nf.infer( - tensors=[logits, subtokens_mask], - checkpoint_dir=args.work_dir, -) +evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask], checkpoint_dir=args.work_dir,) def concatenate(lists): @@ -93,8 +88,7 @@ def add_brackets(text, add=args.add_brackets): return '[' + text + ']' if add else text -logits, subtokens_mask = \ - [concatenate(tensors) for tensors in evaluated_tensors] +logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors] preds = np.argmax(logits, axis=2) diff --git a/examples/nlp/transformer_lm.py b/examples/nlp/transformer_lm.py index ed6a1e39312d..41ca2e960ffb 100644 --- a/examples/nlp/transformer_lm.py +++ b/examples/nlp/transformer_lm.py @@ -2,13 +2,10 @@ import math import nemo -from nemo.utils.lr_policies import CosineAnnealing import nemo.collections.nlp as nemo_nlp - from nemo.collections.nlp.data.datasets.utils import LanguageModelDataDesc -from nemo.collections.nlp.utils.callbacks.language_modeling import eval_iter_callback, \ - eval_epochs_done_callback - +from nemo.collections.nlp.utils.callbacks.language_modeling import eval_epochs_done_callback, eval_iter_callback +from nemo.utils.lr_policies import CosineAnnealing parser = nemo.utils.NemoArgParser(description='LM Transformer') parser.set_defaults( @@ -27,7 +24,7 @@ warmup_steps=1000, max_steps=50000, iter_per_step=1, - eval_freq=1000 + eval_freq=1000, ) parser.add_argument("--data_dir", default="data/lm/wikitext-2", type=str) parser.add_argument("--dataset_name", default="wikitext-2", type=str) @@ -56,15 +53,16 @@ """ work_dir = f'{args.work_dir}/{args.dataset_name.upper()}' -nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=args.amp_opt_level, - log_dir=args.work_dir, - create_tb_writer=True, - files_to_copy=[__file__]) +nf = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=args.work_dir, + create_tb_writer=True, + files_to_copy=[__file__], +) -data_desc = LanguageModelDataDesc( - args.dataset_name, args.data_dir, args.do_lower_case) +data_desc = LanguageModelDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) # define tokenizer, in this example we use word-level tokenizer # we also adjust the vocabulary size to make it multiple of 8 to accelerate @@ -87,45 +85,41 @@ mask_future=True, attn_score_dropout=args.attn_score_dropout, attn_layer_dropout=args.attn_layer_dropout, - max_seq_length=args.max_seq_length) + max_seq_length=args.max_seq_length, +) -log_softmax = nemo_nlp.TokenClassifier(args.d_model, - 
num_classes=vocab_size, - num_layers=1, - log_softmax=True) +log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True) -loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM( - pad_id=tokenizer.pad_id(), - label_smoothing=args.label_smoothing) +loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=args.label_smoothing) # tie weight of embedding and log_softmax layers -log_softmax.mlp.last_linear_layer.weight = \ - encoder.embedding_layer.token_embedding.weight - - -def create_pipeline(dataset, - max_seq_length=args.max_seq_length, - batch_step=args.max_seq_length, - batch_size=args.batch_size): - data_layer = nemo_nlp.LanguageModelingDataLayer(dataset, - tokenizer, - max_seq_length, - batch_step, - batch_size=batch_size) +log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight + + +def create_pipeline( + dataset, max_seq_length=args.max_seq_length, batch_step=args.max_seq_length, batch_size=args.batch_size, +): + data_layer = nemo_nlp.LanguageModelingDataLayer( + dataset, tokenizer, max_seq_length, batch_step, batch_size=batch_size + ) src, src_mask, labels = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) logits = log_softmax(hidden_states=src_hiddens) return loss(logits=logits, target_ids=labels) -train_loss = create_pipeline(f"{args.data_dir}/{args.train_dataset}", - args.max_seq_length, - batch_step=args.max_seq_length, - batch_size=args.batch_size) -eval_loss = create_pipeline(f"{args.data_dir}/{args.eval_dataset}", - args.max_seq_length, - batch_step=args.predict_last_k, - batch_size=args.eval_batch_size) +train_loss = create_pipeline( + f"{args.data_dir}/{args.train_dataset}", + args.max_seq_length, + batch_step=args.max_seq_length, + batch_size=args.batch_size, +) +eval_loss = create_pipeline( + f"{args.data_dir}/{args.eval_dataset}", + args.max_seq_length, + batch_step=args.predict_last_k, + batch_size=args.eval_batch_size, +) # callback which prints training loss once in a while train_callback = nemo.core.SimpleLossLoggerCallback( @@ -133,7 +127,8 @@ def create_pipeline(dataset, step_freq=100, print_func=lambda x: str(x[0].item()), get_tb_values=lambda x: [["loss", x[0]]], - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) # callback which calculates evaluation loss eval_callback = nemo.core.EvaluatorCallback( @@ -141,14 +136,13 @@ def create_pipeline(dataset, user_iter_callback=eval_iter_callback, user_epochs_done_callback=eval_epochs_done_callback, eval_step=args.eval_freq, - tb_writer=nf.tb_writer) + tb_writer=nf.tb_writer, +) # callback which saves checkpoints once in a while callback_ckpt = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, - epoch_freq=args.save_epoch_freq, - step_freq=args.save_step_freq, - checkpoints_to_keep=-1) + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=-1, +) # define learning rate decay policy lr_policy_fn = CosineAnnealing(args.max_steps, warmup_steps=args.warmup_steps) @@ -161,12 +155,16 @@ def create_pipeline(dataset, if not args.interactive: callbacks.extend([train_callback, eval_callback]) -nf.train(tensors_to_optimize=[train_loss], - callbacks=callbacks, - lr_policy=lr_policy_fn, - batches_per_step=args.iter_per_step, - optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, - "lr": args.lr, - "weight_decay": args.weight_decay, - "betas": (args.beta1, args.beta2)}) +nf.train( + 
tensors_to_optimize=[train_loss], + callbacks=callbacks, + lr_policy=lr_policy_fn, + batches_per_step=args.iter_per_step, + optimizer=args.optimizer_kind, + optimization_params={ + "num_epochs": args.num_epochs, + "lr": args.lr, + "weight_decay": args.weight_decay, + "betas": (args.beta1, args.beta2), + }, +) diff --git a/examples/start_here/chatbot_example.py b/examples/start_here/chatbot_example.py index 8a507cfac567..d6be2c6e37ed 100644 --- a/examples/start_here/chatbot_example.py +++ b/examples/start_here/chatbot_example.py @@ -1,6 +1,7 @@ -import os import gzip +import os import shutil + import nemo # Get Data @@ -47,10 +48,8 @@ # express activations flow src, src_lengths, tgt, mask, max_tgt_length = dl() -encoder_outputs, encoder_hidden = encoder(input_seq=src, - input_lengths=src_lengths) -outputs, hidden = decoder(targets=tgt, encoder_outputs=encoder_outputs, - max_target_len=max_tgt_length) +encoder_outputs, encoder_hidden = encoder(input_seq=src, input_lengths=src_lengths) +outputs, hidden = decoder(targets=tgt, encoder_outputs=encoder_outputs, max_target_len=max_tgt_length) loss = L(predictions=outputs, target=tgt, mask=mask) # run inference decoder to generate predictions @@ -69,13 +68,11 @@ def outputs2words(tensors, vocab): response = ' '.join([s for s in response if s != 'EOS' and s != 'PAD']) target = ' '.join([s for s in target if s != 'EOS' and s != 'PAD']) print(f"Train Loss:{str(tensors[0].item())}") - print(f"SOURCE: {source} <---> PREDICTED RESPONSE: {response} " - f"<---> TARGET: {target}") + print(f"SOURCE: {source} <---> PREDICTED RESPONSE: {response} " f"<---> TARGET: {target}") callback = nemo.core.SimpleLossLoggerCallback( - tensors=[loss, src, outputs_inf, tgt], - print_func=lambda x: outputs2words(x, dl.voc.index2word) + tensors=[loss, src, outputs_inf, tgt], print_func=lambda x: outputs2words(x, dl.voc.index2word), ) # start training @@ -83,4 +80,5 @@ def outputs2words(tensors, vocab): tensors_to_optimize=[loss], callbacks=[callback], optimizer="adam", - optimization_params={"num_epochs": config["num_epochs"], "lr": 0.001}) + optimization_params={"num_epochs": config["num_epochs"], "lr": 0.001}, +) diff --git a/examples/start_here/chatbot_example2.py b/examples/start_here/chatbot_example2.py index 66e36fd59dbe..79f7220eef1c 100644 --- a/examples/start_here/chatbot_example2.py +++ b/examples/start_here/chatbot_example2.py @@ -1,7 +1,8 @@ # Copyright (c) 2019 NVIDIA Corporation -import os import gzip +import os import shutil + import nemo # Get Data @@ -29,52 +30,31 @@ } # instantiate Neural Factory with supported backend -neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, - local_rank=None) +neural_factory = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, local_rank=None) # instantiate necessary neural modules -dl = neural_factory.get_module( - name="DialogDataLayer", collection="tutorials", - params=config) +dl = neural_factory.get_module(name="DialogDataLayer", collection="tutorials", params=config) # Instance one on EncoderRNN -encoder1 = neural_factory.get_module( - name="EncoderRNN", collection="tutorials", - params=config) +encoder1 = neural_factory.get_module(name="EncoderRNN", collection="tutorials", params=config) # Instance two on EncoderRNN. 
It will have different weights from instance one -encoder2 = neural_factory.get_module( - name="EncoderRNN", collection="tutorials", - params=config) -mixer = neural_factory.get_module( - name="SimpleCombiner", collection="common", - params={} -) +encoder2 = neural_factory.get_module(name="EncoderRNN", collection="tutorials", params=config) +mixer = neural_factory.get_module(name="SimpleCombiner", collection="common", params={}) -decoder = neural_factory.get_module( - name="LuongAttnDecoderRNN", collection="tutorials", - params=config) +decoder = neural_factory.get_module(name="LuongAttnDecoderRNN", collection="tutorials", params=config) -L = neural_factory.get_module( - name="MaskedXEntropyLoss", collection="tutorials", - params={}) +L = neural_factory.get_module(name="MaskedXEntropyLoss", collection="tutorials", params={}) -decoderInfer = neural_factory.get_module( - name="GreedyLuongAttnDecoderRNN", collection="tutorials", - params=config) +decoderInfer = neural_factory.get_module(name="GreedyLuongAttnDecoderRNN", collection="tutorials", params=config) # notice trainng and inference decoder share parameters decoderInfer.tie_weights_with(decoder, list(decoder.get_weights().keys())) # express activations flow src, src_lengths, tgt, mask, max_tgt_length = dl() -encoder_outputs1, encoder_hidden1 = encoder1(input_seq=src, - input_lengths=src_lengths) -encoder_outputs2, encoder_hidden2 = encoder2(input_seq=src, - input_lengths=src_lengths) +encoder_outputs1, encoder_hidden1 = encoder1(input_seq=src, input_lengths=src_lengths) +encoder_outputs2, encoder_hidden2 = encoder2(input_seq=src, input_lengths=src_lengths) encoder_outputs = mixer(x1=encoder_outputs1, x2=encoder_outputs2) -outputs, hidden = decoder(targets=tgt, - encoder_outputs=encoder_outputs, - max_target_len=max_tgt_length) +outputs, hidden = decoder(targets=tgt, encoder_outputs=encoder_outputs, max_target_len=max_tgt_length) loss = L(predictions=outputs, target=tgt, mask=mask) # run inference decoder to generate predictions @@ -100,8 +80,8 @@ def outputs2words(tensors, vocab): # Create trainer and execute training action callback = nemo.core.SimpleLossLoggerCallback( - tensors=[loss, src, outputs_inf, tgt], - print_func=lambda x: outputs2words(x, dl.voc.index2word)) + tensors=[loss, src, outputs_inf, tgt], print_func=lambda x: outputs2words(x, dl.voc.index2word), +) # Instantiate an optimizer to perform `train` action optimizer = neural_factory.get_trainer() @@ -109,6 +89,5 @@ def outputs2words(tensors, vocab): tensors_to_optimize=[loss], callbacks=[callback], optimizer="adam", - optimization_params={"num_epochs": config["num_epochs"], - "lr": 0.001} + optimization_params={"num_epochs": config["num_epochs"], "lr": 0.001}, ) diff --git a/examples/start_here/simplest_example.py b/examples/start_here/simplest_example.py index 44599c7440d3..2c275cc9f0af 100644 --- a/examples/start_here/simplest_example.py +++ b/examples/start_here/simplest_example.py @@ -1,5 +1,6 @@ # Copyright (c) 2019 NVIDIA Corporation import nemo + nf = nemo.core.NeuralModuleFactory() # To use CPU-only do: # from nemo.core import DeviceType @@ -7,8 +8,7 @@ # instantiate necessary neural modules # RealFunctionDataLayer defaults to f=torch.sin, sampling from x=[-4, 4] -dl = nemo.tutorials.RealFunctionDataLayer( - n=10000, batch_size=128) +dl = nemo.tutorials.RealFunctionDataLayer(n=10000, batch_size=128) fx = nemo.tutorials.TaylorNet(dim=4) loss = nemo.tutorials.MSELoss() @@ -19,10 +19,10 @@ # SimpleLossLoggerCallback will print loss values to console. 
callback = nemo.core.SimpleLossLoggerCallback( - tensors=[lss], - print_func=lambda x: print(f'Train Loss: {str(x[0].item())}')) + tensors=[lss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'), +) # Invoke "train" action -nf.train([lss], callbacks=[callback], - optimization_params={"num_epochs": 3, "lr": 0.0003}, - optimizer="sgd") +nf.train( + [lss], callbacks=[callback], optimization_params={"num_epochs": 3, "lr": 0.0003}, optimizer="sgd", +) diff --git a/examples/tts/tacotron2.py b/examples/tts/tacotron2.py index b1717f46111d..2980ddf3e701 100644 --- a/examples/tts/tacotron2.py +++ b/examples/tts/tacotron2.py @@ -1,31 +1,29 @@ # Copyright (c) 2019 NVIDIA Corporation import argparse import copy -from functools import partial import math import os +from functools import partial from ruamel.yaml import YAML import nemo - -from nemo.utils.lr_policies import CosineAnnealing -import nemo.utils.argparse as nm_argparse - import nemo.collections.asr as nemo_asr import nemo.collections.tts as nemo_tts - -from nemo.collections.tts import tacotron2_eval_log_to_tb_func -from nemo.collections.tts import tacotron2_log_to_tb_func -from nemo.collections.tts import tacotron2_process_eval_batch -from nemo.collections.tts import tacotron2_process_final_eval +import nemo.utils.argparse as nm_argparse +from nemo.collections.tts import ( + tacotron2_eval_log_to_tb_func, + tacotron2_log_to_tb_func, + tacotron2_process_eval_batch, + tacotron2_process_final_eval, +) +from nemo.utils.lr_policies import CosineAnnealing def parse_args(): parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], - description='Tacotron2', - conflict_handler='resolve') + parents=[nm_argparse.NemoArgParser()], description='Tacotron2', conflict_handler='resolve', + ) parser.set_defaults( checkpoint_dir=None, optimizer="adam", @@ -35,20 +33,23 @@ def parse_args(): amp_opt_level="O0", create_tb_writer=True, lr_policy=None, - weight_decay=1e-6 + weight_decay=1e-6, ) # Overwrite default args - parser.add_argument("--max_steps", type=int, default=None, required=False, - help="max number of steps to train") - parser.add_argument("--num_epochs", type=int, default=None, required=False, - help="number of epochs to train") - parser.add_argument("--model_config", type=str, required=True, - help="model configuration file: model.yaml") - parser.add_argument("--grad_norm_clip", type=float, default=1.0, - help="gradient clipping") - parser.add_argument("--min_lr", type=float, default=1e-5, - help="minimum learning rate to decay to") + parser.add_argument( + "--max_steps", type=int, default=None, required=False, help="max number of steps to train", + ) + parser.add_argument( + "--num_epochs", type=int, default=None, required=False, help="number of epochs to train", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) + parser.add_argument("--grad_norm_clip", type=float, default=1.0, help="gradient clipping") + parser.add_argument( + "--min_lr", type=float, default=1e-5, help="minimum learning rate to decay to", + ) # Create new args parser.add_argument("--exp_name", default="Tacotron2", type=str) @@ -62,10 +63,11 @@ def parse_args(): if args.eval_freq % 25 != 0: raise ValueError("eval_freq should be a multiple of 25.") - exp_directory = [f"{args.exp_name}-lr_{args.lr}-bs_{args.batch_size}", - "", - (f"-wd_{args.weight_decay}-opt_{args.optimizer}" - f"-ips_{args.iter_per_step}")] + exp_directory = [ + f"{args.exp_name}-lr_{args.lr}-bs_{args.batch_size}", 
+ "", + (f"-wd_{args.weight_decay}-opt_{args.optimizer}" f"-ips_{args.iter_per_step}"), + ] if args.max_steps: exp_directory[1] = f"-s_{args.max_steps}" elif args.num_epochs: @@ -77,42 +79,47 @@ def parse_args(): def create_NMs(tacotron2_params, decoder_infer=False): data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - **tacotron2_params["AudioToMelSpectrogramPreprocessor"]) + **tacotron2_params["AudioToMelSpectrogramPreprocessor"] + ) text_embedding = nemo_tts.TextEmbedding( - len(tacotron2_params["labels"]) + 3, # + 3 special chars - **tacotron2_params["TextEmbedding"]) + len(tacotron2_params["labels"]) + 3, **tacotron2_params["TextEmbedding"], # + 3 special chars + ) t2_enc = nemo_tts.Tacotron2Encoder(**tacotron2_params["Tacotron2Encoder"]) if decoder_infer: - t2_dec = nemo_tts.Tacotron2DecoderInfer( - **tacotron2_params["Tacotron2Decoder"]) + t2_dec = nemo_tts.Tacotron2DecoderInfer(**tacotron2_params["Tacotron2Decoder"]) else: - t2_dec = nemo_tts.Tacotron2Decoder( - **tacotron2_params["Tacotron2Decoder"]) - t2_postnet = nemo_tts.Tacotron2Postnet( - **tacotron2_params["Tacotron2Postnet"]) + t2_dec = nemo_tts.Tacotron2Decoder(**tacotron2_params["Tacotron2Decoder"]) + t2_postnet = nemo_tts.Tacotron2Postnet(**tacotron2_params["Tacotron2Postnet"]) t2_loss = nemo_tts.Tacotron2Loss(**tacotron2_params["Tacotron2Loss"]) makegatetarget = nemo_tts.MakeGate() - total_weights = (text_embedding.num_weights + t2_enc.num_weights - + t2_dec.num_weights + t2_postnet.num_weights) + total_weights = text_embedding.num_weights + t2_enc.num_weights + t2_dec.num_weights + t2_postnet.num_weights nemo.logging.info('================================') nemo.logging.info(f"Total number of parameters: {total_weights}") nemo.logging.info('================================') - return (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, - t2_loss, makegatetarget) + return ( + data_preprocessor, + text_embedding, + t2_enc, + t2_dec, + t2_postnet, + t2_loss, + makegatetarget, + ) -def create_train_dag(neural_factory, - neural_modules, - tacotron2_params, - train_dataset, - batch_size, - log_freq, - checkpoint_save_freq, - cpu_per_dl=1): - (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, - t2_loss, makegatetarget) = neural_modules +def create_train_dag( + neural_factory, + neural_modules, + tacotron2_params, + train_dataset, + batch_size, + log_freq, + checkpoint_save_freq, + cpu_per_dl=1, +): + (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss, makegatetarget,) = neural_modules train_dl_params = copy.deepcopy(tacotron2_params["AudioToTextDataLayer"]) train_dl_params.update(tacotron2_params["AudioToTextDataLayer"]["train"]) @@ -136,21 +143,15 @@ def create_train_dag(neural_factory, # Train DAG audio, audio_len, transcript, transcript_len = data_layer() - spec_target, spec_target_len = data_preprocessor( - input_signal=audio, - length=audio_len) + spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len) transcript_embedded = text_embedding(char_phone=transcript) - transcript_encoded = t2_enc( - char_phone_embeddings=transcript_embedded, - embedding_length=transcript_len) + transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,) mel_decoder, gate, alignments = t2_dec( - char_phone_encoded=transcript_encoded, - encoded_length=transcript_len, - mel_target=spec_target) + char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target, + ) mel_postnet = 
t2_postnet(mel_input=mel_decoder) - gate_target = makegatetarget( - mel_target=spec_target, target_len=spec_target_len) + gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len) loss_t = t2_loss( mel_out=mel_decoder, mel_out_postnet=mel_postnet, @@ -158,36 +159,27 @@ def create_train_dag(neural_factory, mel_target=spec_target, gate_target=gate_target, target_len=spec_target_len, - seq_len=audio_len) + seq_len=audio_len, + ) # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( - tensors=[loss_t, spec_target, mel_postnet, gate, gate_target, - alignments], + tensors=[loss_t, spec_target, mel_postnet, gate, gate_target, alignments,], print_func=lambda x: nemo.logging.info(f"Loss: {x[0].data}"), - log_to_tb_func=partial( - tacotron2_log_to_tb_func, log_images=True, - log_images_freq=log_freq), + log_to_tb_func=partial(tacotron2_log_to_tb_func, log_images=True, log_images_freq=log_freq), tb_writer=neural_factory.tb_writer, ) - chpt_callback = nemo.core.CheckpointCallback( - folder=neural_factory.checkpoint_dir, - step_freq=checkpoint_save_freq) + chpt_callback = nemo.core.CheckpointCallback(folder=neural_factory.checkpoint_dir, step_freq=checkpoint_save_freq) callbacks = [train_callback, chpt_callback] return loss_t, callbacks, steps_per_epoch -def create_eval_dags(neural_factory, - neural_modules, - tacotron2_params, - eval_datasets, - eval_batch_size, - eval_freq, - cpu_per_dl=1): - (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, - t2_loss, makegatetarget) = neural_modules +def create_eval_dags( + neural_factory, neural_modules, tacotron2_params, eval_datasets, eval_batch_size, eval_freq, cpu_per_dl=1, +): + (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss, makegatetarget,) = neural_modules eval_dl_params = copy.deepcopy(tacotron2_params["AudioToTextDataLayer"]) eval_dl_params.update(tacotron2_params["AudioToTextDataLayer"]["eval"]) @@ -209,22 +201,15 @@ def create_eval_dags(neural_factory, ) audio, audio_len, transcript, transcript_len = data_layer_eval() - spec_target, spec_target_len = data_preprocessor( - input_signal=audio, - length=audio_len) + spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len) transcript_embedded = text_embedding(char_phone=transcript) - transcript_encoded = t2_enc( - char_phone_embeddings=transcript_embedded, - embedding_length=transcript_len) + transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,) mel_decoder, gate, alignments = t2_dec( - char_phone_encoded=transcript_encoded, - encoded_length=transcript_len, - mel_target=spec_target) + char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target, + ) mel_postnet = t2_postnet(mel_input=mel_decoder) - gate_target = makegatetarget( - mel_target=spec_target, - target_len=spec_target_len) + gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len) loss = t2_loss( mel_out=mel_decoder, mel_out_postnet=mel_postnet, @@ -232,37 +217,43 @@ def create_eval_dags(neural_factory, mel_target=spec_target, gate_target=gate_target, target_len=spec_target_len, - seq_len=audio_len) + seq_len=audio_len, + ) # create corresponding eval callback tagname = os.path.basename(eval_dataset).split(".")[0] - eval_tensors = [loss, spec_target, mel_postnet, gate, gate_target, - alignments] + eval_tensors = [ + loss, + spec_target, + mel_postnet, + gate, + gate_target, + alignments, + ] 
eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=tacotron2_process_eval_batch, - user_epochs_done_callback=partial( - tacotron2_process_final_eval, - tag=tagname), - tb_writer_func=partial( - tacotron2_eval_log_to_tb_func, - tag=tagname), + user_epochs_done_callback=partial(tacotron2_process_final_eval, tag=tagname), + tb_writer_func=partial(tacotron2_eval_log_to_tb_func, tag=tagname), eval_step=eval_freq, - tb_writer=neural_factory.tb_writer) + tb_writer=neural_factory.tb_writer, + ) callbacks.append(eval_callback) return callbacks -def create_all_dags(neural_factory, - neural_modules, - tacotron2_params, - train_dataset, - batch_size, - eval_freq, - checkpoint_save_freq=None, - eval_datasets=None, - eval_batch_size=None): +def create_all_dags( + neural_factory, + neural_modules, + tacotron2_params, + train_dataset, + batch_size, + eval_freq, + checkpoint_save_freq=None, + eval_datasets=None, + eval_batch_size=None, +): # Calculate num_workers for dataloader cpu_per_dl = max(int(os.cpu_count() / neural_factory.world_size), 1) @@ -274,7 +265,8 @@ def create_all_dags(neural_factory, batch_size=batch_size, log_freq=eval_freq, checkpoint_save_freq=checkpoint_save_freq, - cpu_per_dl=cpu_per_dl) + cpu_per_dl=cpu_per_dl, + ) eval_callbacks = [] if eval_datasets: @@ -285,7 +277,8 @@ def create_all_dags(neural_factory, eval_datasets=eval_datasets, eval_batch_size=eval_batch_size, eval_freq=eval_freq, - cpu_per_dl=cpu_per_dl) + cpu_per_dl=cpu_per_dl, + ) else: nemo.logging.info("There were no val datasets passed") @@ -310,7 +303,8 @@ def main(): create_tb_writer=args.create_tb_writer, files_to_copy=[args.model_config, __file__], cudnn_benchmark=args.cudnn_benchmark, - tensorboard_dir=args.tensorboard_dir) + tensorboard_dir=args.tensorboard_dir, + ) if args.local_rank is not None: nemo.logging.info('Doing ALL GPU') @@ -331,11 +325,11 @@ def main(): eval_freq=args.eval_freq, checkpoint_save_freq=args.checkpoint_save_freq, eval_datasets=args.eval_datasets, - eval_batch_size=args.eval_batch_size) + eval_batch_size=args.eval_batch_size, + ) # train model - total_steps = (args.max_steps if args.max_steps is not None else - args.num_epochs * steps_per_epoch) + total_steps = args.max_steps if args.max_steps is not None else args.num_epochs * steps_per_epoch neural_factory.train( tensors_to_optimize=[train_loss], callbacks=callbacks, @@ -346,8 +340,10 @@ def main(): "max_steps": args.max_steps, "lr": args.lr, "weight_decay": args.weight_decay, - "grad_norm_clip": args.grad_norm_clip}, - batches_per_step=args.iter_per_step) + "grad_norm_clip": args.grad_norm_clip, + }, + batches_per_step=args.iter_per_step, + ) if __name__ == '__main__': diff --git a/examples/tts/tts_infer.py b/examples/tts/tts_infer.py index d2cf3e52aa6f..96f53df6c23c 100644 --- a/examples/tts/tts_infer.py +++ b/examples/tts/tts_infer.py @@ -19,59 +19,77 @@ def parse_args(): parser = argparse.ArgumentParser(description='TTS') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument( - "--spec_model", type=str, required=True, + "--spec_model", + type=str, + required=True, choices=["tacotron2"], - help="Model generated to generate spectrograms") + help="Model generated to generate spectrograms", + ) parser.add_argument( - "--vocoder", type=str, required=True, + "--vocoder", + type=str, + required=True, choices=["griffin-lim", "waveglow"], - help="Vocoder used to convert from spectrograms to audio") + help="Vocoder used to convert from spectrograms to audio", + ) 
parser.add_argument( - "--spec_model_config", type=str, required=True, - help="spec model configuration file: model.yaml") + "--spec_model_config", type=str, required=True, help="spec model configuration file: model.yaml", + ) parser.add_argument( - "--vocoder_model_config", type=str, - help=("vocoder model configuration file: model.yaml. Not required for " - "griffin-lim.")) + "--vocoder_model_config", + type=str, + help=("vocoder model configuration file: model.yaml. Not required for " "griffin-lim."), + ) parser.add_argument( - "--spec_model_load_dir", type=str, required=True, - help="directory containing checkpoints for spec model") + "--spec_model_load_dir", type=str, required=True, help="directory containing checkpoints for spec model", + ) parser.add_argument( - "--vocoder_model_load_dir", type=str, - help=("directory containing checkpoints for vocoder model. Not " - "required for griffin-lim")) + "--vocoder_model_load_dir", + type=str, + help=("directory containing checkpoints for vocoder model. Not " "required for griffin-lim"), + ) parser.add_argument("--eval_dataset", type=str, required=True) - parser.add_argument( - "--save_dir", type=str, - help="directory to save audio files to") + parser.add_argument("--save_dir", type=str, help="directory to save audio files to") # Grifflin-Lim parameters parser.add_argument( - "--griffin_lim_mag_scale", type=float, default=2048, - help=("This is multiplied with the linear spectrogram. This is " - "to avoid audio sounding muted due to mel filter normalization")) + "--griffin_lim_mag_scale", + type=float, + default=2048, + help=( + "This is multiplied with the linear spectrogram. This is " + "to avoid audio sounding muted due to mel filter normalization" + ), + ) parser.add_argument( - "--griffin_lim_power", type=float, default=1.2, - help=("The linear spectrogram is raised to this power prior to running" - "the Griffin Lim algorithm. A power of greater than 1 has been " - "shown to improve audio quality.")) + "--griffin_lim_power", + type=float, + default=1.2, + help=( + "The linear spectrogram is raised to this power prior to running" + "the Griffin Lim algorithm. A power of greater than 1 has been " + "shown to improve audio quality." + ), + ) # Waveglow parameters parser.add_argument( - "--waveglow_denoiser_strength", type=float, default=0.0, - help=("denoiser strength for waveglow. Start with 0 and slowly " - "increment")) + "--waveglow_denoiser_strength", + type=float, + default=0.0, + help=("denoiser strength for waveglow. Start with 0 and slowly " "increment"), + ) parser.add_argument("--waveglow_sigma", type=float, default=0.6) parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--amp_opt_level", default="O1") args = parser.parse_args() - if (args.vocoder == "griffin-lim" and - (args.vocoder_model_config or args.vocoder_model_load_dir)): + if args.vocoder == "griffin-lim" and (args.vocoder_model_config or args.vocoder_model_load_dir): raise ValueError( "Griffin-Lim was specified as the vocoder but the a value for " - "vocoder_model_config or vocoder_model_load_dir was passed.") + "vocoder_model_config or vocoder_model_load_dir was passed." 
+ ) return args @@ -95,8 +113,7 @@ def griffin_lim(magnitudes, n_iters=50, n_fft=1024): def plot_and_save_spec(spectrogram, i, save_dir=None): fig, ax = plt.subplots(figsize=(12, 3)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", - interpolation='none') + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none') plt.colorbar(im, ax=ax) plt.xlabel("Frames") plt.ylabel("Channels") @@ -108,12 +125,9 @@ def plot_and_save_spec(spectrogram, i, save_dir=None): plt.close() -def create_infer_dags(neural_factory, - neural_modules, - tacotron2_params, - infer_dataset, - infer_batch_size, - cpu_per_dl=1): +def create_infer_dags( + neural_factory, neural_modules, tacotron2_params, infer_dataset, infer_batch_size, cpu_per_dl=1, +): (_, text_embedding, t2_enc, t2_dec, t2_postnet, _, _) = neural_modules data_layer = nemo_asr.TranscriptDataLayer( @@ -125,21 +139,18 @@ def create_infer_dags(neural_factory, bos_id=len(tacotron2_params['labels']), eos_id=len(tacotron2_params['labels']) + 1, pad_id=len(tacotron2_params['labels']) + 2, - shuffle=False + shuffle=False, ) transcript, transcript_len = data_layer() transcript_embedded = text_embedding(char_phone=transcript) - transcript_encoded = t2_enc( - char_phone_embeddings=transcript_embedded, - embedding_length=transcript_len) + transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,) if isinstance(t2_dec, nemo_tts.Tacotron2DecoderInfer): mel_decoder, gate, alignments, mel_len = t2_dec( - char_phone_encoded=transcript_encoded, - encoded_length=transcript_len) + char_phone_encoded=transcript_encoded, encoded_length=transcript_len, + ) else: - raise ValueError( - "The Neural Module for tacotron2 decoder was not understood") + raise ValueError("The Neural Module for tacotron2 decoder was not understood") mel_postnet = t2_postnet(mel_input=mel_decoder) return [mel_postnet, gate, alignments, mel_len] @@ -148,9 +159,8 @@ def create_infer_dags(neural_factory, def main(): args = parse_args() neural_factory = nemo.core.NeuralModuleFactory( - optimization_level=args.amp_opt_level, - backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank) + optimization_level=args.amp_opt_level, backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, + ) use_cache = True if args.local_rank is not None: @@ -168,15 +178,13 @@ def main(): neural_modules=spec_neural_modules, tacotron2_params=tacotron2_params, infer_dataset=args.eval_dataset, - infer_batch_size=args.batch_size) + infer_batch_size=args.batch_size, + ) print("Running Tacotron 2") # Run tacotron 2 evaluated_tensors = neural_factory.infer( - tensors=infer_tensors, - checkpoint_dir=args.spec_model_load_dir, - cache=use_cache, - offload_to_cpu=False + tensors=infer_tensors, checkpoint_dir=args.spec_model_load_dir, cache=use_cache, offload_to_cpu=False, ) mel_len = evaluated_tensors[-1] print("Done Running Tacotron 2") @@ -184,7 +192,8 @@ def main(): sr=tacotron2_params["sample_rate"], n_fft=tacotron2_params["n_fft"], n_mels=tacotron2_params["n_mels"], - fmax=tacotron2_params["fmax"]) + fmax=tacotron2_params["fmax"], + ) if args.vocoder == "griffin-lim": print("Running Griffin-Lim") @@ -194,28 +203,26 @@ def main(): mel = np.exp(log_mel) magnitudes = np.dot(mel, filterbank) * args.griffin_lim_mag_scale for j, sample in enumerate(magnitudes): - sample = sample[:mel_len[i][j], :] + sample = sample[: mel_len[i][j], :] audio = griffin_lim(sample.T ** args.griffin_lim_power) save_file = f"sample_{i*32+j}.wav" if args.save_dir: save_file 
= os.path.join(args.save_dir, save_file) write(save_file, tacotron2_params["sample_rate"], audio) - plot_and_save_spec(log_mel[j][:mel_len[i][j], :].T, i*32+j, - args.save_dir) + plot_and_save_spec(log_mel[j][: mel_len[i][j], :].T, i * 32 + j, args.save_dir) elif args.vocoder == "waveglow": (mel_pred, _, _, _) = infer_tensors if not args.vocoder_model_config or not args.vocoder_model_load_dir: raise ValueError( "Using waveglow as the vocoder requires the " - "--vocoder_model_config and --vocoder_model_load_dir args") + "--vocoder_model_config and --vocoder_model_load_dir args" + ) yaml = YAML(typ="safe") with open(args.vocoder_model_config) as file: waveglow_params = yaml.load(file) - waveglow = nemo_tts.WaveGlowInferNM( - sigma=args.waveglow_sigma, - **waveglow_params["WaveGlowNM"]) + waveglow = nemo_tts.WaveGlowInferNM(sigma=args.waveglow_sigma, **waveglow_params["WaveGlowNM"]) audio_pred = waveglow(mel_spectrogram=mel_pred) # waveglow.restore_from(args.vocoder_model_load_dir) @@ -226,7 +233,7 @@ def main(): checkpoint_dir=args.vocoder_model_load_dir, # checkpoint_dir=None, modules_to_restore=[waveglow], - use_cache=use_cache + use_cache=use_cache, ) print("Done Running Waveglow") @@ -244,15 +251,13 @@ def main(): if args.save_dir: save_file = os.path.join(args.save_dir, save_file) if args.waveglow_denoiser_strength > 0: - sample, spec = waveglow.denoise( - sample, strength=args.waveglow_denoiser_strength) + sample, spec = waveglow.denoise(sample, strength=args.waveglow_denoiser_strength) else: - spec, _ = librosa.core.magphase(librosa.core.stft( - sample, n_fft=waveglow_params["n_fft"])) + spec, _ = librosa.core.magphase(librosa.core.stft(sample, n_fft=waveglow_params["n_fft"])) write(save_file, waveglow_params["sample_rate"], sample) spec = np.dot(filterbank, spec) spec = np.log(np.clip(spec, a_min=1e-5, a_max=None)) - plot_and_save_spec(spec, i*32+j, args.save_dir) + plot_and_save_spec(spec, i * 32 + j, args.save_dir) if __name__ == '__main__': diff --git a/examples/tts/waveglow.py b/examples/tts/waveglow.py index 7c2c4fe27bac..4f2134933478 100644 --- a/examples/tts/waveglow.py +++ b/examples/tts/waveglow.py @@ -1,28 +1,22 @@ # Copyright (c) 2019 NVIDIA Corporation -import os import argparse import copy - +import os from functools import partial from ruamel.yaml import YAML import nemo -import nemo.utils.argparse as nm_argparse - import nemo.collections.asr as nemo_asr import nemo.collections.tts as nemo_tts - -from nemo.collections.tts import waveglow_log_to_tb_func -from nemo.collections.tts import waveglow_process_eval_batch -from nemo.collections.tts import waveglow_eval_log_to_tb_func +import nemo.utils.argparse as nm_argparse +from nemo.collections.tts import waveglow_eval_log_to_tb_func, waveglow_log_to_tb_func, waveglow_process_eval_batch def parse_args(): parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], - description='Waveglow', - conflict_handler='resolve') + parents=[nm_argparse.NemoArgParser()], description='Waveglow', conflict_handler='resolve', + ) parser.set_defaults( checkpoint_dir=None, optimizer="adam", @@ -32,16 +26,19 @@ def parse_args(): amp_opt_level="O1", create_tb_writer=True, lr_policy=None, - weight_decay=1e-6 + weight_decay=1e-6, ) # Overwrite default args - parser.add_argument("--max_steps", type=int, default=None, required=False, - help="max number of steps to train") - parser.add_argument("--num_epochs", type=int, default=None, required=False, - help="number of epochs to train") - parser.add_argument("--model_config", type=str, 
required=True, - help="model configuration file: model.yaml") + parser.add_argument( + "--max_steps", type=int, default=None, required=False, help="max number of steps to train", + ) + parser.add_argument( + "--num_epochs", type=int, default=None, required=False, help="number of epochs to train", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) # Create new args parser.add_argument("--exp_name", default="Waveglow", type=str) @@ -55,10 +52,11 @@ def parse_args(): if args.eval_freq % 25 != 0: raise ValueError("eval_freq should be a multiple of 25.") - exp_directory = [f"{args.exp_name}-lr_{args.lr}-bs_{args.batch_size}", - "", - (f"-wd_{args.weight_decay}-opt_{args.optimizer}" - f"-ips_{args.iter_per_step}")] + exp_directory = [ + f"{args.exp_name}-lr_{args.lr}-bs_{args.batch_size}", + "", + (f"-wd_{args.weight_decay}-opt_{args.optimizer}" f"-ips_{args.iter_per_step}"), + ] if args.max_steps: exp_directory[1] = f"-s_{args.max_steps}" elif args.num_epochs: @@ -70,7 +68,8 @@ def parse_args(): def create_NMs(waveglow_params): data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( - **waveglow_params["AudioToMelSpectrogramPreprocessor"]) + **waveglow_params["AudioToMelSpectrogramPreprocessor"] + ) waveglow = nemo_tts.WaveGlowNM(**waveglow_params["WaveGlowNM"]) waveglow_loss = nemo_tts.WaveGlowLoss() @@ -80,13 +79,9 @@ def create_NMs(waveglow_params): return (data_preprocessor, waveglow, waveglow_loss) -def create_train_dag(neural_factory, - neural_modules, - waveglow_params, - train_dataset, - batch_size, - checkpoint_save_freq, - cpu_per_dl=1): +def create_train_dag( + neural_factory, neural_modules, waveglow_params, train_dataset, batch_size, checkpoint_save_freq, cpu_per_dl=1, +): data_preprocessor, waveglow, waveglow_loss = neural_modules train_dl_params = copy.deepcopy(waveglow_params["AudioDataLayer"]) @@ -95,10 +90,7 @@ def create_train_dag(neural_factory, del train_dl_params["eval"] data_layer = nemo_tts.AudioDataLayer( - manifest_filepath=train_dataset, - batch_size=batch_size, - num_workers=cpu_per_dl, - **train_dl_params, + manifest_filepath=train_dataset, batch_size=batch_size, num_workers=cpu_per_dl, **train_dl_params, ) N = len(data_layer) @@ -107,42 +99,28 @@ def create_train_dag(neural_factory, # Train DAG audio, audio_len, = data_layer() - spec_target, spec_target_len = data_preprocessor( - input_signal=audio, - length=audio_len) + spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len) - z, log_s_list, log_det_W_list = waveglow( - mel_spectrogram=spec_target, audio=audio) - loss_t = waveglow_loss( - z=z, - log_s_list=log_s_list, - log_det_W_list=log_det_W_list) + z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio) + loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list) # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( tensors=[loss_t, z, spec_target, spec_target_len], print_func=lambda x: print(f"Loss: {x[0].data}"), - log_to_tb_func=partial( - waveglow_log_to_tb_func, - log_images=False), + log_to_tb_func=partial(waveglow_log_to_tb_func, log_images=False), tb_writer=neural_factory.tb_writer, ) - chpt_callback = nemo.core.CheckpointCallback( - folder=neural_factory.checkpoint_dir, - step_freq=checkpoint_save_freq) + chpt_callback = nemo.core.CheckpointCallback(folder=neural_factory.checkpoint_dir, step_freq=checkpoint_save_freq) callbacks = 
[train_callback, chpt_callback] return loss_t, callbacks, steps_per_epoch -def create_eval_dags(neural_factory, - neural_modules, - waveglow_params, - eval_datasets, - eval_batch_size, - eval_freq, - cpu_per_dl=1): +def create_eval_dags( + neural_factory, neural_modules, waveglow_params, eval_datasets, eval_batch_size, eval_freq, cpu_per_dl=1, +): data_preprocessor, waveglow, _ = neural_modules eval_dl_params = copy.deepcopy(waveglow_params["AudioDataLayer"]) @@ -154,19 +132,13 @@ def create_eval_dags(neural_factory, # assemble eval DAGs for eval_dataset in eval_datasets: data_layer_eval = nemo_tts.AudioDataLayer( - manifest_filepath=eval_dataset, - batch_size=eval_batch_size, - num_workers=cpu_per_dl, - **eval_dl_params, + manifest_filepath=eval_dataset, batch_size=eval_batch_size, num_workers=cpu_per_dl, **eval_dl_params, ) audio, audio_len, = data_layer_eval() - spec_target, spec_target_len = data_preprocessor( - input_signal=audio, - length=audio_len) + spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len) - audio_pred, log_s_list, log_det_W_list = waveglow( - mel_spectrogram=spec_target, audio=audio) + audio_pred, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio) # create corresponding eval callback tagname = os.path.basename(eval_dataset).split(".")[0] @@ -174,26 +146,26 @@ def create_eval_dags(neural_factory, eval_tensors=[audio_pred, spec_target, spec_target_len], user_iter_callback=waveglow_process_eval_batch, user_epochs_done_callback=lambda x: x, - tb_writer_func=partial( - waveglow_eval_log_to_tb_func, - tag=tagname, - mel_fb=data_preprocessor.filter_banks), + tb_writer_func=partial(waveglow_eval_log_to_tb_func, tag=tagname, mel_fb=data_preprocessor.filter_banks,), eval_step=eval_freq, - tb_writer=neural_factory.tb_writer) + tb_writer=neural_factory.tb_writer, + ) callbacks.append(eval_callback) return callbacks -def create_all_dags(neural_factory, - neural_modules, - waveglow_params, - train_dataset, - batch_size, - checkpoint_save_freq, - eval_datasets=None, - eval_batch_size=None, - eval_freq=None): +def create_all_dags( + neural_factory, + neural_modules, + waveglow_params, + train_dataset, + batch_size, + checkpoint_save_freq, + eval_datasets=None, + eval_batch_size=None, + eval_freq=None, +): # Calculate num_workers for dataloader cpu_per_dl = max(int(os.cpu_count() / neural_factory.world_size), 1) @@ -204,7 +176,8 @@ def create_all_dags(neural_factory, train_dataset=train_dataset, batch_size=batch_size, checkpoint_save_freq=checkpoint_save_freq, - cpu_per_dl=cpu_per_dl) + cpu_per_dl=cpu_per_dl, + ) eval_callbacks = [] if eval_datasets: @@ -215,7 +188,8 @@ def create_all_dags(neural_factory, eval_datasets=eval_datasets, eval_batch_size=eval_batch_size, eval_freq=eval_freq, - cpu_per_dl=cpu_per_dl) + cpu_per_dl=cpu_per_dl, + ) else: nemo.logging.info("There were no val datasets passed") @@ -240,7 +214,8 @@ def main(): create_tb_writer=args.create_tb_writer, files_to_copy=[args.model_config, __file__], cudnn_benchmark=args.cudnn_benchmark, - tensorboard_dir=args.tensorboard_dir) + tensorboard_dir=args.tensorboard_dir, + ) if args.local_rank is not None: nemo.logging.info('Doing ALL GPU') @@ -261,7 +236,8 @@ def main(): checkpoint_save_freq=args.checkpoint_save_freq, eval_datasets=args.eval_datasets, eval_batch_size=args.eval_batch_size, - eval_freq=args.eval_freq) + eval_freq=args.eval_freq, + ) # train model neural_factory.train( @@ -273,8 +249,10 @@ def main(): "max_steps": args.max_steps, "lr": args.lr, 
"weight_decay": args.weight_decay, - "grad_norm_clip": None}, - batches_per_step=args.iter_per_step) + "grad_norm_clip": None, + }, + batches_per_step=args.iter_per_step, + ) if __name__ == '__main__': diff --git a/nemo/__init__.py b/nemo/__init__.py index 60655235fb8d..a56e3e28ee6b 100644 --- a/nemo/__init__.py +++ b/nemo/__init__.py @@ -18,22 +18,24 @@ import os -from .package_info import __shortversion__ -from .package_info import __version__ - -from .package_info import __package_name__ -from .package_info import __contact_names__ -from .package_info import __contact_emails__ -from .package_info import __homepage__ -from .package_info import __repository_url__ -from .package_info import __download_url__ -from .package_info import __description__ -from .package_info import __license__ -from .package_info import __keywords__ +from .package_info import ( + __contact_emails__, + __contact_names__, + __description__, + __download_url__, + __homepage__, + __keywords__, + __license__, + __package_name__, + __repository_url__, + __shortversion__, + __version__, +) if "NEMO_PACKAGE_BUILDING" not in os.environ: import logging + logging = logging.getLogger(__name__) from nemo import backends diff --git a/nemo/backends/pytorch/__init__.py b/nemo/backends/pytorch/__init__.py index eb96fa45b69b..05688901f671 100644 --- a/nemo/backends/pytorch/__init__.py +++ b/nemo/backends/pytorch/__init__.py @@ -3,7 +3,7 @@ This package provides Neural Modules building blocks for building Software 2.0 projects """ -from . import tutorials, torchvision +from . import torchvision, tutorials from .actions import PtActions -from .nm import TrainableNM, NonTrainableNM, DataLayerNM, LossNM from .common import * +from .nm import DataLayerNM, LossNM, NonTrainableNM, TrainableNM diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7bd6c92ea443..8daa4fdd980a 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -3,11 +3,10 @@ import itertools import json import os +from collections import defaultdict from pathlib import Path -from typing import List, Optional, Dict +from typing import Dict, List, Optional -# import onnx -from collections import defaultdict import torch import torch.distributed as dist import torch.nn as nn @@ -15,16 +14,15 @@ import nemo from nemo.backends.pytorch.nm import TrainableNM -from .module_wrapper import TrainableNeuralModuleWrapper -from .nm import DataLayerNM -from .optimizers import Novograd, AdamW, master_params -from ...core import NmTensor, DeviceType, NeuralModule, DeploymentFormat -from ...core.neural_types import * -from ...core.callbacks import (ActionCallback, - EvaluatorCallback, - SimpleLossLoggerCallback) + +from ...core import DeploymentFormat, DeviceType, NeuralModule, NmTensor +from ...core.callbacks import ActionCallback, EvaluatorCallback, SimpleLossLoggerCallback from ...core.neural_factory import Actions, ModelMode, Optimization +from ...core.neural_types import * from ...utils.helpers import get_checkpoint_from_dir +from .module_wrapper import TrainableNeuralModuleWrapper +from .nm import DataLayerNM +from .optimizers import AdamW, Novograd, master_params # these imports will happen on as-needed basis amp = None @@ -43,20 +41,18 @@ Optimization.mxprO3: "O3", } -_float_2_half_req = {Optimization.mxprO1, - Optimization.mxprO2, - Optimization.mxprO3} +_float_2_half_req = { + Optimization.mxprO1, + Optimization.mxprO2, + Optimization.mxprO3, +} class PtActions(Actions): def __init__( - self, - local_rank=None, - 
local_rank=None, - global_rank=None, - tb_writer=None, - optimization_level=Optimization.mxprO0): - need_apex = local_rank is not None or \ - optimization_level != Optimization.mxprO0 + self, local_rank=None, global_rank=None, tb_writer=None, optimization_level=Optimization.mxprO0, + ): + need_apex = local_rank is not None or optimization_level != Optimization.mxprO0 if need_apex: try: apex = importlib.import_module('apex') @@ -74,8 +70,7 @@ def __init__( parallel = importlib.import_module('apex.parallel') apex_optimizer = importlib.import_module('apex.optimizers') convert_syncbn = parallel.convert_syncbn_model - create_syncbn_process_group = ( - parallel.create_syncbn_process_group) + create_syncbn_process_group = parallel.create_syncbn_process_group DDP = parallel.DistributedDataParallel LARC = parallel.LARC FusedLAMB = apex_optimizer.FusedLAMB @@ -87,12 +82,12 @@ def __init__( "NVIDIA Apex is necessary for distributed training and " "mixed precision training. It only works on GPUs. " "Please install Apex from " - "https://www.github.com/nvidia/apex") + "https://www.github.com/nvidia/apex" + ) super(PtActions, self).__init__( - local_rank=local_rank, - global_rank=global_rank, - optimization_level=optimization_level) + local_rank=local_rank, global_rank=global_rank, optimization_level=optimization_level, + ) # will be [unique_instance_id -> (NMModule, PTModule)] self.module_reference_table = {} @@ -124,10 +119,7 @@ def create_node(producer, producer_args): if producer_args is None: return tuple((producer, ())) else: - return tuple( - (producer, - tuple([(k, v) for k, v in producer_args.items()])) - ) + return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) def is_in_degree_zero(node, processed_nodes): """A node has in degree of zero""" @@ -168,15 +160,12 @@ def is_in_degree_zero(node, processed_nodes): # first make sure all keys are present per output port # and nm is inside all_nodes if node not in all_nodes: - all_nodes[node] = { - k: None for k in - nmtensor.producer.output_ports} + all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} # second, populate output port with current nmtensor # where applicable all_nodes[node][nmtensor.name] = nmtensor processed_nmtensors.add(nmtensor) - if (nmtensor.producer_args is not None - and nmtensor.producer_args != {}): + if nmtensor.producer_args is not None and nmtensor.producer_args != {}: for _, new_nmtensor in nmtensor.producer_args.items(): if new_nmtensor not in processed_nmtensors: # put in the start of list @@ -186,11 +175,7 @@ def is_in_degree_zero(node, processed_nodes): # Iterate over all_nodes to create new nodes that include its output # now all nodes have (module, input tensors, output tensors) for node in all_nodes: - all_node_with_output.append(tuple(( - node[0], - node[1], - all_nodes[node] - ))) + all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) processed_nodes = [] while len(all_node_with_output) > 0: @@ -209,27 +194,25 @@ def is_in_degree_zero(node, processed_nodes): top_sorted_modules.append((m[0], dict(m[1]), m[2])) # Ensure that there is only one dataset in callchain if i > 0 and isinstance(m[0], DataLayerNM): - raise ValueError( - "There were more than one DataLayer NeuralModule inside " - "your DAG.") + raise ValueError("There was more than one DataLayer NeuralModule inside your DAG.") if not isinstance(top_sorted_modules[0][0], DataLayerNM): - raise ValueError( - "The first module in your DAG was not a DataLayer " - "NeuralModule.") + raise ValueError("The first module in your DAG was not a DataLayer NeuralModule.") tdataset = top_sorted_modules[0][0].dataset # populate self.module_reference_table for m in top_sorted_modules: if m[0].factory is None and self._local_rank is not None: - raise ValueError("Neural module {0} was created without " - "NeuralModuleFactory, but you are trying to" - "run in distributed mode. Please instantiate" - "NeuralModuleFactory first and pass its " - "instance as `factory` parameter to all your" - "Neural Module objects." - "".format(str(m[0]))) + raise ValueError( + "Neural module {0} was created without " + "NeuralModuleFactory, but you are trying to " + "run in distributed mode. Please instantiate " + "NeuralModuleFactory first and pass its " + "instance as `factory` parameter to all your " + "Neural Module objects." + "".format(str(m[0])) + ) key = m[0].unique_instance_id if key not in self.module_reference_table: if isinstance(m[0], TrainableNeuralModuleWrapper): @@ -239,10 +222,7 @@ def is_in_degree_zero(node, processed_nodes): return top_sorted_modules, tdataset - def create_optimizer(self, - optimizer, - things_to_optimize, - optimizer_params=None): + def create_optimizer(self, optimizer, things_to_optimize, optimizer_params=None): """ Wrapper function around __setup_optimizer() @@ -265,8 +245,7 @@ def create_optimizer, elif isinstance(optimizer, torch.optim.Optimizer): optimizer_instance = optimizer else: - raise ValueError("`optimizer` must be a string or an instance " - "of torch.optim.Optimizer") + raise ValueError("`optimizer` must be a string or an instance " "of torch.optim.Optimizer") modules_to_optimize = [] tensors_to_optimize = [] @@ -278,22 +257,19 @@ def create_optimizer, elif isinstance(thing, NmTensor): tensors_to_optimize.append(thing) else: - raise ValueError("{} passed to create_optimizer() was neither " - "a neural module nor a neural module tensor") + raise ValueError( + "{} passed to create_optimizer() was neither " "a neural module nor a neural module tensor".format(thing) + ) if tensors_to_optimize: - call_chain, _ = self.__get_top_sorted_modules_and_dataloader( - tensors_to_optimize) + call_chain, _ = self.__get_top_sorted_modules_and_dataloader(tensors_to_optimize) for module in call_chain: if module[0] not in modules_to_optimize: modules_to_optimize.append(module[0]) # Extract trainable weights which will be optimized - params_list = [ - p.parameters() for p in modules_to_optimize - if isinstance(p, TrainableNM) or p.is_trainable() - ] + params_list = [p.parameters() for p in modules_to_optimize if isinstance(p, TrainableNM) or p.is_trainable()] params_to_optimize = itertools.chain(*params_list) if optimizer_params is None: @@ -303,16 +279,16 @@ def create_optimizer, optimizer_instance=optimizer_instance, optimizer_class=optimizer_class, optimization_params=optimizer_params, - params_to_optimize=params_to_optimize) + params_to_optimize=params_to_optimize, + ) self.optimizers.append(optimizer) return optimizer @staticmethod - def __setup_optimizer(optimizer_instance, - optimizer_class, - optimization_params, - params_to_optimize): + def __setup_optimizer( + optimizer_instance, optimizer_class, optimization_params, params_to_optimize, + ): if optimizer_instance is None: # Setup optimizer instance, by default it is SGD @@ -326,18 +302,16 @@ def __setup_optimizer(optimizer_instance, ) elif optimizer_class.lower() == "adam": optimizer = optim.Adam( - params=params_to_optimize, lr=lr, - betas=optimization_params.get("betas", (0.9, 0.999))) + params=params_to_optimize, lr=lr, 
betas=optimization_params.get("betas", (0.9, 0.999)), + ) elif optimizer_class.lower() == "fused_adam": - optimizer = FusedAdam( - params=params_to_optimize, - lr=lr) + optimizer = FusedAdam(params=params_to_optimize, lr=lr) elif optimizer_class.lower() == "adam_w": optimizer = AdamW( params=params_to_optimize, lr=lr, weight_decay=optimization_params.get("weight_decay", 0.0), - betas=optimization_params.get("betas", (0.9, 0.999)) + betas=optimization_params.get("betas", (0.9, 0.999)), ) elif optimizer_class.lower() == "novograd": optimizer = Novograd( @@ -358,40 +332,30 @@ def __setup_optimizer(optimizer_instance, betas=optimization_params.get("betas", (0.95, 0.25)), ) elif optimizer_class.lower() == "fused_lamb": - optimizer = FusedLAMB( - params_to_optimize, - lr=lr, - ) + optimizer = FusedLAMB(params_to_optimize, lr=lr,) else: - raise ValueError( - "Unknown optimizer class: {0}".format(optimizer_class)) + raise ValueError("Unknown optimizer class: {0}".format(optimizer_class)) if optimization_params.get("larc", False): nemo.logging.info("Enabling larc") - optimizer = LARC( - optimizer, - trust_coefficient=optimization_params.get("larc_eta", 2e-2) - ) + optimizer = LARC(optimizer, trust_coefficient=optimization_params.get("larc_eta", 2e-2),) else: nemo.logging.info("Optimizer instance: {0} is provided.".format(optimizer_instance)) if optimizer_class is not None and optimizer_class != "": - nemo.logging.warning( - "Ignoring `optimizer_class` parameter because" - "`optimizer_instance` is provided") + nemo.logging.warning("Ignoring `optimizer_class` parameter because " "`optimizer_instance` is provided") if optimization_params is not None and optimization_params != {}: nemo.logging.warning( "Ignoring `optimization_params` parameter for " - "optimizer because `optimizer_instance` is provided") + "optimizer because `optimizer_instance` is provided" + ) optimizer = optimizer_instance return optimizer def __initialize_amp( - self, optimizer, optim_level, - amp_max_loss_scale=2.**24, amp_min_loss_scale=1.0 + self, optimizer, optim_level, amp_max_loss_scale=2.0 ** 24, amp_min_loss_scale=1.0, ): if optim_level not in AmpOptimizations: - raise ValueError(f"__initialize_amp() was called with unknown " - "optim_level={optim_level}") + raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") # in this case, nothing to do here if optim_level == Optimization.mxprO0: return optimizer @@ -402,8 +366,7 @@ def __initialize_amp( for module in self.modules: if isinstance(module, nn.Module): pt_modules.append(module) - elif isinstance(module, - TrainableNeuralModuleWrapper): + elif isinstance(module, TrainableNeuralModuleWrapper): pt_modules.append(module._pt_module) _, optimizer = amp.initialize( @@ -416,12 +379,9 @@ def __initialize_amp( self.amp_initialized = True return optimizer - def __nm_graph_forward_pass(self, - call_chain, - registered_tensors, - mode=ModelMode.train, - disable_allreduce=False, - use_cache=False): + def __nm_graph_forward_pass( + self, call_chain, registered_tensors, mode=ModelMode.train, disable_allreduce=False, use_cache=False, + ): for ind in range(1, len(call_chain)): if use_cache: in_cache = True @@ -464,10 +424,7 @@ def __nm_graph_forward_pass(self, key = nmtensor.unique_name call_set[tensor_name] = registered_tensors[key] # actual PyTorch module call with signature - if isinstance( - self.module_reference_table[m_id][0], - TrainableNeuralModuleWrapper - ): + if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): new_tensors = 
pmodule(**call_set) else: new_tensors = pmodule(force_pt=True, **call_set) @@ -477,16 +434,14 @@ def __nm_graph_forward_pass(self, new_tensors = [new_tensors] else: new_tensors = list(new_tensors) - for t_tensor, nm_tensor in zip( - new_tensors, call_chain[ind][2].values()): + for t_tensor, nm_tensor in zip(new_tensors, call_chain[ind][2].values()): if nm_tensor is None: continue t_name = nm_tensor.unique_name if t_name not in registered_tensors: registered_tensors[t_name] = t_tensor else: - raise ValueError("A NMTensor was produced twice in " - f"the same DAG. {t_name}") + raise ValueError("A NMTensor was produced twice in " f"the same DAG. {t_name}") @staticmethod def pad_tensor(t: torch.Tensor, target_size: torch.Size): @@ -540,9 +495,7 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): with torch.no_grad(): # each call chain corresponds to a tensor in tensors_2_evaluate dl_nm = None - call_chain, _ = self.__get_top_sorted_modules_and_dataloader( - hook=tensors_2_evaluate - ) + call_chain, _ = self.__get_top_sorted_modules_and_dataloader(hook=tensors_2_evaluate) dl_nm = call_chain[0][0] # Prepare eval_dataloader @@ -560,15 +513,11 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): # ) # ) if dl_nm.dataset is not None: - sampler = torch.utils.data.distributed.DistributedSampler( - dl_nm.dataset - ) + sampler = torch.utils.data.distributed.DistributedSampler(dl_nm.dataset) eval_dataloader = torch.utils.data.DataLoader( dataset=dl_nm.dataset, sampler=sampler, - num_workers=dl_nm.local_parameters.get( - "num_workers", os.cpu_count() - ), + num_workers=dl_nm.local_parameters.get("num_workers", os.cpu_count()), batch_size=dl_nm.local_parameters["batch_size"], shuffle=(sampler is None), ) @@ -582,14 +531,9 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): eval_dataloader = torch.utils.data.DataLoader( dataset=dl_nm.dataset, sampler=None, # not distributed sampler - num_workers=call_chain[0][0].local_parameters.get( - "num_workers", os.cpu_count() - ), - batch_size=call_chain[0][0].local_parameters[ - "batch_size"], - shuffle=call_chain[0][0].local_parameters.get( - "shuffle", - False), + num_workers=call_chain[0][0].local_parameters.get("num_workers", os.cpu_count()), + batch_size=call_chain[0][0].local_parameters["batch_size"], + shuffle=call_chain[0][0].local_parameters.get("shuffle", False), ) else: eval_dataloader = dl_nm.data_iterator @@ -603,12 +547,8 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): # Evaluation mini-batch for loop num_batches = len(eval_dataloader) for epoch_i, data in enumerate(eval_dataloader, 0): - if verbose and ( - num_batches < 10 or ( - epoch_i % int(num_batches / 10) == 0) - ): - nemo.logging.info( - f"Evaluating batch {epoch_i} out of {num_batches}") + if verbose and (num_batches < 10 or (epoch_i % int(num_batches / 10) == 0)): + nemo.logging.info(f"Evaluating batch {epoch_i} out of {num_batches}") tensors = [] if isinstance(data, torch.Tensor): data = (data,) @@ -618,14 +558,11 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): else: tensors.append(d) - registered_e_tensors = {t.unique_name: d for t, d in - zip(call_chain[0][2].values(), tensors) - if t is not None - } + registered_e_tensors = { + t.unique_name: d for t, d in zip(call_chain[0][2].values(), tensors) if t is not None + } self.__nm_graph_forward_pass( - call_chain=call_chain, - registered_tensors=registered_e_tensors, - mode=ModelMode.eval, + call_chain=call_chain, 
registered_tensors=registered_e_tensors, mode=ModelMode.eval, ) if not is_distributed or self.global_rank == 0: @@ -635,11 +572,7 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): for t2e in tensors_2_evaluate: key = t2e.unique_name if key not in registered_e_tensors.keys(): - nemo.logging.info( - "WARNING: Tensor {} was not found during " - "eval".format( - key) - ) + nemo.logging.info("WARNING: Tensor {} was not found during " "eval".format(key)) continue if is_distributed: # where we will all_gather results from all workers @@ -647,17 +580,11 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): # where we will all_gather tensor sizes tensor_on_worker = registered_e_tensors[key] if tensor_on_worker.shape != torch.Size([]): - tensor_on_worker_size_as_tensor = torch.tensor( - tensor_on_worker.shape - ).cuda() + tensor_on_worker_size_as_tensor = torch.tensor(tensor_on_worker.shape).cuda() sizes = [] for ind in range(world_size): - sizes.append( - torch.empty_like( - tensor_on_worker_size_as_tensor) - ) - dist.all_gather(sizes, - tensor_on_worker_size_as_tensor) + sizes.append(torch.empty_like(tensor_on_worker_size_as_tensor)) + dist.all_gather(sizes, tensor_on_worker_size_as_tensor) mx_dim, _ = torch.max(torch.stack(sizes), dim=0) else: # this is a singleton. For example, loss value sizes = [torch.Size([])] * world_size @@ -665,68 +592,51 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): for ind in range(world_size): # we have to use max shape for all_gather if mx_dim is None: # singletons + tensors_list.append(torch.tensor(2).cuda().type_as(tensor_on_worker)) + else: # non-singletons tensors_list.append( - torch.tensor(2).cuda().type_as( - tensor_on_worker) + torch.empty(mx_dim.cpu().data.numpy().tolist()).cuda().type_as(tensor_on_worker) ) - else: # non-singletons - tensors_list.append(torch.empty( - mx_dim.cpu().data.numpy().tolist()).cuda() - .type_as( - tensor_on_worker)) if mx_dim is not None: - t_to_send = self.pad_tensor(tensor_on_worker, - mx_dim) + t_to_send = self.pad_tensor(tensor_on_worker, mx_dim) else: t_to_send = tensor_on_worker dist.all_gather(tensors_list, t_to_send) - tensors_list = [ - self.depad_tensor(t, size) - for t, size in zip(tensors_list, sizes) - ] + tensors_list = [self.depad_tensor(t, size) for t, size in zip(tensors_list, sizes)] if self.global_rank == 0: values_dict["IS_FROM_DIST_EVAL"] = True values_dict[key] = tensors_list else: # NON-DISTRIBUTED TRAINING values_dict["IS_FROM_DIST_EVAL"] = False values_dict[key] = [registered_e_tensors[key]] - if callback.user_iter_callback and ( - self.global_rank is None or self.global_rank == 0 - ): + if callback.user_iter_callback and (self.global_rank is None or self.global_rank == 0): # values_dict will contain results from all workers - callback.user_iter_callback(values_dict, - callback._global_var_dict) + callback.user_iter_callback(values_dict, callback._global_var_dict) # final aggregation (over minibatches) and logging of results # should happen only on one worker - if callback.user_done_callback and ( - self.global_rank is None or self.global_rank == 0 - ): + if callback.user_done_callback and (self.global_rank is None or self.global_rank == 0): - vals_to_log = callback.user_done_callback( - callback._global_var_dict) + vals_to_log = callback.user_done_callback(callback._global_var_dict) # log results to Tensorboard if vals_to_log is not None and callback.swriter is not None: if callback.tb_writer_func is not None: - callback.tb_writer_func( - 
callback.swriter, vals_to_log, step) + callback.tb_writer_func(callback.swriter, vals_to_log, step) else: for key, val in vals_to_log.items(): callback.swriter.add_scalar(key, val, step) - def _infer(self, - tensors_to_return, - verbose=False, - cache=False, - use_cache=False, - offload_to_cpu=True): + def _infer( + self, tensors_to_return, verbose=False, cache=False, use_cache=False, offload_to_cpu=True, + ): """ Does the same as _eval() just with tensors instead of eval callback. """ # Checking that cache is used properly if cache and use_cache: - raise ValueError("cache and use_cache were both set. However cache" - " must first be created prior to using it.") + raise ValueError( + "cache and use_cache were both set. However cache" " must first be created prior to using it." + ) if cache: if self.cache is not None: raise ValueError("cache was set but was not empty") @@ -738,9 +648,7 @@ def _infer(self, with torch.no_grad(): # each call chain corresponds to a tensor in tensors_2_evaluate dl_nm = None - call_chain, _ = self.__get_top_sorted_modules_and_dataloader( - hook=tensors_to_return - ) + call_chain, _ = self.__get_top_sorted_modules_and_dataloader(hook=tensors_to_return) dl_nm = call_chain[0][0] # Prepare eval_dataloader @@ -750,8 +658,7 @@ def _infer(self, world_size = None if dl_nm.placement == DeviceType.AllGpu: if self.cache or use_cache: - raise NotImplementedError( - "Caching is not available for distributed training.") + raise NotImplementedError("Caching is not available for distributed training.") assert dist.is_initialized() is_distributed = True world_size = torch.distributed.get_world_size() @@ -761,15 +668,11 @@ def _infer(self, # ) # ) if dl_nm.dataset is not None: - sampler = torch.utils.data.distributed.DistributedSampler( - dl_nm.dataset - ) + sampler = torch.utils.data.distributed.DistributedSampler(dl_nm.dataset) eval_dataloader = torch.utils.data.DataLoader( dataset=dl_nm.dataset, sampler=sampler, - num_workers=dl_nm.local_parameters.get( - "num_workers", os.cpu_count() - ), + num_workers=dl_nm.local_parameters.get("num_workers", os.cpu_count()), batch_size=dl_nm.local_parameters["batch_size"], shuffle=(sampler is None), ) @@ -784,14 +687,9 @@ def _infer(self, eval_dataloader = torch.utils.data.DataLoader( dataset=dl_nm.dataset, sampler=None, # not distributed sampler - num_workers=call_chain[0][0].local_parameters.get( - "num_workers", os.cpu_count() - ), - batch_size=call_chain[0][0].local_parameters[ - "batch_size"], - shuffle=call_chain[0][0].local_parameters.get( - "shuffle", - False), + num_workers=call_chain[0][0].local_parameters.get("num_workers", os.cpu_count()), + batch_size=call_chain[0][0].local_parameters["batch_size"], + shuffle=call_chain[0][0].local_parameters.get("shuffle", False), ) else: eval_dataloader = dl_nm.data_iterator @@ -814,12 +712,8 @@ def _infer(self, loop_iterator = eval_dataloader for epoch_i, data in enumerate(loop_iterator, 0): - if verbose and ( - num_batches < 10 or ( - epoch_i % int(num_batches / 10) == 0) - ): - nemo.logging.info( - f"Evaluating batch {epoch_i} out of {num_batches}") + if verbose and (num_batches < 10 or (epoch_i % int(num_batches / 10) == 0)): + nemo.logging.info(f"Evaluating batch {epoch_i} out of {num_batches}") tensors = [] if use_cache: registered_e_tensors = data @@ -838,15 +732,13 @@ def _infer(self, tensors.append(d) registered_e_tensors = { - t.unique_name: d for t, d in - zip(call_chain[0][2].values(), tensors) - if t is not None + t.unique_name: d for t, d in zip(call_chain[0][2].values(), 
tensors) if t is not None } self.__nm_graph_forward_pass( call_chain=call_chain, registered_tensors=registered_e_tensors, mode=ModelMode.eval, - use_cache=use_cache + use_cache=use_cache, ) # if offload_to_cpu: @@ -863,11 +755,7 @@ def _infer(self, for t2e in tensors_to_return: key = t2e.unique_name if key not in registered_e_tensors.keys(): - nemo.logging.info( - "WARNING: Tensor {} was not found during " - "eval".format( - key) - ) + nemo.logging.info("WARNING: Tensor {} was not found during " "eval".format(key)) continue if is_distributed: # where we will all_gather results from all workers @@ -875,17 +763,11 @@ def _infer(self, # where we will all_gather tensor sizes tensor_on_worker = registered_e_tensors[key] if tensor_on_worker.shape != torch.Size([]): - tensor_on_worker_size_as_tensor = torch.tensor( - tensor_on_worker.shape - ).cuda() + tensor_on_worker_size_as_tensor = torch.tensor(tensor_on_worker.shape).cuda() sizes = [] for ind in range(world_size): - sizes.append( - torch.empty_like( - tensor_on_worker_size_as_tensor) - ) - dist.all_gather(sizes, - tensor_on_worker_size_as_tensor) + sizes.append(torch.empty_like(tensor_on_worker_size_as_tensor)) + dist.all_gather(sizes, tensor_on_worker_size_as_tensor) mx_dim, _ = torch.max(torch.stack(sizes), dim=0) else: # this is a singleton. For example, loss value sizes = [torch.Size([])] * world_size @@ -893,26 +775,18 @@ def _infer(self, for ind in range(world_size): # we have to use max shape for all_gather if mx_dim is None: # singletons + tensors_list.append(torch.tensor(2).cuda().type_as(tensor_on_worker)) + else: # non-singletons tensors_list.append( - torch.tensor(2).cuda().type_as( - tensor_on_worker) + torch.empty(mx_dim.cpu().data.numpy().tolist()).cuda().type_as(tensor_on_worker) ) - else: # non-singletons - tensors_list.append(torch.empty( - mx_dim.cpu().data.numpy().tolist()).cuda() - .type_as( - tensor_on_worker)) if mx_dim is not None: - t_to_send = self.pad_tensor(tensor_on_worker, - mx_dim) + t_to_send = self.pad_tensor(tensor_on_worker, mx_dim) else: t_to_send = tensor_on_worker dist.all_gather(tensors_list, t_to_send) - tensors_list = [ - self.depad_tensor(t, size) - for t, size in zip(tensors_list, sizes) - ] + tensors_list = [self.depad_tensor(t, size) for t, size in zip(tensors_list, sizes)] if offload_to_cpu: tensors_list = [t.cpu() for t in tensors_list] if self.global_rank == 0: @@ -977,12 +851,10 @@ def restore_state_from(self, path: str): self.step = checkpoint["step"] self.epoch_num = checkpoint["epoch_num"] if checkpoint["optimizer_state"]: - for opt, opt_chkpt in zip( - self.optimizers, checkpoint["optimizer_state"]): + for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): opt.load_state_dict(opt_chkpt) else: - raise FileNotFoundError( - "Could not find checkpoint file: {0}".format(path)) + raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) @staticmethod def _check_all_tensors(list_of_tensors): @@ -1001,13 +873,11 @@ def _check_tuples(list_of_tuples): first element, and a list of NmTensors in the second. 
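For example (illustrative names only, not part of the original docstring): [(sgd_optimizer, [train_loss])] would pass this check, while [([train_loss], sgd_optimizer)] would not. 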
""" for tup in list_of_tuples: - if not (isinstance(tup[0], torch.optim.Optimizer) - and PtActions._check_all_tensors(tup[1])): + if not (isinstance(tup[0], torch.optim.Optimizer) and PtActions._check_all_tensors(tup[1])): return False return True - def _get_all_modules( - self, training_loop, callbacks, logging_callchain=None): + def _get_all_modules(self, training_loop, callbacks, logging_callchain=None): """Gets all neural modules that will be used by train() and eval() via EvaluatorCallbacks. Saves all modules to self.modules """ @@ -1028,43 +898,38 @@ def _get_all_modules( if callbacks is not None: for callback in callbacks: if isinstance(callback, EvaluatorCallback): - callchain, _ = \ - self.__get_top_sorted_modules_and_dataloader( - hook=callback.eval_tensors) + (callchain, _,) = self.__get_top_sorted_modules_and_dataloader(hook=callback.eval_tensors) for module in callchain: self.modules.add(module[0]) @staticmethod - def __module_export(module, - output, - d_format: DeploymentFormat, - input_example=None, - output_example=None): + def __module_export( + module, output, d_format: DeploymentFormat, input_example=None, output_example=None, + ): # Check if output already exists destination = Path(output) if destination.exists(): - raise FileExistsError(f"Destination {output} already exists. " - f"Aborting export.") + raise FileExistsError(f"Destination {output} already exists. " f"Aborting export.") input_names = list(module.input_ports.keys()) output_names = list(module.output_ports.keys()) dynamic_axes = defaultdict(list) - def __extract_dynamic_axes(port_name: str, ntype: NeuralType, - dynamic_axes: defaultdict): + def __extract_dynamic_axes(port_name: str, ntype: NeuralType, dynamic_axes: defaultdict): if ntype.axis2type: for axis_id, axistype in ntype.axis2type.items(): - if issubclass(axistype.semantics, BatchTag) or issubclass( - axistype.semantics, TimeTag): + if issubclass(axistype.semantics, BatchTag) or issubclass(axistype.semantics, TimeTag): dynamic_axes[port_name].append(axis_id) # This is a hack for Jasper to Jarvis export -- need re-design for this inputs_to_drop = set() outputs_to_drop = set() if type(module).__name__ == "JasperEncoder": - print(f"Module is JasperEncoder. We are removing" - f"input and output length ports since they " - f"are not needed for deployment") + print( + f"Module is JasperEncoder. 
We are removing" + f"input and output length ports since they " + f"are not needed for deployment" + ) inputs_to_drop.add("length") outputs_to_drop.add("encoded_lengths") @@ -1110,27 +975,28 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, traced_m.save(output) elif d_format == DeploymentFormat.ONNX: if input_example is None: - raise ValueError( - f'Example input is None, but ONNX tracing was' - f' attempted') + raise ValueError(f'Example input is None, but ONNX tracing was' f' attempted') if output_example is None: if isinstance(input_example, tuple): output_example = module.forward(*input_example) else: output_example = module.forward(input_example) with torch.jit.optimized_execution(True): - jitted_model = torch.jit.trace(module, - input_example) - - torch.onnx.export(jitted_model, input_example, output, - input_names=input_names, - output_names=output_names, - verbose=True, - export_params=True, - do_constant_folding=True, - dynamic_axes=dynamic_axes, - opset_version=10, - example_outputs=output_example) + jitted_model = torch.jit.trace(module, input_example) + + torch.onnx.export( + jitted_model, + input_example, + output, + input_names=input_names, + output_names=output_names, + verbose=True, + export_params=True, + do_constant_folding=True, + dynamic_axes=dynamic_axes, + opset_version=10, + example_outputs=output_example, + ) # fn = output + ".readable" # with open(fn, 'w') as f: # tempModel = onnx.load(output) @@ -1145,12 +1011,11 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, json.dump(local_parameters, outfile) else: - raise NotImplementedError( - f"Not supported deployment format: {d_format}") + raise NotImplementedError(f"Not supported deployment format: {d_format}") except Exception as e: # nopep8 - nemo.logging.error(f'ERROR: module export failed for {module} ' - f'with exception {e}') + nemo.logging.error(f'ERROR: module export failed for {module} ' f'with exception {e}') finally: + def __old_call__(self, force_pt=False, *input, **kwargs): pt_call = len(input) > 0 or force_pt if pt_call: @@ -1161,11 +1026,9 @@ def __old_call__(self, force_pt=False, *input, **kwargs): type(module).__call__ = __old_call__ @staticmethod - def deployment_export(module, - output: str, - d_format: DeploymentFormat, - input_example=None, - output_example=None): + def deployment_export( + module, output: str, d_format: DeploymentFormat, input_example=None, output_example=None, + ): """Exports Neural Module instance for deployment. 
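Example (a minimal sketch, not from the original docstring; the module, batch, and output path are hypothetical): PtActions.deployment_export(module=jasper_encoder, output='/tmp/encoder.onnx', d_format=DeploymentFormat.ONNX, input_example=example_batch,) 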
Args: @@ -1183,20 +1046,23 @@ def deployment_export(module, output=output, d_format=d_format, input_example=input_example, - output_example=output_example) - - def train(self, - tensors_to_optimize, - optimizer=None, - optimization_params=None, - callbacks: Optional[List[ActionCallback]] = None, - lr_policy=None, - batches_per_step=None, - stop_on_nan_loss=False, - synced_batchnorm=False, - synced_batchnorm_groupsize=0, - gradient_predivide=False, - amp_max_loss_scale=2.**24): + output_example=output_example, + ) + + def train( + self, + tensors_to_optimize, + optimizer=None, + optimization_params=None, + callbacks: Optional[List[ActionCallback]] = None, + lr_policy=None, + batches_per_step=None, + stop_on_nan_loss=False, + synced_batchnorm=False, + synced_batchnorm_groupsize=0, + gradient_predivide=False, + amp_max_loss_scale=2.0 ** 24, + ): if not optimization_params: optimization_params = {} num_epochs = optimization_params.get("num_epochs", None) @@ -1218,19 +1084,14 @@ def train(self, return # Check if tensors_to_optimize is just a list of NmTensors elif tensors_to_optimize is not None and ( - isinstance(tensors_to_optimize[0], - NmTensor) and PtActions._check_all_tensors( - tensors_to_optimize)): + isinstance(tensors_to_optimize[0], NmTensor) and PtActions._check_all_tensors(tensors_to_optimize) + ): # Parse graph into a topologically sorted sequence of neural # modules' calls - opt_call_chain, t_dataset = \ - self.__get_top_sorted_modules_and_dataloader( - hook=tensors_to_optimize - ) + (opt_call_chain, t_dataset,) = self.__get_top_sorted_modules_and_dataloader(hook=tensors_to_optimize) # Extract trainable weights which will be optimized params_list = [ - p[0].parameters() for p in opt_call_chain - if isinstance(p[0], TrainableNM) or p[0].is_trainable() + p[0].parameters() for p in opt_call_chain if isinstance(p[0], TrainableNM) or p[0].is_trainable() ] params_to_optimize = itertools.chain(*params_list) @@ -1250,35 +1111,27 @@ def train(self, params_to_optimize=params_to_optimize, ) - training_loop = [ - (optimizer, tensors_to_optimize, opt_call_chain) - ] + training_loop = [(optimizer, tensors_to_optimize, opt_call_chain)] self.optimizers.append(optimizer) - assert len(self.optimizers) == 1, \ - ("There was more than one optimizer, was create_optimizer() " - "called before train()?") + assert len(self.optimizers) == 1, ( + "There was more than one optimizer, was create_optimizer() " "called before train()?" 
+ ) elif PtActions._check_tuples(tensors_to_optimize): if batches_per_step != 1: - raise ValueError("Gradient accumlation with multiple " - "optimizers is not supported") + raise ValueError("Gradient accumulation with multiple " "optimizers is not supported") datasets = [] training_loop = [] for step in tensors_to_optimize: - step_call_chain, dataset = \ - self.__get_top_sorted_modules_and_dataloader( - hook=step[1] - ) + (step_call_chain, dataset,) = self.__get_top_sorted_modules_and_dataloader(hook=step[1]) datasets.append(dataset) - training_loop.append( - (step[0], step[1], step_call_chain)) + training_loop.append((step[0], step[1], step_call_chain)) t_dataset = datasets[0] for dataset in datasets: if type(dataset) is not type(t_dataset): - raise ValueError( - "There were two training datasets, we only support 1.") + raise ValueError("There were two training datasets, but only one is supported.") else: raise ValueError("tensors_to_optimize was not understood") @@ -1287,20 +1140,16 @@ def train(self, if callbacks is not None: for callback in callbacks: if not isinstance(callback, ActionCallback): - raise ValueError("A callback was received that was not a " - "child of ActionCallback") + raise ValueError("A callback was received that was not a " "child of ActionCallback") elif isinstance(callback, SimpleLossLoggerCallback): if logging_callchain: - raise ValueError("We only support one logger callback " - "but more than one were found") + raise ValueError("We only support one logger callback " "but more than one was found") logger_step_freq = callback._step_freq logging_tensors = callback.tensors all_tensors = logging_tensors for step in training_loop: all_tensors = all_tensors + step[1] - logging_callchain, _ = \ - self.__get_top_sorted_modules_and_dataloader( - hook=all_tensors) + (logging_callchain, _,) = self.__get_top_sorted_modules_and_dataloader(hook=all_tensors) self._get_all_modules(training_loop, callbacks, logging_callchain) @@ -1314,12 +1163,15 @@ def train(self, optimizer=self.optimizers, optim_level=self._optim_level, amp_max_loss_scale=amp_max_loss_scale, - amp_min_loss_scale=optimization_params.get( - 'amp_min_loss_scale', 1.0)) + amp_min_loss_scale=optimization_params.get('amp_min_loss_scale', 1.0), + ) # Use stored mapping to map amp_init opts to training loop for i, step in enumerate(training_loop): training_loop[i] = ( - self.optimizers[training_loop_opts[i]], step[1], step[2]) + self.optimizers[training_loop_opts[i]], + step[1], + step[2], + ) dataNM = training_loop[0][2][0][0] if dataNM.placement == DeviceType.AllGpu: @@ -1329,16 +1181,11 @@ def train(self, # "optimizers") nemo.logging.info("Doing distributed training") if t_dataset is not None: - train_sampler = \ - torch.utils.data.distributed.DistributedSampler( - t_dataset - ) + train_sampler = torch.utils.data.distributed.DistributedSampler(t_dataset) train_dataloader = torch.utils.data.DataLoader( dataset=t_dataset, sampler=train_sampler, - num_workers=dataNM.local_parameters.get( - "num_workers", os.cpu_count() - ), + num_workers=dataNM.local_parameters.get("num_workers", os.cpu_count()), batch_size=dataNM.local_parameters["batch_size"], shuffle=(train_sampler is None), ) @@ -1354,33 +1201,28 @@ def train(self, for i in range(1, len(call_chain) - 1): key = call_chain[i][0].unique_instance_id pmodule = self.module_reference_table[key][1] - if (not isinstance(pmodule, DDP) and - isinstance(pmodule, torch.nn.Module)): + if not isinstance(pmodule, DDP) and isinstance(pmodule, torch.nn.Module): gpf = 1 if 
gradient_predivide: gpf = dist.get_world_size() pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Convert batchnorm modules to synced if applicable - if (synced_batchnorm and - isinstance(pmodule, torch.nn.Module)): + if synced_batchnorm and isinstance(pmodule, torch.nn.Module): world_size = dist.get_world_size() - if (synced_batchnorm_groupsize > 0 and - world_size % synced_batchnorm_groupsize != 0): + if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: raise ValueError( f"Synchronized batch norm group size" f" ({synced_batchnorm_groupsize}) must be 0" f" or divide total number of GPUs" f" ({world_size})." ) - process_group = create_syncbn_process_group( - synced_batchnorm_groupsize) - pmodule = convert_syncbn( - pmodule, - process_group=process_group) + process_group = create_syncbn_process_group(synced_batchnorm_groupsize) + pmodule = convert_syncbn(pmodule, process_group=process_group) self.module_reference_table[key] = ( - self.module_reference_table[key][0], pmodule + self.module_reference_table[key][0], + pmodule, ) # single GPU/CPU training else: @@ -1389,9 +1231,7 @@ def train(self, train_dataloader = torch.utils.data.DataLoader( dataset=t_dataset, sampler=None, - num_workers=dataNM.local_parameters.get( - "num_workers", os.cpu_count() - ), + num_workers=dataNM.local_parameters.get("num_workers", os.cpu_count()), batch_size=dataNM.local_parameters["batch_size"], shuffle=dataNM.local_parameters.get("shuffle", True), ) @@ -1422,17 +1262,14 @@ def train(self, if batch_counter == 0: # Started step, zero gradients - curr_optimizer = training_loop[ - self.step % len(training_loop)][0] + curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks self._perform_on_iteration_start(callbacks=callbacks) # set learning rate policy if lr_policy is not None: - adjusted_lr = lr_policy( - optimization_params["lr"], self.step, self.epoch_num - ) + adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch_num) for param_group in curr_optimizer.param_groups: param_group["lr"] = adjusted_lr if self.tb_writer is not None: @@ -1442,8 +1279,7 @@ def train(self, # registered_tensors will contain created tensors # named by output port and uuid of module which created them # Get and properly name tensors returned by data layer - curr_call_chain = training_loop[ - self.step % len(training_loop)][2] + curr_call_chain = training_loop[self.step % len(training_loop)][2] dl_device = curr_call_chain[0][0]._device if logging_callchain and self.step % logger_step_freq == 0: curr_call_chain = logging_callchain @@ -1457,26 +1293,23 @@ def train(self, tensors.append(d) registered_tensors = { - t.unique_name: d for t, d in - zip(curr_call_chain[0][2].values(), tensors) - if t is not None + t.unique_name: d for t, d in zip(curr_call_chain[0][2].values(), tensors) if t is not None } disable_allreduce = batch_counter < (batches_per_step - 1) self.__nm_graph_forward_pass( call_chain=curr_call_chain, registered_tensors=registered_tensors, - disable_allreduce=disable_allreduce + disable_allreduce=disable_allreduce, ) - curr_tensors_to_optimize = training_loop[ - self.step % len(training_loop)][1] + curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 nan = False for tensor in curr_tensors_to_optimize: - if torch.isnan( - registered_tensors[tensor.unique_name]).any() \ - or torch.isinf( - registered_tensors[tensor.unique_name]).any(): + if ( + 
torch.isnan(registered_tensors[tensor.unique_name]).any() + or torch.isinf(registered_tensors[tensor.unique_name]).any() + ): if stop_on_nan_loss: raise ValueError('Loss is NaN or inf - exiting') nemo.logging.warning('WARNING: Loss is NaN or inf') @@ -1486,47 +1319,35 @@ def train(self, final_loss += registered_tensors[tensor.unique_name] if nan: continue - if self._optim_level in AmpOptimizations \ - and self._optim_level != Optimization.mxprO0: - with amp.scale_loss( - final_loss, - curr_optimizer, - delay_unscale=disable_allreduce - ) as scaled_loss: - if torch.isnan(scaled_loss).any() \ - or torch.isinf(scaled_loss).any(): + if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: + with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce,) as scaled_loss: + if torch.isnan(scaled_loss).any() or torch.isinf(scaled_loss).any(): if stop_on_nan_loss: - raise ValueError('Loss is NaN or inf -' - ' exiting') + raise ValueError('Loss is NaN or inf -' ' exiting') nemo.logging.warning('WARNING: Loss is NaN or inf') curr_optimizer.zero_grad() continue - scaled_loss.backward( - bps_scale.to(scaled_loss.get_device())) + scaled_loss.backward(bps_scale.to(scaled_loss.get_device())) # no AMP optimizations needed else: # multi-GPU, float32 if self._local_rank is not None: - final_loss.backward( - bps_scale.to(final_loss.get_device())) + final_loss.backward(bps_scale.to(final_loss.get_device())) # single device (CPU or GPU) else: - final_loss.backward( - bps_scale.to(final_loss.get_device())) + final_loss.backward(bps_scale.to(final_loss.get_device())) batch_counter += 1 if batch_counter == batches_per_step: # Ended step. Do optimizer update if grad_norm_clip is not None: - torch.nn.utils.clip_grad_norm_( - master_params(curr_optimizer), grad_norm_clip) + torch.nn.utils.clip_grad_norm_(master_params(curr_optimizer), grad_norm_clip) curr_optimizer.step() batch_counter = 0 # Register iteration end with callbacks self._update_callbacks( - callbacks=callbacks, - registered_tensors=registered_tensors, + callbacks=callbacks, registered_tensors=registered_tensors, ) self._perform_on_iteration_end(callbacks=callbacks) self.step += 1 @@ -1536,21 +1357,21 @@ def train(self, self.epoch_num += 1 self._perform_on_action_end(callbacks=callbacks) - def infer(self, - tensors, - checkpoint_dir=None, - ckpt_pattern='', - verbose=True, - cache=False, - use_cache=False, - offload_to_cpu=True, - modules_to_restore=None): + def infer( + self, + tensors, + checkpoint_dir=None, + ckpt_pattern='', + verbose=True, + cache=False, + use_cache=False, + offload_to_cpu=True, + modules_to_restore=None, + ): """See NeuralModuleFactory.infer() """ - call_chain, _ = self.__get_top_sorted_modules_and_dataloader( - hook=tensors - ) + call_chain, _ = self.__get_top_sorted_modules_and_dataloader(hook=tensors) if checkpoint_dir: # Find all modules that need to be restored if modules_to_restore is None: @@ -1565,44 +1386,40 @@ def infer(self, modules_to_restore_name = [] for mod in modules_to_restore: if not isinstance(mod, NeuralModule): - raise ValueError("Found something that was not a Neural " - "Module inside modules_to_restore") + raise ValueError("Found something that was not a Neural " "Module inside modules_to_restore") elif mod.num_weights == 0: - raise ValueError("Found a Neural Module with 0 weights " - "inside modules_to_restore") + raise ValueError("Found a Neural Module with 0 weights " "inside modules_to_restore") modules_to_restore_name.append(str(mod)) - module_checkpoints 
= get_checkpoint_from_dir( - modules_to_restore_name, checkpoint_dir, ckpt_pattern - ) + module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, checkpoint_dir, ckpt_pattern) for mod, checkpoint in zip(modules_to_restore, module_checkpoints): nemo.logging.info(f"Restoring {mod} from {checkpoint}") mod.restore_from(checkpoint, self._local_rank) # Init Amp - if (self._optim_level in AmpOptimizations - and self._optim_level != Optimization.mxprO0 - and not self.amp_initialized): + if ( + self._optim_level in AmpOptimizations + and self._optim_level != Optimization.mxprO0 + and not self.amp_initialized + ): pt_modules = [] for i in range(len(call_chain)): if isinstance(call_chain[i][0], nn.Module): pt_modules.append(call_chain[i][0]) - elif isinstance(call_chain[i][0], - TrainableNeuralModuleWrapper): + elif isinstance(call_chain[i][0], TrainableNeuralModuleWrapper): pt_modules.append(call_chain[i][0]._pt_module) amp.initialize( - min_loss_scale=1.0, - models=pt_modules, - optimizers=None, - opt_level=AmpOptimizations[self._optim_level], + min_loss_scale=1.0, models=pt_modules, optimizers=None, opt_level=AmpOptimizations[self._optim_level], ) self.amp_initialized = True # Run infer - return self._infer(tensors_to_return=tensors, - verbose=verbose, - cache=cache, - use_cache=use_cache, - offload_to_cpu=offload_to_cpu) + return self._infer( + tensors_to_return=tensors, + verbose=verbose, + cache=cache, + use_cache=use_cache, + offload_to_cpu=offload_to_cpu, + ) diff --git a/nemo/backends/pytorch/common/losses.py b/nemo/backends/pytorch/common/losses.py index 85ab9e793682..295c09ba1ce4 100644 --- a/nemo/backends/pytorch/common/losses.py +++ b/nemo/backends/pytorch/common/losses.py @@ -2,12 +2,7 @@ from torch import nn from nemo.backends.pytorch.nm import LossNM -from nemo.core.neural_types import (NeuralType, - AxisType, - BatchTag, - TimeTag, - ChannelTag, - RegressionTag) +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, RegressionTag, TimeTag __all__ = ['SequenceLoss', 'CrossEntropyLoss', 'MSELoss'] @@ -54,15 +49,8 @@ def input_ports(self): """ return { - 'log_probs': NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - 'targets': NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + 'log_probs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + 'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -73,15 +61,19 @@ def output_ports(self): NeuralType(None) """ - return { - "loss": NeuralType(None) - } - - def __init__(self, pad_id=0, smoothing_coef=0.0, sample_wise=False, - aux_ctc=False, ctc_initial_coef=0.1, ctc_blank_id=None, - **kwargs): - assert (not aux_ctc) or (ctc_blank_id is not None), \ - "Should be a blank id if using CTC loss" + return {"loss": NeuralType(None)} + + def __init__( + self, + pad_id=0, + smoothing_coef=0.0, + sample_wise=False, + aux_ctc=False, + ctc_initial_coef=0.1, + ctc_blank_id=None, + **kwargs + ): + assert (not aux_ctc) or (ctc_blank_id is not None), "Should be a blank id if using CTC loss" super().__init__(**kwargs) @@ -92,8 +84,7 @@ def __init__(self, pad_id=0, smoothing_coef=0.0, sample_wise=False, self.ctc_coef = ctc_initial_coef if aux_ctc: - self.ctc = nn.CTCLoss(blank=ctc_blank_id, - reduction='none', zero_infinity=True) + self.ctc = nn.CTCLoss(blank=ctc_blank_id, reduction='none', zero_infinity=True) self.ctc = self.ctc.to(self._device) def _loss_function(self, log_probs, targets): @@ 
-112,9 +103,7 @@ def _loss_function(self, log_probs, targets): def _ce_loss(self, log_probs, targets, pad_mask): target_log_probs = log_probs.gather(2, targets.unsqueeze(2)).squeeze(2) - loss = \ - (1.0 - self.smoothing_coef) * target_log_probs \ - + self.smoothing_coef * log_probs.mean(-1) + loss = (1.0 - self.smoothing_coef) * target_log_probs + self.smoothing_coef * log_probs.mean(-1) pad_mask = pad_mask.float() loss = -torch.sum(loss * pad_mask) if self.sample_wise: @@ -150,13 +139,8 @@ def input_ports(self): """ return { - "logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag) - }), - "labels": NeuralType({ - 0: AxisType(BatchTag), - }) + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "labels": NeuralType({0: AxisType(BatchTag),}), } @property @@ -166,9 +150,7 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, weight=None, **kwargs): LossNM.__init__(self, **kwargs) @@ -176,15 +158,12 @@ def __init__(self, weight=None, **kwargs): weight = torch.FloatTensor(weight).to(self._device) self._criterion = nn.CrossEntropyLoss(weight=weight) - def _loss_function(self, - logits, - labels): + def _loss_function(self, logits, labels): loss = self._criterion(logits, labels) return loss class MSELoss(LossNM): - @property def input_ports(self): """Returns definitions of module input ports. @@ -196,12 +175,8 @@ def input_ports(self): 0: AxisType(RegressionTag) """ return { - "preds": NeuralType({ - 0: AxisType(RegressionTag) - }), - "labels": NeuralType({ - 0: AxisType(RegressionTag) - }) + "preds": NeuralType({0: AxisType(RegressionTag)}), + "labels": NeuralType({0: AxisType(RegressionTag)}), } @property @@ -211,9 +186,7 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, **kwargs): LossNM.__init__(self, **kwargs) diff --git a/nemo/backends/pytorch/common/metrics.py b/nemo/backends/pytorch/common/metrics.py index d5bc9e495543..adf2a83b7e3c 100644 --- a/nemo/backends/pytorch/common/metrics.py +++ b/nemo/backends/pytorch/common/metrics.py @@ -1,8 +1,9 @@ import torch -def char_lm_metrics(chars_log_probs_batches, chars_targets_batches, - targets_texts_batches, pad_id): +def char_lm_metrics( + chars_log_probs_batches, chars_targets_batches, targets_texts_batches, pad_id, +): """Calculate metrics for language modeling. 
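In brief (summarizing the code below): per example, nll is the negative log-likelihood summed over non-pad characters, bpc = nll / char_len, and ppl = 2 ** (nll / word_len). 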
Args: @@ -19,19 +20,12 @@ def char_lm_metrics(chars_log_probs_batches, chars_targets_batches, """ bpcs, ppls = [], [] - for log_probs, targets, texts in zip( - chars_log_probs_batches, - chars_targets_batches, - targets_texts_batches - ): + for log_probs, targets, texts in zip(chars_log_probs_batches, chars_targets_batches, targets_texts_batches): target_log_probs = log_probs.gather(2, targets.unsqueeze(2)).squeeze(2) pad_mask = (targets != pad_id).long() nll = -(target_log_probs * pad_mask.float()).sum(-1) char_lens = pad_mask.float().sum(-1) - word_lens = torch.tensor( - [len(text.split()) for text in texts], - dtype=torch.float, device=char_lens.device - ) + word_lens = torch.tensor([len(text.split()) for text in texts], dtype=torch.float, device=char_lens.device,) bpc = nll / char_lens ppl = 2 ** (nll / word_lens) # ppl = 2 ** (bpc * ENG_MWN) # ~5.3 diff --git a/nemo/backends/pytorch/common/other.py b/nemo/backends/pytorch/common/other.py index 1829094f9038..982abd100446 100644 --- a/nemo/backends/pytorch/common/other.py +++ b/nemo/backends/pytorch/common/other.py @@ -1,14 +1,16 @@ # Copyright (c) 2019 NVIDIA Corporation """Core PyTorch-base Neural Modules""" -__all__ = ['SimpleCombiner', - 'ArgMaxSimple', - 'TableLookUp', - 'TableLookUp2', - 'SequenceEmbedding', - 'SequenceProjection', - 'ZerosLikeNM'] - -from typing import Iterable, Optional, Mapping, Set, Dict +__all__ = [ + 'SimpleCombiner', + 'ArgMaxSimple', + 'TableLookUp', + 'TableLookUp2', + 'SequenceEmbedding', + 'SequenceProjection', + 'ZerosLikeNM', +] + +from typing import Dict, Iterable, Mapping, Optional, Set import torch import torch.nn as nn @@ -38,10 +40,7 @@ def input_ports(self): x2: Empty?!? """ - return { - "x1": NeuralType({}), - "x2": NeuralType({}) - } + return {"x1": NeuralType({}), "x2": NeuralType({})} @property def output_ports(self): @@ -50,9 +49,7 @@ def output_ports(self): combined: None """ - return { - "combined": None - } + return {"combined": None} def __init__(self, mode="add", **kwargs): TrainableNM.__init__(self, **kwargs) @@ -64,9 +61,7 @@ def forward(self, x1, x2): elif self._mode == "max": return torch.max(x1, x2, out=None) else: - raise NotImplementedError( - "SimpleCombiner does not have {0} mode".format(self._mode) - ) + raise NotImplementedError("SimpleCombiner does not have {0} mode".format(self._mode)) class ArgMaxSimple(TrainableNM): # Notice TWO base classes @@ -82,12 +77,7 @@ def input_ports(self): 1: AxisType(ChannelTag) """ - return { - "x": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag) - }) - } + return {"x": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} @property def output_ports(self): @@ -133,10 +123,7 @@ def input_ports(self): 1: AxisType(BatchTag) """ - return { - "indices": NeuralType( - {0: AxisType(TimeTag), 1: AxisType(BatchTag)}) - } + return {"indices": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag)})} @property def output_ports(self): @@ -146,13 +133,9 @@ def output_ports(self): 0: AxisType(BatchTag) 1: AxisType(TimeTag) """ - return { - "indices": NeuralType( - {0: AxisType(BatchTag), 1: AxisType(TimeTag)}) - } + return {"indices": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} - def set_weights(self, name2weight: Dict[(str, bool)], - name2name_and_transform): + def set_weights(self, name2weight: Dict[(str, bool)], name2name_and_transform): pass def tie_weights_with(self, module, weight_names): @@ -188,8 +171,7 @@ def __call__(self, force_pt=False, *input, **kwargs): sublst.append(tid) else: break - result.append( - 
list(map(lambda x: self._ids2classes[x], sublst))) + result.append(list(map(lambda x: self._ids2classes[x], sublst))) return [result] else: return NeuralModule.__call__(self, **kwargs) @@ -204,8 +186,7 @@ def get_weights(self) -> Iterable[Optional[Mapping]]: class TableLookUp2(NeuralModule): """Performs a table lookup. For example, convert class ids to names""" - def set_weights(self, name2weight: Dict[(str, bool)], - name2name_and_transform): + def set_weights(self, name2weight: Dict[(str, bool)], name2name_and_transform): pass def tie_weights_with(self, module, weight_names): @@ -228,8 +209,7 @@ def input_ports(self): """Returns definitions of module input ports. """ - return { - } + return {} @property def output_ports(self): @@ -238,9 +218,7 @@ def output_ports(self): classes: None """ - return { - "classes": None - } + return {"classes": None} def __init__(self, detokenizer=None, **kwargs): NeuralModule.__init__(self, **kwargs) @@ -274,7 +252,6 @@ def get_weights(self) -> Iterable[Optional[Mapping]]: class SequenceEmbedding(TrainableNM): - @property def input_ports(self): """Returns definitions of module input ports. @@ -284,12 +261,7 @@ def input_ports(self): 1: AxisType(BatchTag) """ - return { - "input_seq": NeuralType({ - 0: AxisType(TimeTag), - 1: AxisType(BatchTag) - }) - } + return {"input_seq": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag)})} @property def output_ports(self): @@ -302,13 +274,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "outputs": NeuralType({ - 0: AxisType(TimeTag), - 1: AxisType(BatchTag), - 2: AxisType(ChannelTag) - }) - } + return {"outputs": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag), 2: AxisType(ChannelTag),})} def __init__(self, *, voc_size, hidden_size, dropout=0.0, **kwargs): TrainableNM.__init__(self, **kwargs) @@ -328,7 +294,6 @@ def forward(self, input_seq): class SequenceProjection(TrainableNM): - @property def input_ports(self): """Returns definitions of module input ports. @@ -336,9 +301,7 @@ def input_ports(self): input_seq: Empty Type?!? """ - return { - "input_seq": NeuralType({}) - } + return {"input_seq": NeuralType({})} @property def output_ports(self): @@ -347,9 +310,7 @@ def output_ports(self): outputs: None """ - return { - "outputs": None - } + return {"outputs": None} def __init__(self, *, from_dim, to_dim, dropout=0.0, **kwargs): TrainableNM.__init__(self, **kwargs) @@ -369,7 +330,6 @@ def forward(self, input_seq): class ZerosLikeNM(TrainableNM): - @property def input_ports(self): """Returns definitions of module input ports. 
@@ -379,12 +339,7 @@ def input_ports(self): 1: AxisType(TimeTag) """ - return { - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - }) - } + return {"input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag),})} @property def output_ports(self): @@ -395,13 +350,7 @@ def output_ports(self): 1: AxisType(TimeTag) """ - return { - "input_type_ids": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - }) - } + return {"input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag),})} def __init__(self, **kwargs): TrainableNM.__init__(self, **kwargs) diff --git a/nemo/backends/pytorch/common/parts.py b/nemo/backends/pytorch/common/parts.py index e1e41a7916f4..5c1a2e99351c 100644 --- a/nemo/backends/pytorch/common/parts.py +++ b/nemo/backends/pytorch/common/parts.py @@ -3,7 +3,6 @@ import os import torch -# noinspection PyPep8Naming from torch import nn @@ -33,19 +32,13 @@ def forward(self, query, context): query = self.linear_in(query) query = query.view(batch_size, output_len, dims) - attention_scores = torch.bmm( - query, context.transpose(1, 2).contiguous() - ) + attention_scores = torch.bmm(query, context.transpose(1, 2).contiguous()) - attention_scores = attention_scores.view( - batch_size * output_len, query_len - ) + attention_scores = attention_scores.view(batch_size * output_len, query_len) attention_weights = self.softmax(attention_scores) if self.dropout.p != 0.0: attention_weights = self.dropout(attention_weights) - attention_weights = attention_weights.view( - batch_size, output_len, query_len - ) + attention_weights = attention_weights.view(batch_size, output_len, query_len) mix = torch.bmm(attention_weights, context) @@ -72,19 +65,15 @@ class MultiLayerPerceptron(nn.Module): log_softmax (bool): whether to add a log_softmax layer before output """ - def __init__(self, - hidden_size, - num_classes, - device, - num_layers=2, - activation='relu', - log_softmax=True): + def __init__( + self, hidden_size, num_classes, device, num_layers=2, activation='relu', log_softmax=True, + ): super().__init__() self.layers = 0 for _ in range(num_layers - 1): layer = nn.Linear(hidden_size, hidden_size).to(device) setattr(self, f'layer{self.layers}', layer) - setattr(self, f'layer{self.layers+1}', getattr(torch, activation)) + setattr(self, f'layer{self.layers + 1}', getattr(torch, activation)) self.layers += 2 layer = nn.Linear(hidden_size, num_classes).to(device) setattr(self, f'layer{self.layers}', layer) @@ -93,7 +82,7 @@ def __init__(self, @property def last_linear_layer(self): - return getattr(self, f'layer{self.layers-1}') + return getattr(self, f'layer{self.layers - 1}') def forward(self, hidden_states): output_states = hidden_states[:] @@ -101,7 +90,6 @@ def forward(self, hidden_states): output_states = getattr(self, f'layer{i}')(output_states) if self.log_softmax: - output_states = torch.log_softmax( - output_states.float(), dim=-1).to(hidden_states.dtype) + output_states = torch.log_softmax(output_states.float(), dim=-1).to(hidden_states.dtype) # TODO: make it work with float16 return output_states diff --git a/nemo/backends/pytorch/common/rnn.py b/nemo/backends/pytorch/common/rnn.py index cc0e5d745851..c7f6fc66f5bc 100644 --- a/nemo/backends/pytorch/common/rnn.py +++ b/nemo/backends/pytorch/common/rnn.py @@ -3,14 +3,12 @@ import random import torch -# noinspection PyPep8Naming -import torch.nn.functional as F +import torch.nn.functional as pt_f from torch import nn from nemo.backends.pytorch.common.parts import 
diff --git a/nemo/backends/pytorch/common/rnn.py b/nemo/backends/pytorch/common/rnn.py
index cc0e5d745851..c7f6fc66f5bc 100644
--- a/nemo/backends/pytorch/common/rnn.py
+++ b/nemo/backends/pytorch/common/rnn.py
@@ -3,14 +3,12 @@ import random
 
 import torch
-# noinspection PyPep8Naming
-import torch.nn.functional as F
+import torch.nn.functional as pt_f
 from torch import nn
 
 from nemo.backends.pytorch.common.parts import Attention
 from nemo.backends.pytorch.nm import TrainableNM
-from nemo.core.neural_types import NeuralType, AxisType, BatchTag, TimeTag, \
-    ChannelTag
+from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag
 from nemo.utils.misc import pad_to
 
@@ -67,15 +65,10 @@ def input_ports(self):
 
             2: AxisType(ChannelTag)
         """
         return {
-            'targets': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)
-            }),
-            'encoder_outputs': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(ChannelTag)
-            }, optional=True)
+            'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            'encoder_outputs': NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}, optional=True,
+            ),
         }
 
     @property
@@ -97,33 +90,29 @@ def output_ports(self):
 
             2: AxisType(TimeTag)
         """
         return {
-            'log_probs': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(ChannelTag)
-            }),
-            'attention_weights': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(TimeTag)
-            }, optional=True)
+            'log_probs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}),
+            'attention_weights': NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(TimeTag),}, optional=True,
+            ),
         }
 
-    def __init__(self,
-                 voc_size,
-                 bos_id,
-                 hidden_size,
-                 attention_method='general',
-                 attention_type='post',
-                 in_dropout=0.2,
-                 gru_dropout=0.2,
-                 attn_dropout=0.0,
-                 teacher_forcing=1.0,
-                 curriculum_learning=0.5,
-                 rnn_type='gru',
-                 n_layers=2,
-                 tie_emb_out_weights=True,
-                 **kwargs):
+    def __init__(
+        self,
+        voc_size,
+        bos_id,
+        hidden_size,
+        attention_method='general',
+        attention_type='post',
+        in_dropout=0.2,
+        gru_dropout=0.2,
+        attn_dropout=0.0,
+        teacher_forcing=1.0,
+        curriculum_learning=0.5,
+        rnn_type='gru',
+        n_layers=2,
+        tie_emb_out_weights=True,
+        **kwargs
+    ):
         super().__init__(**kwargs)
 
         self.bos_id = bos_id
@@ -137,37 +126,30 @@ def __init__(self,
         # noinspection PyTypeChecker
         self.in_dropout = nn.Dropout(in_dropout)
         rnn_class = getattr(nn, rnn_type.upper())
-        self.rnn = rnn_class(hidden_size, hidden_size, n_layers,
-                             dropout=(0 if n_layers == 1 else gru_dropout),
-                             batch_first=True)
+        self.rnn = rnn_class(
+            hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else gru_dropout), batch_first=True,
+        )
         self.out = nn.Linear(hidden_size, voc_size)
         if tie_emb_out_weights:
             self.out.weight = self.embedding.weight  # Weight tying
-        self.attention = Attention(hidden_size, attention_method,
-                                   dropout=attn_dropout)
+        self.attention = Attention(hidden_size, attention_method, dropout=attn_dropout)
 
         # self.apply(init_weights)
         # self.gru.apply(init_weights)
         self.to(self._device)
 
     def forward(self, targets, encoder_outputs=None):
-        if (not self.training) \
-                or (random.random() <= self.teacher_forcing):  # Fast option
+        if (not self.training) or (random.random() <= self.teacher_forcing):  # Fast option
             # Removing last char (dont need to calculate loss) and add bos
             # noinspection PyTypeChecker
-            decoder_inputs = F.pad(
-                targets[:, :-1], (1, 0), value=self.bos_id
-            )  # BT
-            log_probs, _, attention_weights = \
-                self.forward_step(decoder_inputs, encoder_outputs)
+            decoder_inputs = pt_f.pad(targets[:, :-1], (1, 0), value=self.bos_id)  # BT
+            log_probs, _, attention_weights = self.forward_step(decoder_inputs, encoder_outputs)
         else:
-            log_probs, attention_weights = \
-                self.forward_cl(targets, encoder_outputs)
+            log_probs, attention_weights = self.forward_cl(targets, encoder_outputs)
 
         return log_probs, attention_weights
 
-    def forward_step(self, decoder_inputs,
-                     encoder_outputs=None, decoder_hidden=None):
+    def forward_step(self, decoder_inputs, encoder_outputs=None, decoder_hidden=None):
         """(BT, BTC@?, hBC@?) -> (BTC, hBC, BTT@?)"""
 
         # Inputs
@@ -178,32 +160,25 @@ def forward_step(self, decoder_inputs,
         # RNN
         if self.rnn_type == 'gru' and decoder_hidden is not None:
             decoder_hidden = decoder_hidden[0]
-        decoder_outputs, decoder_hidden = self.rnn(
-            decoder_inputs, decoder_hidden
-        )
+        decoder_outputs, decoder_hidden = self.rnn(decoder_inputs, decoder_hidden)
         if self.rnn_type == 'gru':
             decoder_hidden = (decoder_hidden,)
 
         # Outputs
         attention_weights = None
         if self.attention_type == 'post':
-            decoder_outputs, attention_weights = self.attention(
-                decoder_outputs, encoder_outputs
-            )
+            decoder_outputs, attention_weights = self.attention(decoder_outputs, encoder_outputs)
         decoder_outputs = self.out(decoder_outputs)
 
         # Log probs
-        log_probs = F.log_softmax(decoder_outputs, dim=-1)
+        log_probs = pt_f.log_softmax(decoder_outputs, dim=-1)
 
         return log_probs, decoder_hidden, attention_weights
 
     def forward_cl(self, targets, encoder_outputs=None):
         """(BT, BTC@?) -> (BTC, BTT@?)"""
 
-        decoder_input = torch.empty(
-            targets.size(0), 1,
-            dtype=torch.long, device=self._device
-        ).fill_(self.bos_id)
+        decoder_input = torch.empty(targets.size(0), 1, dtype=torch.long, device=self._device).fill_(self.bos_id)
         decoder_hidden = None
         log_probs = []
         attention_weights = []
@@ -211,10 +186,9 @@ def forward_cl(self, targets, encoder_outputs=None):
         max_len = targets.size(1)
         rands = torch.rand(max_len)  # Precalculate randomness
         for i in range(max_len):
-            step_log_prob, decoder_hidden, step_attention_weights = \
-                self.forward_step(
-                    decoder_input, encoder_outputs, decoder_hidden
-                )
+            (step_log_prob, decoder_hidden, step_attention_weights,) = self.forward_step(
+                decoder_input, encoder_outputs, decoder_hidden
+            )
             log_probs.append(step_log_prob)
             attention_weights.append(step_attention_weights)
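`DecoderRNN.forward` above switches between one fast teacher-forced pass and the stepwise `forward_cl` path based on `teacher_forcing`. A minimal sketch of that scheduled-sampling pattern, assuming an illustrative `step(inp, hidden)` callable that returns per-step log-probs:

import random
import torch

def decode(targets, step, bos_id, teacher_forcing=1.0):
    # Feed ground truth (shifted right by <bos>) with probability
    # `teacher_forcing`; otherwise feed back the model's own argmax.
    inp = torch.full((targets.size(0), 1), bos_id, dtype=torch.long)
    hidden, outputs = None, []
    for t in range(targets.size(1)):
        log_prob, hidden = step(inp, hidden)  # log_prob: (B, 1, V)
        outputs.append(log_prob)
        use_teacher = random.random() <= teacher_forcing
        inp = targets[:, t : t + 1] if use_teacher else log_prob.argmax(-1)
    return torch.cat(outputs, dim=1)  # (B, T, V)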
diff --git a/nemo/backends/pytorch/common/search.py b/nemo/backends/pytorch/common/search.py
index 87c3f74a59fb..812c22ce2cfd 100644
--- a/nemo/backends/pytorch/common/search.py
+++ b/nemo/backends/pytorch/common/search.py
@@ -3,13 +3,10 @@
 import torch
 
 from nemo.backends.pytorch.nm import NonTrainableNM
-from nemo.core.neural_types import (NeuralType,
-                                    AxisType, BatchTag,
-                                    TimeTag,
-                                    ChannelTag)
+from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag
 
 INF = float('inf')
-BIG_NUM = 1e+4
+BIG_NUM = 1e4
 
 
 # TODO: Validate, compare to `BeamSearch`
@@ -43,11 +40,9 @@ def input_ports(self):
 
             2: AxisType(ChannelTag)
         """
         return {
-            'encoder_outputs': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(ChannelTag),
-            }, optional=True)
+            'encoder_outputs': NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}, optional=True,
+            )
         }
 
     @property
@@ -67,19 +62,11 @@ def output_ports(self):
 
             2: AxisType(TimeTag)
         """
         return {
-            'predictions': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)
-            }),
-            'attention_weights': NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(TimeTag)
-            })
+            'predictions': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            'attention_weights': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(TimeTag),}),
         }
 
-    def __init__(self, decoder, pad_id, bos_id, eos_id, max_len,
-                 batch_size=None, **kwargs):
+    def __init__(self, decoder, pad_id, bos_id, eos_id, max_len, batch_size=None, **kwargs):
         super().__init__(**kwargs)
 
         self.decoder = decoder
@@ -92,24 +79,17 @@ def __init__(self, decoder, pad_id, bos_id, eos_id, max_len,
     @torch.no_grad()
     def forward(self, encoder_output):
         batch_size = encoder_output.size(0)
-        predictions = torch.empty(
-            batch_size, 1,
-            dtype=torch.long, device=self._device
-        ).fill_(self.bos_id)
+        predictions = torch.empty(batch_size, 1, dtype=torch.long, device=self._device).fill_(self.bos_id)
         pad_profile = torch.zeros_like(predictions)
 
         last_hidden = None
         for i in range(self.max_len):
-            log_prob, last_hidden = self.decoder.forward_step(
-                predictions[:, -1:], last_hidden, encoder_output
-            )
+            log_prob, last_hidden = self.decoder.forward_step(predictions[:, -1:], last_hidden, encoder_output)
             next_pred = torch.argmax(log_prob.squeeze(1), dim=-1, keepdim=True)
             # noinspection PyTypeChecker
-            next_pred = \
-                self.pad_id * pad_profile + next_pred * (1 - pad_profile)
+            next_pred = self.pad_id * pad_profile + next_pred * (1 - pad_profile)
             predictions = torch.cat((predictions, next_pred), dim=-1)
-            pad_profile = torch.max(pad_profile,
-                                    (next_pred == self.eos_id).long())
+            pad_profile = torch.max(pad_profile, (next_pred == self.eos_id).long())
             if pad_profile.sum() == batch_size:
                 break
 
@@ -138,10 +118,8 @@ class BeamSearch(GreedySearch):
 
     """
 
-    def __init__(self, decoder, pad_id, bos_id, eos_id, max_len,
-                 batch_size=None, beam_size=8, **kwargs):
-        super().__init__(decoder, pad_id, bos_id, eos_id, max_len,
-                         batch_size, **kwargs)
+    def __init__(self, decoder, pad_id, bos_id, eos_id, max_len, batch_size=None, beam_size=8, **kwargs):
+        super().__init__(decoder, pad_id, bos_id, eos_id, max_len, batch_size, **kwargs)
 
         self.beam_size = beam_size
 
@@ -154,33 +132,25 @@ def forward(self, encoder_outputs=None):
             # [BK]TC
             # encoder_output = encoder_output.repeat_interleave(k, 0)
             encoder_outputs = encoder_outputs.unsqueeze(1).repeat(1, k, 1, 1)
-            encoder_outputs = encoder_outputs.view(
-                -1, *encoder_outputs.shape[2:]
-            )
+            encoder_outputs = encoder_outputs.view(-1, *encoder_outputs.shape[2:])
        else:
             bs = self.batch_size
 
-        predictions = torch.empty(
-            bs * k, 1,
-            dtype=torch.long, device=self._device
-        ).fill_(self.bos_id)  # [BK]1
+        predictions = torch.empty(bs * k, 1, dtype=torch.long, device=self._device).fill_(self.bos_id)  # [BK]1
         scores = torch.zeros_like(predictions, dtype=fdtype)  # [BK]1
         pad_profile = torch.zeros_like(predictions)  # [BK]1
         if encoder_outputs is not None:
             t = encoder_outputs.shape[1]
             # [BK]1T
-            attention_weights = torch.empty(
-                bs * k, 1, t, dtype=fdtype, device=self._device
-            ).fill_(1. / t)
+            attention_weights = torch.empty(bs * k, 1, t, dtype=fdtype, device=self._device).fill_(1.0 / t)
         else:
             attention_weights = None
 
         last_hidden = None
         for i in range(self.max_len):
-            log_probs, last_hidden, attention_weights_i = \
-                self.decoder.forward_step(
-                    predictions[:, -1:], encoder_outputs, last_hidden
-                )  # [BK]1C, h[BK]C, [BK]1T
+            (log_probs, last_hidden, attention_weights_i,) = self.decoder.forward_step(
+                predictions[:, -1:], encoder_outputs, last_hidden
+            )  # [BK]1C, h[BK]C, [BK]1T
             log_probs = log_probs.squeeze(1)  # [BK]C
 
@@ -198,9 +168,7 @@ def forward(self, encoder_outputs=None):
                 scores_i[mask, 0] = 0.0
                 scores = scores + scores_i
                 scores[mask, 1:] = -INF
-            scores, indices_i = torch.topk(
-                scores.view(-1, k ** 2), k
-            )  # BK, BK
+            scores, indices_i = torch.topk(scores.view(-1, k ** 2), k)  # BK, BK
             scores = scores.view(-1, 1)  # [BK]1
 
             pad_mask = pad_profile.repeat(1, k)  # [BK]K
@@ -209,12 +177,12 @@ def forward(self, encoder_outputs=None):
             predicted_i = pad_mask * self.pad_id + (1 - pad_mask) * predicted_i
             predictions = predictions.unsqueeze(1).repeat(1, k, 1)  # [BK]KL
             # [BK]K[L+1]
-            predictions = torch.cat(
-                (predictions, predicted_i.unsqueeze(2)), dim=-1
-            )
-            predictions = predictions.view(bs, k ** 2, -1).gather(
-                1, indices_i.unsqueeze(2).repeat(1, 1, predictions.size(-1))
-            ).view(-1, predictions.size(-1))  # [BK][L+1]
+            predictions = torch.cat((predictions, predicted_i.unsqueeze(2)), dim=-1)
+            predictions = (
+                predictions.view(bs, k ** 2, -1)
+                .gather(1, indices_i.unsqueeze(2).repeat(1, 1, predictions.size(-1)),)
+                .view(-1, predictions.size(-1))
+            )  # [BK][L+1]
 
             new_tensors = []
             for t in last_hidden:
@@ -222,23 +190,18 @@ def forward(self, encoder_outputs=None):
             last_hidden = tuple(new_tensors)
 
             if attention_weights_i is not None:
-                attention_weights = torch.cat(
-                    (attention_weights, attention_weights_i), dim=1
-                )
-                attention_weights = self.choose(attention_weights,
-                                                indices_i, 0)
+                attention_weights = torch.cat((attention_weights, attention_weights_i), dim=1)
+                attention_weights = self.choose(attention_weights, indices_i, 0)
 
-            pad_profile = \
-                ((predictions[:, -1:] == self.eos_id)
-                 | (predictions[:, -1:] == self.pad_id)).long()  # [BK]1
+            pad_profile = ((predictions[:, -1:] == self.eos_id) | (predictions[:, -1:] == self.pad_id)).long()  # [BK]1
 
             if pad_profile.sum() == bs * k:
                 break
 
         best_i = torch.argmax(scores.view(bs, k), dim=-1, keepdim=True)  # B1
-        predictions = predictions.view(bs, k, -1).gather(
-            1, best_i.repeat(1, predictions.size(1)).unsqueeze(1)
-        ).squeeze(1)  # BT
+        predictions = (
+            predictions.view(bs, k, -1).gather(1, best_i.repeat(1, predictions.size(1)).unsqueeze(1)).squeeze(1)
+        )  # BT
         attention_weights = attention_weights[:, 1:, :]  # -eos
         shape_suf = attention_weights.shape[1:]
         attention_weights = attention_weights.view(bs, k, *shape_suf)
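The pad-profile trick in `GreedySearch.forward` above forces every step after <eos> to emit <pad>. A self-contained sketch of that loop (the `step` callable and the token ids are assumptions for illustration):

import torch

def greedy_decode(step, bos_id, eos_id, pad_id, batch_size, max_len):
    preds = torch.full((batch_size, 1), bos_id, dtype=torch.long)
    finished = torch.zeros_like(preds)  # 1 where <eos> was already produced
    hidden = None
    for _ in range(max_len):
        log_prob, hidden = step(preds[:, -1:], hidden)      # (B, 1, V)
        nxt = log_prob.squeeze(1).argmax(-1, keepdim=True)  # (B, 1)
        nxt = finished * pad_id + (1 - finished) * nxt      # freeze finished rows
        preds = torch.cat((preds, nxt), dim=-1)
        finished = torch.max(finished, (nxt == eos_id).long())
        if finished.sum() == batch_size:                    # everyone done early
            break
    return preds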
diff --git a/nemo/backends/pytorch/common/zero_data.py b/nemo/backends/pytorch/common/zero_data.py
index a2eba89d1798..8b7b2c08ce6a 100644
--- a/nemo/backends/pytorch/common/zero_data.py
+++ b/nemo/backends/pytorch/common/zero_data.py
@@ -1,13 +1,11 @@
 import torch
 from torch.utils.data import Dataset
 
-from ...pytorch.nm import DataLayerNM
 from ....core.neural_types import *
+from ...pytorch.nm import DataLayerNM
 
 
-def neuralType2TensorShape(
-    neural_type: NeuralType, default_dim=32, skip_batch_axis=True
-) -> torch.Size:
+def neuralType2TensorShape(neural_type: NeuralType, default_dim=32, skip_batch_axis=True) -> torch.Size:
     """
     Converts Neural Type to torch tensor shape.
 
     Args:
@@ -69,8 +67,7 @@ class ZerosDataLayer(DataLayerNM):
             Defaults to None.
     """
 
-    def __init__(self, *, size, output_ports, dtype, batch_size, shapes=None,
-                 **kwargs):
+    def __init__(self, *, size, output_ports, dtype, batch_size, shapes=None, **kwargs):
         DataLayerNM.__init__(self, **kwargs)
         self._size = size
         self._output_ports = output_ports
@@ -78,18 +75,13 @@ def __init__(self, *, size, output_ports, dtype, batch_size, shapes=None,
         self._batch_size = batch_size
         self._shapes = shapes
         if self._shapes is None:
-            self._shapes = [
-                neuralType2TensorShape(pval)
-                for pname, pval in self._output_ports.items()
-            ]
+            self._shapes = [neuralType2TensorShape(pval) for pname, pval in self._output_ports.items()]
 
-        self._dataset = _ZeroDS(size=self._size, shapes=self._shapes,
-                                dtype=self._type)
+        self._dataset = _ZeroDS(size=self._size, shapes=self._shapes, dtype=self._type)
 
     @property
     def input_ports(self):
-        return {
-        }
+        return {}
 
     @property
     def output_ports(self):
diff --git a/nemo/backends/pytorch/module_wrapper.py b/nemo/backends/pytorch/module_wrapper.py
index 09a55ef2e339..c233f77ee01e 100644
--- a/nemo/backends/pytorch/module_wrapper.py
+++ b/nemo/backends/pytorch/module_wrapper.py
@@ -2,7 +2,7 @@
 import torch as t
 import torch.nn as nn
 
-from ...core import NeuralModule, DeviceType
+from ...core import DeviceType, NeuralModule
 from ...utils.helpers import rgetattr, rsetattr
 
 
@@ -10,16 +10,12 @@ class TrainableNeuralModuleWrapper(NeuralModule, nn.Module):
     """This class wraps an instance of Pytorch's nn.Module and
     returns NeuralModule's instance."""
 
-    def __init__(self, pt_nn_module, input_ports_dict, output_ports_dict,
-                 **kwargs):
+    def __init__(self, pt_nn_module, input_ports_dict, output_ports_dict, **kwargs):
         NeuralModule.__init__(self, **kwargs)
         nn.Module.__init__(self)
         self._input_ports = input_ports_dict
         self._output_ports = output_ports_dict
-        self._device = t.device(
-            "cuda" if self.placement in [DeviceType.GPU, DeviceType.AllGpu]
-            else "cpu"
-        )
+        self._device = t.device("cuda" if self.placement in [DeviceType.GPU, DeviceType.AllGpu] else "cpu")
         self._pt_module = pt_nn_module
         self._pt_module.to(self._device)
 
@@ -86,9 +82,7 @@ def get_weights(self):
         return result
 
     def set_weights(self, name2weight, name2name_and_transform=None):
-        self._pt_module.load_state_dict(
-            {key: name2weight[key][0] for key in name2weight.keys()}
-        )
+        self._pt_module.load_state_dict({key: name2weight[key][0] for key in name2weight.keys()})
 
     def tie_weights_with(self, module, weight_names):
         for name in weight_names:
@@ -96,5 +90,4 @@ def tie_weights_with(self, module, weight_names):
 
     @property
     def num_weights(self):
-        return sum(
-            p.numel() for p in self._pt_module.parameters() if p.requires_grad)
+        return sum(p.numel() for p in self._pt_module.parameters() if p.requires_grad)
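`rgetattr` and `rsetattr`, imported from `nemo.utils.helpers` above, resolve dotted attribute paths when tying weights. A common recipe with the assumed semantics (a sketch, not necessarily NeMo's exact implementation):

import functools

def rgetattr(obj, attr):
    # getattr that follows dotted paths, e.g. rgetattr(m, "out.weight")
    return functools.reduce(getattr, attr.split('.'), obj)

def rsetattr(obj, attr, value):
    # setattr on the final segment of a dotted path
    head, _, tail = attr.rpartition('.')
    setattr(rgetattr(obj, head) if head else obj, tail, value)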
diff --git a/nemo/backends/pytorch/nm.py b/nemo/backends/pytorch/nm.py
index 3328f0be486c..2cd70c3695d5 100644
--- a/nemo/backends/pytorch/nm.py
+++ b/nemo/backends/pytorch/nm.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2019 NVIDIA Corporation
 from abc import abstractmethod
-from typing import Dict, Set, Tuple, Optional, List
+from typing import Dict, List, Optional, Set, Tuple
 
 import torch as t
 import torch.nn as nn
 
-from ...core import NeuralModule, DeviceType, WeightShareTransform
-from ...utils.helpers import rgetattr, rsetattr, get_cuda_device
+from ...core import DeviceType, NeuralModule, WeightShareTransform
+from ...utils.helpers import get_cuda_device, rgetattr, rsetattr
 
 
 class TrainableNM(NeuralModule, nn.Module):
@@ -51,17 +51,12 @@ def get_weights(self):
     def set_weights(self, name2weight, name2name_and_transform=None):
         if name2weight is not None and len(name2weight) > 0:
             if name2name_and_transform is None:
-                self.load_state_dict(
-                    {key: name2weight[key][0] for key in name2weight.keys()}
-                )
+                self.load_state_dict({key: name2weight[key][0] for key in name2weight.keys()})
             else:
-                self.load_state_dict(
-                    {key: name2weight[key][0] for key in name2weight.keys()}
-                )
+                self.load_state_dict({key: name2weight[key][0] for key in name2weight.keys()})
 
     @t.jit.ignore
-    def tie_weights_with(self, module, weight_names,
-                         name2name_and_transform=None):
+    def tie_weights_with(self, module, weight_names, name2name_and_transform=None):
         if module is None:
             raise ValueError("Module to tie weights can't be None")
         if weight_names is None or len(weight_names) == 0:
@@ -73,23 +68,12 @@ def tie_weights_with(self, module, weight_names,
         else:
             for self_w_name in weight_names:
                 if self_w_name in name2name_and_transform:
-                    if (
-                        name2name_and_transform[self_w_name][1]
-                        == WeightShareTransform.SAME
-                    ):
+                    if name2name_and_transform[self_w_name][1] == WeightShareTransform.SAME:
                         rsetattr(
-                            self,
-                            self_w_name,
-                            rgetattr(module,
-                                     name2name_and_transform[self_w_name][0]),
-                        )
-                    elif (
-                        name2name_and_transform[self_w_name][1]
-                        == WeightShareTransform.TRANSPOSE
-                    ):
-                        raise NotImplementedError(
-                            "Sorry, currently this is not implemented."
+                            self, self_w_name, rgetattr(module, name2name_and_transform[self_w_name][0]),
                         )
+                    elif name2name_and_transform[self_w_name][1] == WeightShareTransform.TRANSPOSE:
+                        raise NotImplementedError("Sorry, currently this is not implemented.")
                 else:
                     rsetattr(self, self_w_name, rgetattr(module, self_w_name))
 
@@ -157,14 +141,19 @@ def forward(self, *input):
     def get_weights(self) -> Optional[Dict[(str, bool)]]:
         None
 
-    def set_weights(self, name2weight: Dict[(str, Tuple[str, bool])],
-                    name2name_and_transform: Dict[
-                        (str, Tuple[str, WeightShareTransform])] = None):
+    def set_weights(
+        self,
+        name2weight: Dict[(str, Tuple[str, bool])],
+        name2name_and_transform: Dict[(str, Tuple[str, WeightShareTransform])] = None,
+    ):
         pass
 
-    def tie_weights_with(self, module, weight_names=List[str],
-                         name2name_and_transform: Dict[
-                             (str, Tuple[str, WeightShareTransform])] = None):
+    def tie_weights_with(
+        self,
+        module,
+        weight_names=List[str],
+        name2name_and_transform: Dict[(str, Tuple[str, WeightShareTransform])] = None,
+    ):
         pass
 
     def save_to(self, path: str):
@@ -214,8 +203,7 @@ def get_weights(self):
         #     )
         return None
 
-    def set_weights(self, name2weight: Dict[(str, bool)],
-                    name2name_and_transform):
+    def set_weights(self, name2weight: Dict[(str, bool)], name2name_and_transform):
         # nemo.logging.warning(
         #     "Data Layer does not have any weights to set. "
         #     "This set_weights call is ignored."
@@ -237,9 +225,7 @@ def save_to(self, path):
         return None
 
     def restore_from(self, path):
-        raise NotImplementedError(
-            "Data Layer could not be restored from any saved "
-            "state."
-        )
+        raise NotImplementedError("Data Layer could not be restored from any saved " "state.")
         return None
 
     def freeze(self, weights: Set[str] = None):
@@ -299,8 +285,7 @@ def get_weights(self):
         #                 "to return. This get_weights call returns None.")
         return None
 
-    def set_weights(self, name2weight: Dict[(str, bool)],
-                    name2name_and_transform):
+    def set_weights(self, name2weight: Dict[(str, bool)], name2name_and_transform):
         # nemo.logging.warning(
         #     "Loss function module does not have any weights to set. "
         #     "This set_weights call is ignored."
@@ -322,10 +307,7 @@ def save_to(self, path):
         return None
 
     def restore_from(self, path):
-        raise NotImplementedError(
-            "Loss function module could not be restored from " "any saved "
-            "state."
-        )
+        raise NotImplementedError("Loss function module could not be restored from " "any saved " "state.")
         return None
 
     def freeze(self, weights: Set[str] = None):
diff --git a/nemo/backends/pytorch/optimizers.py b/nemo/backends/pytorch/optimizers.py
index 9196bc810229..a9977b4ae365 100644
--- a/nemo/backends/pytorch/optimizers.py
+++ b/nemo/backends/pytorch/optimizers.py
@@ -43,20 +43,10 @@ class AdamW(Optimizer):
     """
 
     def __init__(
-        self,
-        params,
-        lr=1e-3,
-        betas=(0.9, 0.999),
-        eps=1e-8,
-        weight_decay=0,
-        amsgrad=False,
+        self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False,
     ):
         _check_valid_opt_params(lr, eps, betas)
-        defaults = dict(lr=lr,
-                        betas=betas,
-                        eps=eps,
-                        weight_decay=weight_decay,
-                        amsgrad=amsgrad)
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad,)
         super(AdamW, self).__init__(params, defaults)
 
     def __setstate__(self, state):
@@ -81,10 +71,7 @@ def step(self, closure=None):
                     continue
                 grad = p.grad.data
                 if grad.is_sparse:
-                    raise RuntimeError(
-                        "Adam does not support sparse gradients, please "
-                        "consider SparseAdam instead"
-                    )
+                    raise RuntimeError("Adam does not support sparse gradients, please " "consider SparseAdam instead")
                 amsgrad = group["amsgrad"]
 
                 state = self.state[p]
@@ -122,15 +109,11 @@ def step(self, closure=None):
                 bias_correction1 = 1 - beta1 ** state["step"]
                 bias_correction2 = 1 - beta2 ** state["step"]
-                step_size = group["lr"] * math.sqrt(
-                    bias_correction2) / bias_correction1
+                step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1
 
                 # p.data.addcdiv_(-step_size, exp_avg, denom)
                 p.data.add_(
-                    -step_size,
-                    torch.mul(p.data, group["weight_decay"]).addcdiv_(
-                        1, exp_avg, denom
-                    ),
+                    -step_size, torch.mul(p.data, group["weight_decay"]).addcdiv_(1, exp_avg, denom),
                 )
 
         return loss
@@ -156,25 +139,22 @@ class Novograd(Optimizer):
     """
 
     def __init__(
-            self,
-            params,
-            lr=1e-3,
-            betas=(0.95, 0.98),
-            eps=1e-8,
-            weight_decay=0,
-            grad_averaging=False,
-            amsgrad=False,
-            luc=False,
-            luc_trust=1e-3,
-            luc_eps=1e-8,
+        self,
+        params,
+        lr=1e-3,
+        betas=(0.95, 0.98),
+        eps=1e-8,
+        weight_decay=0,
+        grad_averaging=False,
+        amsgrad=False,
+        luc=False,
+        luc_trust=1e-3,
+        luc_eps=1e-8,
     ):
         _check_valid_opt_params(lr, eps, betas)
-        defaults = dict(lr=lr,
-                        betas=betas,
-                        eps=eps,
-                        weight_decay=weight_decay,
-                        grad_averaging=grad_averaging,
-                        amsgrad=amsgrad)
+        defaults = dict(
+            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging, amsgrad=amsgrad,
+        )
         self.luc = luc
         self.luc_trust = luc_trust
         self.luc_eps = luc_eps
@@ -212,13 +192,10 @@ def step(self, closure=None):
                     # Exponential moving average of gradient values
                     state["exp_avg"] = torch.zeros_like(p.data)
                     # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros([]).to(
-                        state["exp_avg"].device)
+                    state["exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device)
                     if amsgrad:
                         # Maintains max of all exp moving avg of squared grad
-                        state["max_exp_avg_sq"] = torch.zeros([]).to(
-                            state["exp_avg"].device
-                        )
+                        state["max_exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device)
 
                 exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                 if amsgrad:
@@ -253,8 +230,7 @@ def step(self, closure=None):
                     # Clip update so that updates are less than eta*weights
                     data_norm = torch.norm(p.data)
                     grad_norm = torch.norm(exp_avg.data)
-                    luc_factor = self.luc_trust * data_norm / (
-                        grad_norm + self.luc_eps)
+                    luc_factor = self.luc_trust * data_norm / (grad_norm + self.luc_eps)
                     luc_factor = min(luc_factor, group["lr"])
                     p.data.add_(-luc_factor, exp_avg)
                 else:
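The LUC branch in `Novograd.step` above caps each update at a trust ratio of parameter norm to update norm, never exceeding the learning rate. A small numeric sketch of that clipping rule (values invented for illustration):

import torch

def luc_scale(param, update, luc_trust=1e-3, lr=1e-2, eps=1e-8):
    # Scale factor: at most luc_trust * ||w|| / ||u||, and never above lr.
    factor = luc_trust * torch.norm(param) / (torch.norm(update) + eps)
    return min(float(factor), lr)

w = torch.ones(100)          # ||w|| = 10
u = torch.full((100,), 0.5)  # ||u|| = 5
print(luc_scale(w, u))       # 1e-3 * 10 / 5 = 0.002, below lr, so 0.002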
diff --git a/nemo/backends/pytorch/torchvision/data/image_folder.py b/nemo/backends/pytorch/torchvision/data/image_folder.py
index f58680b929d8..8f762e6bfbbc 100644
--- a/nemo/backends/pytorch/torchvision/data/image_folder.py
+++ b/nemo/backends/pytorch/torchvision/data/image_folder.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2019 NVIDIA Corporation
-from torchvision import transforms, datasets
+from torchvision import datasets, transforms
 
-from ...nm import DataLayerNM
 from .....core import *
+from ...nm import DataLayerNM
 
 
 class ImageFolderDataLayer(DataLayerNM):
@@ -38,8 +38,7 @@ def output_ports(self):
             "label": NeuralType({0: AxisType(BatchTag)}),
         }
 
-    def __init__(self, *, input_size=32, batch_size, path, shuffle=True,
-                 is_eval=False, **kwargs):
+    def __init__(self, *, input_size=32, batch_size, path, shuffle=True, is_eval=False, **kwargs):
         DataLayerNM.__init__(self, **kwargs)
 
         self._input_size = input_size
@@ -54,8 +53,7 @@ def __init__(self, *, input_size=32, batch_size, path, shuffle=True,
                     transforms.RandomResizedCrop(self._input_size),
                     transforms.RandomHorizontalFlip(),
                     transforms.ToTensor(),
-                    transforms.Normalize([0.485, 0.456, 0.406],
-                                         [0.229, 0.224, 0.225]),
+                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
                 ]
             )
 
@@ -66,8 +64,7 @@ def __init__(self, *, input_size=32, batch_size, path, shuffle=True,
                     transforms.Resize(256),
                     transforms.CenterCrop(self._input_size),
                     transforms.ToTensor(),
-                    transforms.Normalize([0.485, 0.456, 0.406],
-                                         [0.229, 0.224, 0.225]),
+                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
                 ]
             )
diff --git a/nemo/backends/pytorch/tutorials/chatbot/data.py b/nemo/backends/pytorch/tutorials/chatbot/data.py
index d27eb8b50352..a4ea9124e4cb 100644
--- a/nemo/backends/pytorch/tutorials/chatbot/data.py
+++ b/nemo/backends/pytorch/tutorials/chatbot/data.py
@@ -19,8 +19,11 @@ def __init__(self, name):
         self.trimmed = False
         self.word2index = {}
         self.word2count = {}
-        self.index2word = {PAD_token: "PAD", SOS_token: "SOS",
-                           EOS_token: "EOS"}
+        self.index2word = {
+            PAD_token: "PAD",
+            SOS_token: "SOS",
+            EOS_token: "EOS",
+        }
         self.num_words = 3  # Count SOS, EOS, PAD
 
     def addSentence(self, sentence):
@@ -50,17 +53,18 @@ def trim(self, min_count):
 
         print(
             "keep_words {} / {} = {:.4f}".format(
-                len(keep_words),
-                len(self.word2index),
-                len(keep_words) / len(self.word2index),
+                len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index),
             )
         )
 
         # Reinitialize dictionaries
         self.word2index = {}
         self.word2count = {}
-        self.index2word = {PAD_token: "PAD", SOS_token: "SOS",
-                           EOS_token: "EOS"}
+        self.index2word = {
+            PAD_token: "PAD",
+            SOS_token: "SOS",
+            EOS_token: "EOS",
+        }
         self.num_words = 3  # Count default tokens
 
         for word in keep_words:
@@ -75,10 +79,7 @@ def trim(self, min_count):
 
 
 def unicodeToAscii(s):
-    return "".join(
-        c for c in unicodedata.normalize("NFD", s) if
-        unicodedata.category(c) != "Mn"
-    )
+    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
 
 
 # Lowercase, trim, and remove non-letter characters
@@ -111,8 +112,7 @@ def readVocs(datafile, corpus_name):
 
 def filterPair(p):
     # Input sequences need to preserve the last word for EOS token
-    return len(p[0].split(" ")) < MAX_LENGTH and len(
-        p[1].split(" ")) < MAX_LENGTH
+    return len(p[0].split(" ")) < MAX_LENGTH and len(p[1].split(" ")) < MAX_LENGTH
 
 
 # Filter pairs using filterPair condition
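As a quick reference for `unicodeToAscii` above: NFD decomposition splits accented characters so the combining marks (Unicode category "Mn") can be dropped:

import unicodedata

def unicode_to_ascii(s):
    # Decompose accented characters, then drop the combining marks ("Mn").
    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")

print(unicode_to_ascii("Crème brûlée!"))  # -> "Creme brulee!"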
diff --git a/nemo/backends/pytorch/tutorials/chatbot/modules.py b/nemo/backends/pytorch/tutorials/chatbot/modules.py
index 3d8965356bdc..de98c5799edb 100644
--- a/nemo/backends/pytorch/tutorials/chatbot/modules.py
+++ b/nemo/backends/pytorch/tutorials/chatbot/modules.py
@@ -2,16 +2,16 @@
 https://pytorch.org/tutorials/beginner/chatbot_tutorial.html
 """
 import random
-from typing import Mapping, Iterable, Optional
+from typing import Iterable, Mapping, Optional
 
 import torch as t
 import torch.nn as nn
 import torch.nn.functional as F
 
-from ..chatbot import data
-from ...nm import TrainableNM, DataLayerNM, LossNM
 from .....core import DeviceType
 from .....core.neural_types import *
+from ...nm import DataLayerNM, LossNM, TrainableNM
+from ..chatbot import data
 
 
 class DialogDataLayer(DataLayerNM):
@@ -50,14 +50,7 @@ def output_ports(self):
             "max_tgt_lengths": NeuralType(None),
         }
 
-    def __init__(
-        self, *,
-        batch_size,
-        corpus_name,
-        datafile,
-        min_count=3,
-        **kwargs
-    ):
+    def __init__(self, *, batch_size, corpus_name, datafile, min_count=3, **kwargs):
         DataLayerNM.__init__(self, **kwargs)
 
         self._batch_size = batch_size
@@ -68,8 +61,7 @@ def __init__(
         self.voc = voc
         self.pairs = data.trimRareWords(voc, pairs, self._min_count)
 
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
         self._dataloader = []
         for i in range(self.__len__()):
             self._dataloader.append(self.__getitem__(i))
@@ -80,10 +72,7 @@ def __len__(self):
     def __getitem__(self, idx):
         return [
             x.to(self._device) if isinstance(x, t.Tensor) else x
-            for x in data.batch2TrainData(
-                self.voc,
-                [random.choice(self.pairs) for _ in range(self._batch_size)]
-            )
+            for x in data.batch2TrainData(self.voc, [random.choice(self.pairs) for _ in range(self._batch_size)],)
         ]
 
     def get_weights(self) -> Iterable[Optional[Mapping]]:
@@ -115,8 +104,7 @@ def input_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "input_seq": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
+            "input_seq": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
             "input_lengths": NeuralType({0: AxisType(BatchTag)}),
         }
 
@@ -137,23 +125,11 @@ def output_ports(self):
             1: AxisType(ChannelTag)
         """
         return {
-            "outputs": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag),
-                 2: AxisType(ChannelTag)}
-            ),
-            "hidden": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "outputs": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag), 2: AxisType(ChannelTag),}),
+            "hidden": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
         }
 
-    def __init__(
-        self, *,
-        voc_size,
-        encoder_n_layers,
-        hidden_size,
-        dropout,
-        bidirectional=True,
-        **kwargs
-    ):
+    def __init__(self, *, voc_size, encoder_n_layers, hidden_size, dropout, bidirectional=True, **kwargs):
         TrainableNM.__init__(self, **kwargs)
 
         self.voc_size = voc_size
@@ -176,8 +152,7 @@ def __init__(
             dropout=(0 if self.n_layers == 1 else self.dropout),
             bidirectional=self.bidirectional,
         )
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
         self.to(self._device)
 
     def forward(self, input_seq, input_lengths, hidden=None):
@@ -190,14 +165,12 @@ def forward(self, input_seq, input_lengths, hidden=None):
         # Unpack padding
         outputs, _ = t.nn.utils.rnn.pad_packed_sequence(outputs)
         # Sum bidirectional GRU outputs
-        outputs = outputs[:, :, : self.hidden_size] + \
-            outputs[:, :, self.hidden_size:]
+        outputs = outputs[:, :, : self.hidden_size] + outputs[:, :, self.hidden_size :]
         # Return output and final hidden state
         return outputs, hidden
 
 
 class LuongAttnDecoderRNN(TrainableNM):
-
     @property
     def input_ports(self):
         """Returns definitions of module input ports.
@@ -218,12 +191,8 @@ def input_ports(self):
             None
         """
         return {
-            "targets": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
-            "encoder_outputs": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag),
-                 2: AxisType(ChannelTag)}
-            ),
+            "targets": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
+            "encoder_outputs": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag), 2: AxisType(ChannelTag),}),
             "max_target_len": NeuralType(None),
         }
 
@@ -244,23 +213,11 @@ def output_ports(self):
             1: AxisType(ChannelTag)
         """
         return {
-            "outputs": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag),
-                 2: AxisType(ChannelTag)}
-            ),
-            "hidden": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "outputs": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag), 2: AxisType(ChannelTag),}),
+            "hidden": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
         }
 
-    def __init__(
-        self, *,
-        attn_model,
-        hidden_size,
-        voc_size,
-        decoder_n_layers,
-        dropout,
-        **kwargs
-    ):
+    def __init__(self, *, attn_model, hidden_size, voc_size, decoder_n_layers, dropout, **kwargs):
         TrainableNM.__init__(self, **kwargs)
 
         self.attn_model = attn_model
@@ -275,10 +232,7 @@ def __init__(
         self.embedding = nn.Embedding(self.voc_size, self.hidden_size)
         self.embedding_dropout = nn.Dropout(self.dropout)
         self.gru = nn.GRU(
-            self.hidden_size,
-            self.hidden_size,
-            self.n_layers,
-            dropout=(0 if self.n_layers == 1 else self.dropout),
+            self.hidden_size, self.hidden_size, self.n_layers, dropout=(0 if self.n_layers == 1 else self.dropout),
         )
         self.concat = nn.Linear(self.hidden_size * 2, self.hidden_size)
         self.out = nn.Linear(self.hidden_size, self.output_size)
@@ -289,9 +243,7 @@ def __init__(self, method, hidden_size):
                 super(Attn, self).__init__()
                 self.method = method
                 if self.method not in ["dot", "general", "concat"]:
-                    raise ValueError(
-                        self.method, "is not an appropriate attention method."
-                    )
+                    raise ValueError(self.method, "is not an appropriate attention method.")
                 self.hidden_size = hidden_size
                 if self.method == "general":
                     self.attn = t.nn.Linear(self.hidden_size, hidden_size)
@@ -307,13 +259,7 @@ def general_score(self, hidden, encoder_output):
                 return t.sum(hidden * energy, dim=2)
 
             def concat_score(self, hidden, encoder_output):
-                energy = self.attn(
-                    t.cat(
-                        (hidden.expand(encoder_output.size(0), -1, -1),
-                         encoder_output),
-                        2,
-                    )
-                ).tanh()
+                energy = self.attn(t.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output,), 2,)).tanh()
                 return t.sum(self.v * energy, dim=2)
 
             def forward(self, hidden, encoder_outputs):
@@ -335,8 +281,7 @@ def forward(self, hidden, encoder_outputs):
 
         self.attn = Attn(self.attn_model, self.hidden_size)
 
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
         self.to(self._device)
 
     def one_step_forward(self, embedded, last_hidden, encoder_outputs):
@@ -362,18 +307,14 @@ def one_step_forward(self, embedded, last_hidden, encoder_outputs):
 
     def forward(self, targets, encoder_outputs, max_target_len):
         SOS_token = 1  # Start-of-sentence token
-        decoder_input = t.LongTensor(
-            [[SOS_token for _ in range(encoder_outputs.shape[1])]]
-        )
+        decoder_input = t.LongTensor([[SOS_token for _ in range(encoder_outputs.shape[1])]])
         decoder_input = decoder_input.to(self._device)
         decoder_hidden = None
         decoder_output = []
         for step_t in range(max_target_len):
             decoder_inpt_embd = self.embedding(decoder_input)
             decoder_step_output, decoder_hidden = self.one_step_forward(
-                embedded=decoder_inpt_embd,
-                last_hidden=decoder_hidden,
-                encoder_outputs=encoder_outputs,
+                embedded=decoder_inpt_embd, last_hidden=decoder_hidden, encoder_outputs=encoder_outputs,
             )
             decoder_output.append(decoder_step_output)
             # Teacher forcing: next input is current target
@@ -383,7 +324,6 @@ def forward(self, targets, encoder_outputs, max_target_len):
 
 
 class MaskedXEntropyLoss(LossNM):
-
     @property
     def input_ports(self):
         """Returns definitions of module input ports.
@@ -406,12 +346,8 @@ def input_ports(self):
             1: AxisType(BatchTag)
         """
         return {
-            "predictions": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag),
-                 2: AxisType(ChannelTag)}
-            ),
-            "target": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
+            "predictions": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag), 2: AxisType(ChannelTag),}),
+            "target": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
             "mask": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag)}),
         }
 
@@ -422,15 +358,12 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}
 
     def __init__(self, **kwargs):
         LossNM.__init__(self, **kwargs)
 
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
 
     def _loss(self, inp, target, mask):
         inp = inp.view(-1, inp.shape[2])
@@ -445,7 +378,6 @@ def _loss_function(self, **kwargs):
 
 
 class GreedyLuongAttnDecoderRNN(TrainableNM):
-
     @property
     def input_ports(self):
         """Returns definitions of module input ports.
@@ -457,12 +389,7 @@ def input_ports(self):
 
             2: AxisType(ChannelTag)
         """
-        return {
-            "encoder_outputs": NeuralType(
-                {0: AxisType(TimeTag), 1: AxisType(BatchTag),
-                 2: AxisType(ChannelTag)}
-            )
-        }
+        return {"encoder_outputs": NeuralType({0: AxisType(TimeTag), 1: AxisType(BatchTag), 2: AxisType(ChannelTag),})}
 
     @property
     def output_ports(self):
@@ -486,20 +413,10 @@ def output_ports(self):
                     # 2: AxisType(ChannelTag)
                 }
             ),
-            "hidden": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "hidden": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
         }
 
-    def __init__(
-        self, *,
-        attn_model,
-        hidden_size,
-        voc_size,
-        decoder_n_layers,
-        dropout,
-        max_dec_steps=10,
-        **kwargs
-    ):
+    def __init__(self, *, attn_model, hidden_size, voc_size, decoder_n_layers, dropout, max_dec_steps=10, **kwargs):
         TrainableNM.__init__(self, **kwargs)
 
         self.attn_model = attn_model
@@ -515,10 +432,7 @@ def __init__(
         self.embedding = nn.Embedding(self.voc_size, self.hidden_size)
         self.embedding_dropout = nn.Dropout(self.dropout)
         self.gru = nn.GRU(
-            self.hidden_size,
-            self.hidden_size,
-            self.n_layers,
-            dropout=(0 if self.n_layers == 1 else self.dropout),
+            self.hidden_size, self.hidden_size, self.n_layers, dropout=(0 if self.n_layers == 1 else self.dropout),
         )
         self.concat = nn.Linear(self.hidden_size * 2, self.hidden_size)
         self.out = nn.Linear(self.hidden_size, self.output_size)
@@ -529,9 +443,7 @@ def __init__(self, method, hidden_size):
                 super(Attn, self).__init__()
                 self.method = method
                 if self.method not in ["dot", "general", "concat"]:
-                    raise ValueError(
-                        self.method, "is not an appropriate attention method."
-                    )
+                    raise ValueError(self.method, "is not an appropriate attention method.")
                 self.hidden_size = hidden_size
                 if self.method == "general":
                     self.attn = t.nn.Linear(self.hidden_size, hidden_size)
@@ -547,13 +459,7 @@ def general_score(self, hidden, encoder_output):
                 return t.sum(hidden * energy, dim=2)
 
             def concat_score(self, hidden, encoder_output):
-                energy = self.attn(
-                    t.cat(
-                        (hidden.expand(encoder_output.size(0), -1, -1),
-                         encoder_output),
-                        2,
-                    )
-                ).tanh()
+                energy = self.attn(t.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output,), 2,)).tanh()
                 return t.sum(self.v * energy, dim=2)
 
             def forward(self, hidden, encoder_outputs):
@@ -575,8 +481,7 @@ def forward(self, hidden, encoder_outputs):
 
         self.attn = Attn(self.attn_model, self.hidden_size)
 
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
         self.to(self._device)
 
     def one_step_forward(self, embedded, last_hidden, encoder_outputs):
@@ -603,9 +508,7 @@ def forward(self, encoder_outputs):
         SOS_token = 1  # Start-of-sentence token
         encoder_outputs = encoder_outputs.detach()
 
-        decoder_input = t.LongTensor(
-            [[SOS_token for _ in range(encoder_outputs.shape[1])]]
-        )
+        decoder_input = t.LongTensor([[SOS_token for _ in range(encoder_outputs.shape[1])]])
         decoder_input = decoder_input.to(self._device)
         decoder_hidden = None
         decoder_output = []
@@ -613,9 +516,7 @@ def forward(self, encoder_outputs):
         for step_t in range(self.max_decoder_steps):
             decoder_inpt_embd = self.embedding(decoder_input)
             decoder_step_output, decoder_hidden = self.one_step_forward(
-                embedded=decoder_inpt_embd,
-                last_hidden=decoder_hidden,
-                encoder_outputs=encoder_outputs,
+                embedded=decoder_inpt_embd, last_hidden=decoder_hidden, encoder_outputs=encoder_outputs,
             )
             decoder_output.append(decoder_step_output)
             # Teacher forcing: next input is current target
@@ -623,8 +524,7 @@ def forward(self, encoder_outputs):
             topi = topi.detach()
             # if topi.item() == EOS_token:
             #     break
-            decoder_input = t.LongTensor(
-                [[topi[i][0] for i in range(topi.shape[0])]])
+            decoder_input = t.LongTensor([[topi[i][0] for i in range(topi.shape[0])]])
             decoder_input = decoder_input.to(self._device)
             # decoder_input = targets[step_t].view(1, -1)
         result_logits = t.stack(decoder_output, dim=0)
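`MaskedXEntropyLoss._loss` above flattens predictions before computing the loss. The masked negative log-likelihood it is built around (following the PyTorch chatbot tutorial this module adapts) looks roughly like this sketch, assuming `probs` holds per-token probabilities:

import torch

def masked_nll(probs, target, mask):
    # probs: (T*B, V) probabilities; target, mask: (T*B,)
    picked = torch.gather(probs, 1, target.view(-1, 1)).squeeze(1)
    cross_entropy = -torch.log(picked)
    return cross_entropy.masked_select(mask.bool()).mean()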
diff --git a/nemo/backends/pytorch/tutorials/toys.py b/nemo/backends/pytorch/tutorials/toys.py
index 3dbd3b2fe689..b2449c5ddfd5 100644
--- a/nemo/backends/pytorch/tutorials/toys.py
+++ b/nemo/backends/pytorch/tutorials/toys.py
@@ -5,9 +5,9 @@
 import torch.nn as nn
 import torch.utils.data as t_utils
 
-from ..nm import TrainableNM, DataLayerNM, LossNM
-from ....core import NeuralModule, DeviceType
+from ....core import DeviceType, NeuralModule
 from ....core.neural_types import *
+from ..nm import DataLayerNM, LossNM, TrainableNM
 
 
 class TaylorNet(TrainableNM):  # Note inheritance from TrainableNM
@@ -20,9 +20,7 @@ def input_ports(self):
         Returns:
             A (dict) of module's input ports names to NeuralTypes mapping
         """
-        return {
-            "x": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})
-        }
+        return {"x": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})}
 
     @property
     def output_ports(self):
@@ -31,10 +29,7 @@ def output_ports(self):
         Returns:
             A (dict) of module's output ports names to NeuralTypes mapping
         """
-        return {
-            "y_pred": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)})
-        }
+        return {"y_pred": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})}
 
     def __init__(self, *, dim, **kwargs):
         # Part specific for Neural Modules API:
@@ -46,8 +41,7 @@ def __init__(self, *, dim, **kwargs):
         self._dim = dim
         self.fc1 = nn.Linear(self._dim, 1)
         t.nn.init.xavier_uniform_(self.fc1.weight)
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
         self.to(self._device)
 
     # IMPORTANT: input arguments to forward must match input input ports' names
@@ -90,13 +84,7 @@ def output_ports(self):
 
             1: AxisType(ChannelTag)
         """
-        return {
-            "y_pred": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag)},
-                optional=True
-            )
-        }
+        return {"y_pred": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}, optional=True)}
 
     def __init__(self, *, dim, **kwargs):
         # Part specific for Neural Modules API:
@@ -108,8 +96,7 @@ def __init__(self, *, dim, **kwargs):
         self._dim = dim
         self.fc1 = nn.Linear(self._dim, 1)
         t.nn.init.xavier_uniform_(self.fc1.weight)
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
         self.to(self._device)
 
     # IMPORTANT: input arguments to forward must match input input ports' names
@@ -167,18 +154,13 @@ def __init__(self, *, n, batch_size, f=t.sin, x_lo=-4, x_hi=4, **kwargs):
         self._n = n
         self._batch_size = batch_size
 
-        self._device = t.device(
-            "cuda" if self.placement == DeviceType.GPU else "cpu")
+        self._device = t.device("cuda" if self.placement == DeviceType.GPU else "cpu")
 
-        x_data = (
-            t.tensor(np.random.uniform(low=x_lo, high=x_hi, size=self._n))
-            .unsqueeze(-1).to(self._device)
-        )
+        x_data = t.tensor(np.random.uniform(low=x_lo, high=x_hi, size=self._n)).unsqueeze(-1).to(self._device)
         y_data = f(x_data)
 
         self._data_iterator = t_utils.DataLoader(
-            t_utils.TensorDataset(x_data.float(), y_data.float()),
-            batch_size=self._batch_size,
+            t_utils.TensorDataset(x_data.float(), y_data.float()), batch_size=self._batch_size,
         )
 
     @property
@@ -191,7 +173,6 @@ def dataset(self):
 
 
 class MSELoss(LossNM):
-
     @property
     def input_ports(self):
         """Returns definitions of module input ports.
@@ -207,10 +188,8 @@ def input_ports(self):
             1: AxisType(ChannelTag)
         """
         return {
-            "predictions": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
-            "target": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "predictions": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "target": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
         }
 
     @property
@@ -220,9 +199,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}
 
     def __init__(self, **kwargs):
         LossNM.__init__(self, **kwargs)
@@ -233,7 +210,6 @@ def _loss_function(self, **kwargs):
 
 
 class L1Loss(LossNM):
-
     @property
     def input_ports(self):
         """Returns definitions of module input ports.
@@ -249,10 +225,8 @@ def input_ports(self):
             1: AxisType(ChannelTag)
         """
         return {
-            "predictions": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
-            "target": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "predictions": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "target": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
         }
 
     @property
@@ -262,9 +236,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}
 
     def __init__(self, **kwargs):
         LossNM.__init__(self, **kwargs)
@@ -275,7 +247,6 @@ def _loss_function(self, **kwargs):
 
 
 class CrossEntropyLoss(LossNM):
-
     @property
     def input_ports(self):
         """Returns definitions of module input ports.
@@ -289,8 +260,7 @@ def input_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "predictions": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+            "predictions": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
             "labels": NeuralType({0: AxisType(BatchTag)}),
         }
 
@@ -301,9 +271,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}
 
     def __init__(self, **kwargs):
         # Neural Module API specific
@@ -342,13 +310,9 @@ def input_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "belief_predictions": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}
-            ),
+            "belief_predictions": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
             "belief_labels": NeuralType({0: AxisType(BatchTag)}),
-            "affinity_predictions": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(ChannelTag)}
-            ),
+            "affinity_predictions": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
             "affinity_labels": NeuralType({0: AxisType(BatchTag)}),
         }
 
@@ -359,9 +323,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}
 
     def __init__(self, **kwargs):
         # Neural Module API specific
@@ -374,19 +336,13 @@ def _loss_function(self, **kwargs):
         # Belief maps loss
         # output, each belief map layers.
         for l in kwargs["belief_predictions"]:
-            loss_tmp = (
-                (l - kwargs["belief_labels"]) * (
-                    l - kwargs["belief_labels"])
-            ).mean()
+            loss_tmp = ((l - kwargs["belief_labels"]) * (l - kwargs["belief_labels"])).mean()
             loss += loss_tmp
 
         # Affinities loss
         # output, each belief map layers.
         for l in kwargs["affinity_predictions"]:
-            loss_tmp = (
-                (l - kwargs["affinity_labels"]) * (
-                    l - kwargs["affinity_labels"])
-            ).mean()
+            loss_tmp = ((l - kwargs["affinity_labels"]) * (l - kwargs["affinity_labels"])).mean()
             loss += loss_tmp
 
         return loss
diff --git a/nemo/collections/asr/__init__.py b/nemo/collections/asr/__init__.py
index e370d7fffff9..31e59311d986 100644
--- a/nemo/collections/asr/__init__.py
+++ b/nemo/collections/asr/__init__.py
@@ -16,30 +16,31 @@
 
 from .audio_preprocessing import *
 from .beam_search_decoder import BeamSearchDecoderWithLM
-from .data_layer import (
-    AudioToTextDataLayer, KaldiFeatureDataLayer, TranscriptDataLayer)
+from .data_layer import AudioToTextDataLayer, KaldiFeatureDataLayer, TranscriptDataLayer
 from .greedy_ctc_decoder import GreedyCTCDecoder
-from .jasper import JasperEncoder, JasperDecoderForCTC
+from .jasper import JasperDecoderForCTC, JasperEncoder
 from .las.misc import JasperRNNConnector
 from .losses import CTCLossNM
 
-__all__ = ['Backend',
-           'AudioToTextDataLayer',
-           'AudioPreprocessing',
-           'AudioPreprocessor',
-           'AudioToMFCCPreprocessor',
-           'AudioToMelSpectrogramPreprocessor',
-           'AudioToSpectrogramPreprocessor',
-           'MultiplyBatch',
-           'SpectrogramAugmentation',
-           'KaldiFeatureDataLayer',
-           'TranscriptDataLayer',
-           'GreedyCTCDecoder',
-           'BeamSearchDecoderWithLM',
-           'JasperEncoder',
-           'JasperDecoderForCTC',
-           'JasperRNNConnector',
-           'CTCLossNM']
+__all__ = [
+    'Backend',
+    'AudioToTextDataLayer',
+    'AudioPreprocessing',
+    'AudioPreprocessor',
+    'AudioToMFCCPreprocessor',
+    'AudioToMelSpectrogramPreprocessor',
+    'AudioToSpectrogramPreprocessor',
+    'MultiplyBatch',
+    'SpectrogramAugmentation',
+    'KaldiFeatureDataLayer',
+    'TranscriptDataLayer',
+    'GreedyCTCDecoder',
+    'BeamSearchDecoderWithLM',
+    'JasperEncoder',
+    'JasperDecoderForCTC',
+    'JasperRNNConnector',
+    'CTCLossNM',
+]
 
 backend = Backend.PyTorch
diff --git a/nemo/collections/asr/audio_preprocessing.py b/nemo/collections/asr/audio_preprocessing.py
index 650d7d651ad1..17beab78a8ae 100644
--- a/nemo/collections/asr/audio_preprocessing.py
+++ b/nemo/collections/asr/audio_preprocessing.py
@@ -14,21 +14,32 @@
 """
 This file contains neural modules responsible for preprocessing audio data.
 """
-__all__ = ['AudioPreprocessing',
-           'AudioPreprocessor',
-           'AudioToMFCCPreprocessor',
-           'AudioToMelSpectrogramPreprocessor',
-           'AudioToSpectrogramPreprocessor',
-           'MultiplyBatch',
-           'SpectrogramAugmentation']
+__all__ = [
+    'AudioPreprocessing',
+    'AudioPreprocessor',
+    'AudioToMFCCPreprocessor',
+    'AudioToMelSpectrogramPreprocessor',
+    'AudioToSpectrogramPreprocessor',
+    'MultiplyBatch',
+    'SpectrogramAugmentation',
+]
 
-from abc import abstractmethod
 import math
 import warnings
+from abc import abstractmethod
 
 import torch
 
+from nemo.backends.pytorch import NonTrainableNM
+from nemo.core import Optimization
+from nemo.core.neural_types import *
+
+from .parts.features import FilterbankFeatures
+from .parts.spectr_augment import SpecAugment, SpecCutout
+
 try:
     import torchaudio
+
     HAVE_TORCHAUDIO = True
 except ModuleNotFoundError:
     HAVE_TORCHAUDIO = False
@@ -36,15 +47,7 @@
 try:
     from apex import amp
 except (AttributeError, ModuleNotFoundError) as e:
-    warnings.warn(
-        "Unable to import APEX. Mixed precision and distributed training "
-        "will not work.")
-
-from nemo.backends.pytorch import NonTrainableNM
-from nemo.core import Optimization
-from nemo.core.neural_types import *
-from .parts.features import FilterbankFeatures
-from .parts.spectr_augment import SpecAugment, SpecCutout
+    warnings.warn("Unable to import APEX. Mixed precision and distributed training " "will not work.")
 
 
 class AudioPreprocessor(NonTrainableNM):
@@ -59,7 +62,7 @@ def __init__(self, win_length, hop_length, **kwargs):
         self.win_length = win_length
         self.hop_length = hop_length
 
-        self.disable_casts = (self._opt_level == Optimization.mxprO1)
+        self.disable_casts = self._opt_level == Optimization.mxprO1
 
         self.torch_windows = {
             'hann': torch.hann_window,
@@ -67,15 +70,14 @@ def __init__(self, win_length, hop_length, **kwargs):
             'blackman': torch.blackman_window,
             'bartlett': torch.bartlett_window,
             'ones': torch.ones,
-            None: torch.ones
+            None: torch.ones,
         }
 
     @torch.no_grad()
     def forward(self, input_signal, length):
         if self.disable_casts:
             with amp.disable_casts():
-                processed_signal = self.get_features(
-                    input_signal.to(torch.float), length)
+                processed_signal = self.get_features(input_signal.to(torch.float), length)
         else:
             processed_signal = self.get_features(input_signal, length)
 
@@ -130,9 +132,7 @@ def input_ports(self):
 
         """
         return {
-            "input_signal": NeuralType({0: AxisType(BatchTag),
-                                        1: AxisType(TimeTag)}),
-
+            "input_signal": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
             "length": NeuralType({0: AxisType(BatchTag)}),
         }
 
@@ -154,36 +154,37 @@ def output_ports(self):
 
         """
         return {
-            "processed_signal": NeuralType({0: AxisType(BatchTag),
-                                            1: AxisType(SpectrogramSignalTag),
-                                            2: AxisType(ProcessedTimeTag)}),
-
-            "processed_length": NeuralType({0: AxisType(BatchTag)})
+            "processed_signal": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(ProcessedTimeTag),}
+            ),
+            "processed_length": NeuralType({0: AxisType(BatchTag)}),
         }
 
     def __init__(
-            self, *,
-            sample_rate=16000,
-            window_size=0.02,
-            window_stride=0.01,
-            n_window_size=None,
-            n_window_stride=None,
-            n_fft=None,
-            window="hann",
-            normalized=True,
-            **kwargs
+        self,
+        *,
+        sample_rate=16000,
+        window_size=0.02,
+        window_stride=0.01,
+        n_window_size=None,
+        n_window_stride=None,
+        n_fft=None,
+        window="hann",
+        normalized=True,
+        **kwargs,
     ):
         if not HAVE_TORCHAUDIO:
             raise ModuleNotFoundError(
                 "torchaudio is not installed but is necessary for "
                 "AudioToSpectrogramPreprocessor. We recommend you try "
-                "building it from source for the PyTorch version you have.")
+                "building it from source for the PyTorch version you have."
+            )
         if window_size and n_window_size:
-            raise ValueError(f"{self} received both window_size and "
-                             f"n_window_size. Only one should be specified.")
+            raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
         if window_stride and n_window_stride:
-            raise ValueError(f"{self} received both window_stride and "
-                             f"n_window_stride. Only one should be specified.")
+            raise ValueError(
+                f"{self} received both window_stride and " f"n_window_stride. Only one should be specified."
+            )
         if window_size:
             n_window_size = int(window_size * sample_rate)
         if window_stride:
@@ -201,7 +202,8 @@ def __init__(
         if window_fn is None:
             raise ValueError(
                 f"Window argument for AudioProcessor is invalid: {window}."
-                f"For no window function, use 'ones' or None.")
+                f"For no window function, use 'ones' or None."
+            )
 
         # Create featurizer.
         # Calls torch.stft under the hood, and is hard-coded to use center=True
@@ -210,7 +212,7 @@ def __init__(
             win_length=self.win_length,
             hop_length=self.hop_length,
             window_fn=window_fn,
-            normalized=normalized
+            normalized=normalized,
         )
         self.featurizer.to(self._device)
 
@@ -295,9 +297,7 @@ def input_ports(self):
 
         """
         return {
-            "input_signal": NeuralType({0: AxisType(BatchTag),
-                                        1: AxisType(TimeTag)}),
-
+            "input_signal": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
             "length": NeuralType({0: AxisType(BatchTag)}),
         }
 
@@ -320,44 +320,43 @@ def output_ports(self):
         """
         return {
             "processed_signal": NeuralType(
-                {0: AxisType(BatchTag),
-                 1: AxisType(MelSpectrogramSignalTag),
-                 2: AxisType(ProcessedTimeTag)}),
-
-            "processed_length": NeuralType({0: AxisType(BatchTag)})
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(ProcessedTimeTag),}
+            ),
+            "processed_length": NeuralType({0: AxisType(BatchTag)}),
         }
 
     def __init__(
-            self, *,
-            sample_rate=16000,
-            window_size=0.02,
-            window_stride=0.01,
-            n_window_size=None,
-            n_window_stride=None,
-            window="hann",
-            normalize="per_feature",
-            n_fft=None,
-            preemph=0.97,
-            features=64,
-            lowfreq=0,
-            highfreq=None,
-            log=True,
-            log_zero_guard_type="add",
-            log_zero_guard_value=2**-24,
-            dither=1e-5,
-            pad_to=16,
-            frame_splicing=1,
-            stft_conv=False,
-            pad_value=0,
-            mag_power=2.,
-            **kwargs
+        self,
+        *,
+        sample_rate=16000,
+        window_size=0.02,
+        window_stride=0.01,
+        n_window_size=None,
+        n_window_stride=None,
+        window="hann",
+        normalize="per_feature",
+        n_fft=None,
+        preemph=0.97,
+        features=64,
+        lowfreq=0,
+        highfreq=None,
+        log=True,
+        log_zero_guard_type="add",
+        log_zero_guard_value=2 ** -24,
+        dither=1e-5,
+        pad_to=16,
+        frame_splicing=1,
+        stft_conv=False,
+        pad_value=0,
+        mag_power=2.0,
+        **kwargs,
     ):
         if window_size and n_window_size:
-            raise ValueError(f"{self} received both window_size and "
-                             f"n_window_size. Only one should be specified.")
+            raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
         if window_stride and n_window_stride:
-            raise ValueError(f"{self} received both window_stride and "
-                             f"n_window_stride. Only one should be specified.")
+            raise ValueError(
+                f"{self} received both window_stride and " f"n_window_stride. Only one should be specified."
+            )
         if window_size:
             n_window_size = int(window_size * sample_rate)
         if window_stride:
@@ -384,7 +383,7 @@ def __init__(
             frame_splicing=frame_splicing,
             stft_conv=stft_conv,
             pad_value=pad_value,
-            mag_power=mag_power
+            mag_power=mag_power,
         )
         self.featurizer.to(self._device)
 
@@ -450,9 +449,7 @@ def input_ports(self):
 
         """
         return {
-            "input_signal": NeuralType({0: AxisType(BatchTag),
-                                        1: AxisType(TimeTag)}),
-
+            "input_signal": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
             "length": NeuralType({0: AxisType(BatchTag)}),
         }
 
@@ -474,41 +471,43 @@ def output_ports(self):
 
         """
         return {
-            "processed_signal": NeuralType({0: AxisType(BatchTag),
-                                            1: AxisType(MFCCSignalTag),
-                                            2: AxisType(ProcessedTimeTag)}),
-
-            "processed_length": NeuralType({0: AxisType(BatchTag)})
+            "processed_signal": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MFCCSignalTag), 2: AxisType(ProcessedTimeTag),}
+            ),
+            "processed_length": NeuralType({0: AxisType(BatchTag)}),
         }
 
     def __init__(
-            self, *,
-            sample_rate=16000,
-            window_size=0.02,
-            window_stride=0.01,
-            n_window_size=None,
-            n_window_stride=None,
-            window='hann',
-            n_fft=None,
-            lowfreq=0.,
-            highfreq=None,
-            n_mels=64,
-            n_mfcc=64,
-            dct_type=2,
-            norm='ortho',
-            log=True,
-            **kwargs):
+        self,
+        *,
+        sample_rate=16000,
+        window_size=0.02,
+        window_stride=0.01,
+        n_window_size=None,
+        n_window_stride=None,
+        window='hann',
+        n_fft=None,
+        lowfreq=0.0,
+        highfreq=None,
+        n_mels=64,
+        n_mfcc=64,
+        dct_type=2,
+        norm='ortho',
+        log=True,
+        **kwargs,
+    ):
         if not HAVE_TORCHAUDIO:
             raise ModuleNotFoundError(
                 "torchaudio is not installed but is necessary for "
                 "AudioToMFCCPreprocessor. We recommend you try "
-                "building it from source for the PyTorch version you have.")
+                "building it from source for the PyTorch version you have."
+            )
         if window_size and n_window_size:
-            raise ValueError(f"{self} received both window_size and "
-                             f"n_window_size. Only one should be specified.")
+            raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.")
         if window_stride and n_window_stride:
-            raise ValueError(f"{self} received both window_stride and "
-                             f"n_window_stride. Only one should be specified.")
+            raise ValueError(
+                f"{self} received both window_stride and " f"n_window_stride. Only one should be specified."
+            )
         # Get win_length (n_window_size) and hop_length (n_window_stride)
         if window_size:
             n_window_size = int(window_size * sample_rate)
@@ -533,17 +532,13 @@ def __init__(
         if window_fn is None:
             raise ValueError(
                 f"Window argument for AudioProcessor is invalid: {window}."
-                f"For no window function, use 'ones' or None.")
+                f"For no window function, use 'ones' or None."
+            )
         mel_kwargs['window_fn'] = window_fn
 
         # Use torchaudio's implementation of MFCCs as featurizer
         self.featurizer = torchaudio.transforms.MFCC(
-            sample_rate=sample_rate,
-            n_mfcc=n_mfcc,
-            dct_type=dct_type,
-            norm=norm,
-            log_mels=log,
-            melkwargs=mel_kwargs
+            sample_rate=sample_rate, n_mfcc=n_mfcc, dct_type=dct_type, norm=norm, log_mels=log, melkwargs=mel_kwargs,
         )
         self.featurizer.to(self._device)
 
@@ -597,9 +592,7 @@ def input_ports(self):
 
         """
         return {
-            "input_spec": NeuralType({0: AxisType(BatchTag),
-                                      1: AxisType(SpectrogramSignalTag),
-                                      2: AxisType(TimeTag)})
+            "input_spec": NeuralType({0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(TimeTag),})
         }
 
     @property
@@ -616,43 +609,35 @@ def output_ports(self):
 
         """
         return {
-            "augmented_spec": NeuralType({0: AxisType(BatchTag),
-                                          1: AxisType(SpectrogramSignalTag),
-                                          2: AxisType(ProcessedTimeTag)})
+            "augmented_spec": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(ProcessedTimeTag),}
+            )
         }
 
     def __init__(
-            self, *,
-            freq_masks=0,
-            time_masks=0,
-            freq_width=10,
-            time_width=10,
-            rect_masks=0,
-            rect_time=5,
-            rect_freq=20,
-            rng=None,
-            **kwargs
+        self,
+        *,
+        freq_masks=0,
+        time_masks=0,
+        freq_width=10,
+        time_width=10,
+        rect_masks=0,
+        rect_time=5,
+        rect_freq=20,
+        rng=None,
+        **kwargs,
     ):
         NonTrainableNM.__init__(self, **kwargs)
 
         if rect_masks > 0:
-            self.spec_cutout = SpecCutout(
-                rect_masks=rect_masks,
-                rect_time=rect_time,
-                rect_freq=rect_freq,
-                rng=rng
-            )
+            self.spec_cutout = SpecCutout(rect_masks=rect_masks, rect_time=rect_time, rect_freq=rect_freq, rng=rng,)
             self.spec_cutout.to(self._device)
         else:
             self.spec_cutout = lambda x: x
 
         if freq_masks + time_masks > 0:
             self.spec_augment = SpecAugment(
-                freq_masks=freq_masks,
-                time_masks=time_masks,
-                freq_width=freq_width,
-                time_width=time_width,
-                rng=rng
+                freq_masks=freq_masks, time_masks=time_masks, freq_width=freq_width, time_width=time_width, rng=rng,
             )
             self.spec_augment.to(self._device)
         else:
@@ -697,16 +682,10 @@ def input_ports(self):
 
         """
         return {
-            "in_x": NeuralType({0: AxisType(BatchTag),
-                                1: AxisType(SpectrogramSignalTag),
-                                2: AxisType(TimeTag)}),
-
+            "in_x": NeuralType({0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(TimeTag),}),
             "in_x_len": NeuralType({0: AxisType(BatchTag)}),
-
-            "in_y": NeuralType({0: AxisType(BatchTag),
-                                1: AxisType(TimeTag)}),
-
-            "in_y_len": NeuralType({0: AxisType(BatchTag)})
+            "in_y": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "in_y_len": NeuralType({0: AxisType(BatchTag)}),
         }
 
     @property
@@ -733,16 +712,10 @@ def output_ports(self):
 
         """
         return {
-            "out_x": NeuralType({0: AxisType(BatchTag),
-                                 1: AxisType(SpectrogramSignalTag),
-                                 2: AxisType(TimeTag)}),
-
+            "out_x": NeuralType({0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(TimeTag),}),
             "out_x_len": NeuralType({0: AxisType(BatchTag)}),
-
-            "out_y": NeuralType({0: AxisType(BatchTag),
-                                 1: AxisType(TimeTag)}),
-
-            "out_y_len": NeuralType({0: AxisType(BatchTag)})
+            "out_y": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "out_y_len": NeuralType({0: AxisType(BatchTag)}),
         }
 
     def __init__(self, *, mult_batch=1, **kwargs):
@@ -765,4 +738,5 @@ def AudioPreprocessing(*args, **kwargs):
         "AudioToMFCCPreprocessor, AudioToMelSpectrogramPreprocessor, and "
         "AudioToSpectrogramPreprocessor. For most ASR purposes "
         "AudioToMelSpectrogramPreprocessor does the same as the old "
-        "AudioPreprocessing.")
+        "AudioPreprocessing."
+    )
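For context on the `SpecAugment` configuration above: it zeroes random frequency and time stripes of a spectrogram. A minimal sketch of that masking, mirroring the `freq_masks`/`time_masks`/`freq_width`/`time_width` arguments (an illustration, not the `parts.spectr_augment` implementation):

import torch

def spec_augment(spec, freq_masks=2, time_masks=2, freq_width=10, time_width=10):
    # spec: (freq, time); zero out `freq_masks` horizontal and
    # `time_masks` vertical stripes of random position and width.
    f, t = spec.shape
    for _ in range(freq_masks):
        w = torch.randint(0, freq_width + 1, ()).item()
        f0 = torch.randint(0, max(1, f - w), ()).item()
        spec[f0 : f0 + w, :] = 0.0
    for _ in range(time_masks):
        w = torch.randint(0, time_width + 1, ()).item()
        t0 = torch.randint(0, max(1, t - w), ()).item()
        spec[:, t0 : t0 + w] = 0.0
    return spec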
+ ) diff --git a/nemo/collections/asr/beam_search_decoder.py b/nemo/collections/asr/beam_search_decoder.py index 7536e9bcaa7b..7c48eb61e88e 100644 --- a/nemo/collections/asr/beam_search_decoder.py +++ b/nemo/collections/asr/beam_search_decoder.py @@ -6,8 +6,7 @@ from nemo.backends.pytorch.nm import NonTrainableNM from nemo.core import DeviceType -from nemo.core.neural_types import (NeuralType, AxisType, BatchTag, TimeTag, - ChannelTag) +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag class BeamSearchDecoderWithLM(NonTrainableNM): @@ -53,10 +52,8 @@ def input_ports(self): 0: AxisType(BatchTag) """ return { - "log_probs": NeuralType({0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag)}), - "log_probs_length": NeuralType({0: AxisType(BatchTag)}) + "log_probs": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "log_probs_length": NeuralType({0: AxisType(BatchTag)}), } @property @@ -66,45 +63,32 @@ def output_ports(self): predictions: NeuralType(None) """ - return { - "predictions": NeuralType(None) - } + return {"predictions": NeuralType(None)} def __init__( - self, *, - vocab, - beam_width, - alpha, - beta, - lm_path, - num_cpus, - cutoff_prob=1.0, - cutoff_top_n=40, - **kwargs): + self, *, vocab, beam_width, alpha, beta, lm_path, num_cpus, cutoff_prob=1.0, cutoff_top_n=40, **kwargs + ): try: from ctc_decoders import Scorer from ctc_decoders import ctc_beam_search_decoder_batch except ModuleNotFoundError: - raise ModuleNotFoundError("BeamSearchDecoderWithLM requires the " - "installation of ctc_decoders " - "from nemo/scripts/install_decoders.py") + raise ModuleNotFoundError( + "BeamSearchDecoderWithLM requires the " + "installation of ctc_decoders " + "from nemo/scripts/install_decoders.py" + ) super().__init__( # Override default placement from neural factory placement=DeviceType.CPU, - **kwargs) + **kwargs + ) if self._factory.world_size > 1: - raise ValueError( - "BeamSearchDecoderWithLM does not run in distributed mode") - - self.scorer = Scorer( - alpha, - beta, - model_path=lm_path, - vocabulary=vocab - ) + raise ValueError("BeamSearchDecoderWithLM does not run in distributed mode") + + self.scorer = Scorer(alpha, beta, model_path=lm_path, vocabulary=vocab) self.beam_search_func = ctc_beam_search_decoder_batch self.vocab = vocab self.beam_width = beam_width @@ -116,7 +100,7 @@ def forward(self, log_probs, log_probs_length): probs = torch.exp(log_probs) probs_list = [] for i, prob in enumerate(probs): - probs_list.append(prob[:log_probs_length[i], :]) + probs_list.append(prob[: log_probs_length[i], :]) res = self.beam_search_func( probs_list, self.vocab, @@ -124,6 +108,6 @@ def forward(self, log_probs, log_probs_length): num_processes=self.num_cpus, ext_scoring_func=self.scorer, cutoff_prob=self.cutoff_prob, - cutoff_top_n=self.cutoff_top_n + cutoff_top_n=self.cutoff_top_n, ) return [res] diff --git a/nemo/collections/asr/data_layer.py b/nemo/collections/asr/data_layer.py index a6a917a58551..647ce2de188a 100644 --- a/nemo/collections/asr/data_layer.py +++ b/nemo/collections/asr/data_layer.py @@ -11,12 +11,7 @@ from nemo.core.neural_types import * from nemo.utils.misc import pad_to -from .parts.dataset import ( - AudioDataset, - seq_collate_fn, - KaldiFeatureDataset, - TranscriptDataset, -) +from .parts.dataset import AudioDataset, KaldiFeatureDataset, TranscriptDataset, seq_collate_fn from .parts.features import WaveformFeaturizer __all__ = [ @@ -106,13 +101,9 @@ def output_ports(self): """ 
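        # Each padded batch tensor below is paired with a per-example length
        # tensor ('a_sig_length', 'transcript_length') so that downstream
        # modules can mask out the padding added during collation.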
return { - 'audio_signal': NeuralType( - {0: AxisType(BatchTag), 1: AxisType(TimeTag)} - ), + 'audio_signal': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), 'a_sig_length': NeuralType({0: AxisType(BatchTag)}), - 'transcripts': NeuralType( - {0: AxisType(BatchTag), 1: AxisType(TimeTag)} - ), + 'transcripts': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), 'transcript_length': NeuralType({0: AxisType(BatchTag)}), } @@ -140,9 +131,7 @@ def __init__( ): super().__init__(**kwargs) - self._featurizer = WaveformFeaturizer( - sample_rate=sample_rate, int_values=int_values, augmentor=None - ) + self._featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=None) # Set up dataset dataset_params = { @@ -162,9 +151,7 @@ def __init__( # Set up data loader if self._placement == DeviceType.AllGpu: nemo.logging.info("Parallelizing Datalayer.") - sampler = torch.utils.data.distributed.DistributedSampler( - self._dataset - ) + sampler = torch.utils.data.distributed.DistributedSampler(self._dataset) else: sampler = None @@ -250,16 +237,10 @@ def output_ports(self): """ return { 'processed_signal': NeuralType( - { - 0: AxisType(BatchTag), - 1: AxisType(SpectrogramSignalTag), - 2: AxisType(ProcessedTimeTag), - } + {0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(ProcessedTimeTag),} ), 'processed_length': NeuralType({0: AxisType(BatchTag)}), - 'transcripts': NeuralType( - {0: AxisType(BatchTag), 1: AxisType(TimeTag)} - ), + 'transcripts': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), 'transcript_length': NeuralType({0: AxisType(BatchTag)}), } @@ -292,9 +273,7 @@ def __init__( # Set up data loader if self._placement == DeviceType.AllGpu: nemo.logging.info("Parallelizing DATALAYER") - sampler = torch.utils.data.distributed.DistributedSampler( - self._dataset - ) + sampler = torch.utils.data.distributed.DistributedSampler(self._dataset) else: sampler = None @@ -420,9 +399,7 @@ def __init__( # Set up data loader if self._placement == DeviceType.AllGpu: - sampler = torch.utils.data.distributed.DistributedSampler( - self._dataset - ) + sampler = torch.utils.data.distributed.DistributedSampler(self._dataset) else: sampler = None @@ -453,10 +430,7 @@ def _collate_fn(batch, pad_id, pad8=False): texts[i].narrow(0, 0, s.size(0)).copy_(s) if len(texts.shape) != 2: - raise ValueError( - f"Texts in collate function have shape {texts.shape}," - f" should have 2 dimensions." 
- ) + raise ValueError(f"Texts in collate function have shape {texts.shape}," f" should have 2 dimensions.") return texts, torch.stack(texts_len) diff --git a/nemo/collections/asr/greedy_ctc_decoder.py b/nemo/collections/asr/greedy_ctc_decoder.py index a3953b540a1e..03eb9862c47b 100644 --- a/nemo/collections/asr/greedy_ctc_decoder.py +++ b/nemo/collections/asr/greedy_ctc_decoder.py @@ -2,8 +2,7 @@ import torch from nemo.backends.pytorch.nm import TrainableNM -from nemo.core.neural_types import (NeuralType, AxisType, BatchTag, TimeTag, - ChannelTag) +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag class GreedyCTCDecoder(TrainableNM): @@ -22,13 +21,7 @@ def input_ports(self): 2: AxisType(ChannelTag) """ - return { - "log_probs": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"log_probs": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} @property def output_ports(self): @@ -39,12 +32,7 @@ def output_ports(self): 1: AxisType(TimeTag) """ - return { - "predictions": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) - } + return {"predictions": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} def __init__(self, **kwargs): TrainableNM.__init__(self, **kwargs) diff --git a/nemo/collections/asr/helpers.py b/nemo/collections/asr/helpers.py index 33c8e9a5d6ab..30200b8bc1f4 100644 --- a/nemo/collections/asr/helpers.py +++ b/nemo/collections/asr/helpers.py @@ -1,6 +1,7 @@ # Copyright (c) 2019 NVIDIA Corporation import torch + import nemo from .metrics import word_error_rate @@ -29,10 +30,7 @@ def __ctc_decoder_predictions_tensor(tensor, labels): return hypotheses -def monitor_asr_train_progress(tensors: list, - labels: list, - eval_metric='WER', - tb_logger=None): +def monitor_asr_train_progress(tensors: list, labels: list, eval_metric='WER', tb_logger=None): """ Takes output of greedy ctc decoder and performs ctc decoding algorithm to remove duplicates and special symbol. 
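    (Greedy CTC decoding collapses repeated predictions and drops the blank
    symbol before indices are mapped back to characters.)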
Prints sample to screen, computes @@ -59,8 +57,7 @@ def monitor_asr_train_progress(tensors: list, target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() reference = ''.join([labels_map[c] for c in target]) references.append(reference) - hypotheses = __ctc_decoder_predictions_tensor( - tensors[1], labels=labels) + hypotheses = __ctc_decoder_predictions_tensor(tensors[1], labels=labels) eval_metric = eval_metric.upper() if eval_metric not in {'WER', 'CER'}: @@ -88,8 +85,7 @@ def __gather_predictions(predictions_list: list, labels: list) -> list: return results -def __gather_transcripts(transcript_list: list, transcript_len_list: list, - labels: list) -> list: +def __gather_transcripts(transcript_list: list, transcript_len_list: list, labels: list) -> list: results = [] labels_map = dict([(i, labels[i]) for i in range(len(labels))]) # iterate over workers @@ -123,8 +119,7 @@ def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list): if kv.startswith('loss'): global_vars['EvalLoss'] += __gather_losses(v) elif kv.startswith('predictions'): - global_vars['predictions'] += __gather_predictions( - v, labels=labels) + global_vars['predictions'] += __gather_predictions(v, labels=labels) elif kv.startswith('transcript_length'): transcript_len_list = v elif kv.startswith('transcript'): @@ -132,14 +127,10 @@ def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list): elif kv.startswith('output'): global_vars['logits'] += v - global_vars['transcripts'] += __gather_transcripts(transcript_list, - transcript_len_list, - labels=labels) + global_vars['transcripts'] += __gather_transcripts(transcript_list, transcript_len_list, labels=labels) -def process_evaluation_epoch(global_vars: dict, - eval_metric='WER', - tag=None): +def process_evaluation_epoch(global_vars: dict, eval_metric='WER', tag=None): """ Calculates the aggregated loss and WER across the entire evaluation dataset """ @@ -152,29 +143,24 @@ def process_evaluation_epoch(global_vars: dict, raise ValueError('eval_metric must be \'WER\' or \'CER\'') use_cer = True if eval_metric == 'CER' else False - wer = word_error_rate(hypotheses=hypotheses, - references=references, - use_cer=use_cer) + wer = word_error_rate(hypotheses=hypotheses, references=references, use_cer=use_cer) if tag is None: nemo.logging.info(f"==========>>>>>>Evaluation Loss: {eloss}") - nemo.logging.info(f"==========>>>>>>Evaluation {eval_metric}: " - f"{wer*100 : 5.2f}%") + nemo.logging.info(f"==========>>>>>>Evaluation {eval_metric}: " f"{wer*100 : 5.2f}%") return {"Evaluation_Loss": eloss, f"Evaluation_{eval_metric}": wer} else: nemo.logging.info(f"==========>>>>>>Evaluation Loss {tag}: {eloss}") - nemo.logging.info(f"==========>>>>>>Evaluation {eval_metric} {tag}: " - f"{wer*100 : 5.2f}%") - return {f"Evaluation_Loss_{tag}": eloss, - f"Evaluation_{eval_metric}_{tag}": wer} + nemo.logging.info(f"==========>>>>>>Evaluation {eval_metric} {tag}: " f"{wer*100 : 5.2f}%") + return { + f"Evaluation_Loss_{tag}": eloss, + f"Evaluation_{eval_metric}_{tag}": wer, + } def post_process_predictions(predictions, labels): return __gather_predictions(predictions, labels=labels) -def post_process_transcripts( - transcript_list, transcript_len_list, labels): - return __gather_transcripts(transcript_list, - transcript_len_list, - labels=labels) +def post_process_transcripts(transcript_list, transcript_len_list, labels): + return __gather_transcripts(transcript_list, transcript_len_list, labels=labels) diff --git a/nemo/collections/asr/jasper.py 
b/nemo/collections/asr/jasper.py index e3589f9de82e..f0e0d95ac75f 100644 --- a/nemo/collections/asr/jasper.py +++ b/nemo/collections/asr/jasper.py @@ -1,15 +1,23 @@ # Copyright (c) 2019 NVIDIA Corporation +from typing import Optional + import torch import torch.nn as nn import torch.nn.functional as F -from typing import Optional from nemo.backends.pytorch.nm import TrainableNM -from nemo.core.neural_types import (NeuralType, AxisType, ChannelTag, BatchTag, - TimeTag, SpectrogramSignalTag, - ProcessedTimeTag, EncodedRepresentationTag) +from nemo.core.neural_types import ( + AxisType, + BatchTag, + ChannelTag, + EncodedRepresentationTag, + NeuralType, + ProcessedTimeTag, + SpectrogramSignalTag, + TimeTag, +) -from .parts.jasper import JasperBlock, jasper_activations, init_weights +from .parts.jasper import JasperBlock, init_weights, jasper_activations class JasperEncoder(TrainableNM): @@ -69,6 +77,7 @@ class JasperEncoder(TrainableNM): 'kaiming_uniform','kaiming_normal']. Defaults to "xavier_uniform". """ + length: Optional[torch.Tensor] @property @@ -86,10 +95,10 @@ def input_ports(self): 0: AxisType(BatchTag) """ return { - "audio_signal": NeuralType({0: AxisType(BatchTag), - 1: AxisType(SpectrogramSignalTag), - 2: AxisType(ProcessedTimeTag)}), - "length": NeuralType({0: AxisType(BatchTag)}) + "audio_signal": NeuralType( + {0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(ProcessedTimeTag),} + ), + "length": NeuralType({0: AxisType(BatchTag)}), } @property @@ -108,27 +117,25 @@ def output_ports(self): """ return { - "outputs": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(EncodedRepresentationTag), - 2: AxisType(ProcessedTimeTag) - }), - - "encoded_lengths": NeuralType({0: AxisType(BatchTag)}) + "outputs": NeuralType( + {0: AxisType(BatchTag), 1: AxisType(EncodedRepresentationTag), 2: AxisType(ProcessedTimeTag),} + ), + "encoded_lengths": NeuralType({0: AxisType(BatchTag)}), } def __init__( - self, *, - jasper, - activation, - feat_in, - normalization_mode="batch", - residual_mode="add", - norm_groups=-1, - conv_mask=True, - frame_splicing=1, - init_mode='xavier_uniform', - **kwargs + self, + *, + jasper, + activation, + feat_in, + normalization_mode="batch", + residual_mode="add", + norm_groups=-1, + conv_mask=True, + frame_splicing=1, + init_mode='xavier_uniform', + **kwargs ): TrainableNM.__init__(self, **kwargs) @@ -148,23 +155,26 @@ def __init__( separable = lcfg.get('separable', False) heads = lcfg.get('heads', -1) encoder_layers.append( - JasperBlock(feat_in, - lcfg['filters'], - repeat=lcfg['repeat'], - kernel_size=lcfg['kernel'], - stride=lcfg['stride'], - dilation=lcfg['dilation'], - dropout=lcfg['dropout'], - residual=lcfg['residual'], - groups=groups, - separable=separable, - heads=heads, - residual_mode=residual_mode, - normalization=normalization_mode, - norm_groups=norm_groups, - activation=activation, - residual_panes=dense_res, - conv_mask=conv_mask)) + JasperBlock( + feat_in, + lcfg['filters'], + repeat=lcfg['repeat'], + kernel_size=lcfg['kernel'], + stride=lcfg['stride'], + dilation=lcfg['dilation'], + dropout=lcfg['dropout'], + residual=lcfg['residual'], + groups=groups, + separable=separable, + heads=heads, + residual_mode=residual_mode, + normalization=normalization_mode, + norm_groups=norm_groups, + activation=activation, + residual_panes=dense_res, + conv_mask=conv_mask, + ) + ) feat_in = lcfg['filters'] self.encoder = nn.Sequential(*encoder_layers) @@ -207,11 +217,9 @@ def input_ports(self): 2: AxisType(ProcessedTimeTag) """ return { - 
"encoder_output": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(EncodedRepresentationTag), - 2: AxisType(ProcessedTimeTag) - }) + "encoder_output": NeuralType( + {0: AxisType(BatchTag), 1: AxisType(EncodedRepresentationTag), 2: AxisType(ProcessedTimeTag),} + ) } @property @@ -225,33 +233,18 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "output": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - def __init__( - self, *, - feat_in, - num_classes, - init_mode="xavier_uniform", - **kwargs - ): + def __init__(self, *, feat_in, num_classes, init_mode="xavier_uniform", **kwargs): TrainableNM.__init__(self, **kwargs) self._feat_in = feat_in # Add 1 for blank char self._num_classes = num_classes + 1 - self.decoder_layers = nn.Sequential( - nn.Conv1d(self._feat_in, self._num_classes, - kernel_size=1, bias=True)) + self.decoder_layers = nn.Sequential(nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True)) self.apply(lambda x: init_weights(x, mode=init_mode)) self.to(self._device) def forward(self, encoder_output): - return F.log_softmax(self.decoder_layers(encoder_output). - transpose(1, 2), dim=-1) + return F.log_softmax(self.decoder_layers(encoder_output).transpose(1, 2), dim=-1) diff --git a/nemo/collections/asr/las/helpers.py b/nemo/collections/asr/las/helpers.py index 17ea86e26aab..fde132321b50 100644 --- a/nemo/collections/asr/las/helpers.py +++ b/nemo/collections/asr/las/helpers.py @@ -2,16 +2,15 @@ from pprint import pformat import torch + import nemo from nemo.backends.pytorch.common.metrics import char_lm_metrics - from nemo.collections.asr.metrics import word_error_rate ENG_MWN = 5.3 -def process_evaluation_batch(tensors, global_vars, labels, specials, - tb_writer=None, write_attn=True): +def process_evaluation_batch(tensors, global_vars, labels, specials, tb_writer=None, write_attn=True): loss, log_probs = ([],) * 2 transcripts, transcript_texts = ([],) * 2 predictions, prediction_texts = ([],) * 2 @@ -32,8 +31,7 @@ def process_evaluation_batch(tensors, global_vars, labels, specials, global_vars.setdefault('loss', []) global_vars['loss'].extend(loss) - bpc, ppl = char_lm_metrics(log_probs, transcripts, - transcript_texts, specials['pad_id']) + bpc, ppl = char_lm_metrics(log_probs, transcripts, transcript_texts, specials['pad_id']) global_vars.setdefault('bpc', []) global_vars['bpc'].extend(bpc) global_vars.setdefault('ppl', []) @@ -49,14 +47,13 @@ def process_evaluation_batch(tensors, global_vars, labels, specials, if sample_len > 0: attention_weights = attention_weights[0][0, :sample_len, :] tb_writer.add_image( - 'image/eval_attention_weights', attention_weights, - dataformats='HW' + 'image/eval_attention_weights', attention_weights, dataformats='HW', ) -def process_evaluation_epoch(global_vars, - metrics=('loss', 'bpc', 'ppl'), calc_wer=False, - mode='eval', tag='none'): +def process_evaluation_epoch( + global_vars, metrics=('loss', 'bpc', 'ppl'), calc_wer=False, mode='eval', tag='none', +): tag = '_'.join(tag.lower().strip().split()) return_dict = {} for metric in metrics: @@ -75,8 +72,7 @@ def process_evaluation_epoch(global_vars, nemo.logging.info(transcript_texts[:10]) nemo.logging.info(prediction_texts[:10]) - wer = word_error_rate(hypotheses=prediction_texts, - references=transcript_texts) + wer = word_error_rate(hypotheses=prediction_texts, references=transcript_texts) 
return_dict[f'metric/{mode}_wer_{tag}'] = wer nemo.logging.info(pformat(return_dict)) @@ -85,16 +81,13 @@ def process_evaluation_epoch(global_vars, def __decode(tensors_list, labels, specials): - labels_map = dict([(i, labels[i]) for i in range(len(labels)) - if i not in set(specials.values())]) + labels_map = dict([(i, labels[i]) for i in range(len(labels)) if i not in set(specials.values())]) results = [] for tensor in tensors_list: tensor = tensor.long().cpu() hypotheses = [] for i in range(tensor.shape[0]): - hypothesis = ''.join([labels_map[c] - for c in tensor[i].numpy().tolist() - if c in labels_map]) + hypothesis = ''.join([labels_map[c] for c in tensor[i].numpy().tolist() if c in labels_map]) hypotheses.append(hypothesis) results.append(hypotheses) diff --git a/nemo/collections/asr/las/misc.py b/nemo/collections/asr/las/misc.py index cc9e745cf827..c1402f517a34 100644 --- a/nemo/collections/asr/las/misc.py +++ b/nemo/collections/asr/las/misc.py @@ -3,9 +3,8 @@ from torch import nn from nemo.backends.pytorch.nm import TrainableNM -from nemo.core.neural_types import NeuralType, AxisType, BatchTag, TimeTag, \ - ChannelTag from nemo.collections.asr.jasper import init_weights as jasper_init_weights +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag class JasperRNNConnector(TrainableNM): @@ -29,13 +28,7 @@ def input_ports(self): 2: AxisType(TimeTag) """ - return { - 'tensor': NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag), - 2: AxisType(TimeTag) - }) - } + return {'tensor': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag),})} @property def output_ports(self): @@ -48,19 +41,12 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - 'tensor': NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {'tensor': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} def __init__(self, in_channels, out_channels, **kwargs): super().__init__(**kwargs) - self.icnn = nn.Conv1d(in_channels, out_channels, - kernel_size=1, bias=True) + self.icnn = nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=True) self.bn = nn.BatchNorm1d(out_channels) self.apply(jasper_init_weights) diff --git a/nemo/collections/asr/losses.py b/nemo/collections/asr/losses.py index 8d4c7a4067d9..47dbaac2b6da 100644 --- a/nemo/collections/asr/losses.py +++ b/nemo/collections/asr/losses.py @@ -3,8 +3,7 @@ import torch.nn as nn from nemo.backends.pytorch.nm import LossNM -from nemo.core.neural_types import (NeuralType, AxisType, BatchTag, TimeTag, - ChannelTag) +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag class CTCLossNM(LossNM): @@ -39,16 +38,10 @@ def input_ports(self): 0: AxisType(BatchTag) """ return { - "log_probs": NeuralType({1: AxisType(TimeTag), - 0: AxisType(BatchTag), - 2: AxisType(ChannelTag)}), - - "targets": NeuralType({0: AxisType(BatchTag), - 1: AxisType(TimeTag)}), - + "log_probs": NeuralType({1: AxisType(TimeTag), 0: AxisType(BatchTag), 2: AxisType(ChannelTag),}), + "targets": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_length": NeuralType({0: AxisType(BatchTag)}), - - "target_length": NeuralType({0: AxisType(BatchTag)}) + "target_length": NeuralType({0: AxisType(BatchTag)}), } @property @@ -58,25 +51,20 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, *, num_classes, 
**kwargs): LossNM.__init__(self, **kwargs) # self._blank = self.local_parameters.get('blank', 0) self._blank = num_classes - self._criterion = nn.CTCLoss(blank=self._blank, - reduction='none') + self._criterion = nn.CTCLoss(blank=self._blank, reduction='none') def _loss(self, log_probs, targets, input_length, target_length): input_length = input_length.long() target_length = target_length.long() targets = targets.long() - loss = self._criterion(log_probs.transpose(1, 0), targets, - input_length, - target_length) + loss = self._criterion(log_probs.transpose(1, 0), targets, input_length, target_length) # note that this is different from reduction = 'mean' # because we are not dividing by target lengths loss = torch.mean(loss) diff --git a/nemo/collections/asr/metrics.py b/nemo/collections/asr/metrics.py index b174a9b9af93..e44263f11198 100644 --- a/nemo/collections/asr/metrics.py +++ b/nemo/collections/asr/metrics.py @@ -25,9 +25,7 @@ def __levenshtein(a: List, b: List) -> int: return current[n] -def word_error_rate(hypotheses: List[str], - references: List[str], - use_cer=False) -> float: +def word_error_rate(hypotheses: List[str], references: List[str], use_cer=False) -> float: """ Computes Average Word Error rate between two texts represented as corresponding lists of string. Hypotheses and references must have same @@ -46,8 +44,8 @@ def word_error_rate(hypotheses: List[str], raise ValueError( "In word error rate calculation, hypotheses and reference" " lists must have the same number of elements. But I got:" - "{0} and {1} correspondingly".format(len(hypotheses), - len(references))) + "{0} and {1} correspondingly".format(len(hypotheses), len(references)) + ) for h, r in zip(hypotheses, references): if use_cer: h_list = list(h) diff --git a/nemo/collections/asr/parts/cleaners.py b/nemo/collections/asr/parts/cleaners.py index 16afdec07b0e..66d0a62fd62f 100755 --- a/nemo/collections/asr/parts/cleaners.py +++ b/nemo/collections/asr/parts/cleaners.py @@ -1,8 +1,8 @@ # Copyright (c) 2019 NVIDIA Corporation -import inflect import re +import inflect from unidecode import unidecode NUM_CHECK = re.compile(r'([$]?)(^|\s)(\S*[0-9]\S*)(?=(\s|$)((\S*)(\s|$))?)') @@ -13,61 +13,65 @@ THREE_CHECK = re.compile(r'([0-9]{3})([.,][0-9]{1,2})?([!.?])?$') DECIMAL_CHECK = re.compile(r'([.,][0-9]{1,2})$') -ABBREVIATIONS_COMMON = [(re.compile('\\b%s\\.' % x[0]), x[1]) for x in - [ - ("ms", "miss"), - ("mrs", "misess"), - ("mr", "mister"), - ("messrs", "messeurs"), - ("dr", "doctor"), - ("drs", "doctors"), - ("st", "saint"), - ("co", "company"), - ("jr", "junior"), - ("sr", "senior"), - ("rev", "reverend"), - ("hon", "honorable"), - ("sgt", "sergeant"), - ("capt", "captain"), - ("maj", "major"), - ("col", "colonel"), - ("lt", "lieutenant"), - ("gen", "general"), - ("prof", "professor"), - ("lb", "pounds"), - ("rep", "representative"), - ("st", "street"), - ("ave", "avenue"), - ("etc", "et cetera"), - ("jan", "january"), - ("feb", "february"), - ("mar", "march"), - ("apr", "april"), - ("jun", "june"), - ("jul", "july"), - ("aug", "august"), - ("sep", "september"), - ("oct", "october"), - ("nov", "november"), - ("dec", "december"), - ]] - -ABBREVIATIONS_EXPANDED = [(re.compile('\\b%s\\.' 
% x[0]), x[1]) for x in - [ - ("ltd", "limited"), - ("fig", "figure"), - ("figs", "figures"), - ("gent", "gentlemen"), - ("ft", "fort"), - ("esq", "esquire"), - ("prep", "preperation"), - ("bros", "brothers"), - ("ind", "independent"), - ("mme", "madame"), - ("pro", "professional"), - ("vs", "versus"), - ("inc", "include"), - ]] +ABBREVIATIONS_COMMON = [ + (re.compile('\\b%s\\.' % x[0]), x[1]) + for x in [ + ("ms", "miss"), + ("mrs", "misess"), + ("mr", "mister"), + ("messrs", "messeurs"), + ("dr", "doctor"), + ("drs", "doctors"), + ("st", "saint"), + ("co", "company"), + ("jr", "junior"), + ("sr", "senior"), + ("rev", "reverend"), + ("hon", "honorable"), + ("sgt", "sergeant"), + ("capt", "captain"), + ("maj", "major"), + ("col", "colonel"), + ("lt", "lieutenant"), + ("gen", "general"), + ("prof", "professor"), + ("lb", "pounds"), + ("rep", "representative"), + ("st", "street"), + ("ave", "avenue"), + ("etc", "et cetera"), + ("jan", "january"), + ("feb", "february"), + ("mar", "march"), + ("apr", "april"), + ("jun", "june"), + ("jul", "july"), + ("aug", "august"), + ("sep", "september"), + ("oct", "october"), + ("nov", "november"), + ("dec", "december"), + ] +] + +ABBREVIATIONS_EXPANDED = [ + (re.compile('\\b%s\\.' % x[0]), x[1]) + for x in [ + ("ltd", "limited"), + ("fig", "figure"), + ("figs", "figures"), + ("gent", "gentlemen"), + ("ft", "fort"), + ("esq", "esquire"), + ("prep", "preperation"), + ("bros", "brothers"), + ("ind", "independent"), + ("mme", "madame"), + ("pro", "professional"), + ("vs", "versus"), + ("inc", "include"), + ] +] inflect = inflect.engine() @@ -86,8 +90,7 @@ def clean_text(string, table, punctuation_to_replace): def warn_common_chars(string): if re.search(r'[£€]', string): - print("WARNING: Your transcript contains one of '£' or '€' which we do" - "not currently handle") + print("WARNING: Your transcript contains one of '£' or '€' which we do" "not currently handle") def clean_numbers(string): @@ -107,14 +110,12 @@ def clean_abbreviations(string, expanded=False): def clean_punctuations(string, table, punctuation_to_replace): for punc, replacement in punctuation_to_replace.items(): - string = re.sub('\\{}'.format(punc), - " {} ".format(replacement), - string) + string = re.sub('\\{}'.format(punc), " {} ".format(replacement), string) string = string.translate(table) return string -class NumberCleaner(): +class NumberCleaner: def __init__(self): super().__init__() self.reset() @@ -141,6 +142,7 @@ def format_final_number(self, whole_num, decimal): # Check if there are non-numbers def convert_to_word(match): return " " + inflect.number_to_words(match.group(0)) + " " + return re.sub(r'[0-9,]+', convert_to_word, whole_num) def clean(self, match): @@ -183,6 +185,6 @@ def clean(self, match): decimal_match = DECIMAL_CHECK.search(whole_num) if decimal_match: decimal = decimal_match.group(1)[1:] - whole_num = whole_num[:-len(decimal) - 1] + whole_num = whole_num[: -len(decimal) - 1] whole_num = re.sub(r'\.', '', whole_num) return ws + self.format_final_number(whole_num, decimal) diff --git a/nemo/collections/asr/parts/collections.py b/nemo/collections/asr/parts/collections.py index 312d0c6dbc31..39b4bca0c524 100644 --- a/nemo/collections/asr/parts/collections.py +++ b/nemo/collections/asr/parts/collections.py @@ -1,13 +1,12 @@ # Copyright (c) 2019 NVIDIA Corporation import collections import os -from typing import Optional, List, Union +from typing import List, Optional, Union import pandas as pd import nemo -from nemo.collections.asr.parts import manifest -from 
nemo.collections.asr.parts import parsers +from nemo.collections.asr.parts import manifest, parsers class _Collection(collections.UserList): @@ -77,10 +76,7 @@ def __parse_texts(file: str) -> List[str]: class AudioText(_Collection): """List of audio-transcript text correspondence with preprocessing.""" - OUTPUT_TYPE = collections.namedtuple( - typename='AudioTextEntity', - field_names='audio_file duration text_tokens', - ) + OUTPUT_TYPE = collections.namedtuple(typename='AudioTextEntity', field_names='audio_file duration text_tokens',) def __init__( self, @@ -133,8 +129,7 @@ def __init__( data.sort(key=lambda entity: entity.duration) nemo.logging.info( - "Filtered duration for loading collection is %f.", - duration_filtered, + "Filtered duration for loading collection is %f.", duration_filtered, ) super().__init__(data) @@ -143,9 +138,7 @@ def __init__( class ASRAudioText(AudioText): """`AudioText` collector from asr structured json files.""" - def __init__( - self, manifests_files: Union[str, List[str]], *args, **kwargs - ): + def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): """Parse lists of audio files, durations and transcripts texts. Args: diff --git a/nemo/collections/asr/parts/dataset.py b/nemo/collections/asr/parts/dataset.py index f6905e2218b1..c36f04432c6a 100644 --- a/nemo/collections/asr/parts/dataset.py +++ b/nemo/collections/asr/parts/dataset.py @@ -8,8 +8,7 @@ from torch.utils.data import Dataset import nemo -from nemo.collections.asr.parts import collections -from nemo.collections.asr.parts import parsers +from nemo.collections.asr.parts import collections, parsers def seq_collate_fn(batch, token_pad_value=0): @@ -40,9 +39,7 @@ def seq_collate_fn(batch, token_pad_value=0): tokens_i_len = tokens_i_len.item() if tokens_i_len < max_tokens_len: pad = (0, max_tokens_len - tokens_i_len) - tokens_i = torch.nn.functional.pad( - tokens_i, pad, value=token_pad_value - ) + tokens_i = torch.nn.functional.pad(tokens_i, pad, value=token_pad_value) tokens.append(tokens_i) if has_audio: @@ -139,10 +136,7 @@ def __init__( self.collection = collections.ASRAudioText( manifests_files=manifest_filepath.split(','), parser=parsers.ENCharParser( - labels=labels, - unk_id=unk_index, - blank_id=blank_index, - do_normalize=normalize, + labels=labels, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize, ), min_duration=min_duration, max_duration=max_duration, @@ -158,12 +152,7 @@ def __init__( def __getitem__(self, index): sample = self.collection[index] if self.load_audio: - features = self.featurizer.process( - sample.audio_file, - offset=0, - duration=sample.duration, - trim=self.trim, - ) + features = self.featurizer.process(sample.audio_file, offset=0, duration=sample.duration, trim=self.trim,) f, fl = features, torch.tensor(features.shape[0]).long() else: f, fl = None, None @@ -225,10 +214,7 @@ def __init__( # Read Kaldi features (MFCC, PLP) using feats.scp feats_path = os.path.join(kaldi_dir, 'feats.scp') - id2feats = { - utt_id: torch.from_numpy(feats) - for utt_id, feats in kaldi_io.read_mat_scp(feats_path) - } + id2feats = {utt_id: torch.from_numpy(feats) for utt_id, feats in kaldi_io.read_mat_scp(feats_path)} # Get durations, if utt2dur exists utt2dur_path = os.path.join(kaldi_dir, 'utt2dur') @@ -245,15 +231,12 @@ def __init__( ) else: nemo.logging.info( - f"Did not find utt2dur when loading data from " - f"{kaldi_dir}. Skipping dataset duration calculations." + f"Did not find utt2dur when loading data from " f"{kaldi_dir}. 
Skipping dataset duration calculations." ) # Match transcripts to features text_path = os.path.join(kaldi_dir, 'text') - parser = parsers.make_parser( - labels, 'en', unk_id=unk_index, blank_id=self.blank_index - ) + parser = parsers.make_parser(labels, 'en', unk_id=unk_index, blank_id=self.blank_index) with open(text_path, 'r') as f: for line in f: split_idx = line.find(' ') @@ -296,8 +279,7 @@ def __init__( if id2dur: # utt2dur durations are in seconds nemo.logging.info( - f"Dataset loaded with {duration / 60 : .2f} hours. " - f"Filtered {filtered_duration / 60 : .2f} hours." + f"Dataset loaded with {duration / 60 : .2f} hours. " f"Filtered {filtered_duration / 60 : .2f} hours." ) self.data = data diff --git a/nemo/collections/asr/parts/features.py b/nemo/collections/asr/parts/features.py index 679b38d114f8..b83d5b8c21c9 100644 --- a/nemo/collections/asr/parts/features.py +++ b/nemo/collections/asr/parts/features.py @@ -1,27 +1,27 @@ # Taken straight from Patter https://github.com/ryanleary/patter # TODO: review, and copyright and fix/add comments import math + import librosa import torch import torch.nn as nn -from .perturb import AudioAugmentor -from .segment import AudioSegment from torch_stft import STFT import nemo +from .perturb import AudioAugmentor +from .segment import AudioSegment + CONSTANT = 1e-5 def normalize_batch(x, seq_len, normalize_type): if normalize_type == "per_feature": - x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, - device=x.device) - x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, - device=x.device) + x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) + x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) for i in range(x.shape[0]): - x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) - x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) # make sure x_std is not zero x_std += CONSTANT return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) @@ -29,16 +29,15 @@ def normalize_batch(x, seq_len, normalize_type): x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) for i in range(x.shape[0]): - x_mean[i] = x[i, :, :seq_len[i].item()].mean() - x_std[i] = x[i, :, :seq_len[i].item()].std() + x_mean[i] = x[i, :, : seq_len[i].item()].mean() + x_std[i] = x[i, :, : seq_len[i].item()].std() # make sure x_std is not zero x_std += CONSTANT return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type: x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device) x_std = torch.tensor(normalize_type["fixed_std"], device=x.device) - return ((x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / - x_std.view(x.shape[0], x.shape[1]).unsqueeze(2)) + return (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2) else: return x @@ -58,8 +57,7 @@ def splice_frames(x, frame_splicing): class WaveformFeaturizer(object): def __init__(self, sample_rate=16000, int_values=False, augmentor=None): - self.augmentor = augmentor if augmentor is not None else \ - AudioAugmentor() + self.augmentor = augmentor if augmentor is not None else AudioAugmentor() self.sample_rate = sample_rate self.int_values = int_values @@ -71,7 +69,10 @@ def process(self, file_path, offset=0, duration=0, trim=False): 
file_path, target_sr=self.sample_rate, int_values=self.int_values, - offset=offset, duration=duration, trim=trim) + offset=offset, + duration=duration, + trim=trim, + ) return self.process_segment(audio) def process_segment(self, audio_segment): @@ -88,8 +89,7 @@ def from_config(cls, input_config, perturbation_configs=None): sample_rate = input_config.get("sample_rate", 16000) int_values = input_config.get("int_values", False) - return cls(sample_rate=sample_rate, int_values=int_values, - augmentor=aa) + return cls(sample_rate=sample_rate, int_values=int_values, augmentor=aa) class FeaturizerFactory(object): @@ -98,46 +98,51 @@ def __init__(self): @classmethod def from_config(cls, input_cfg, perturbation_configs=None): - return WaveformFeaturizer.from_config( - input_cfg, - perturbation_configs=perturbation_configs) + return WaveformFeaturizer.from_config(input_cfg, perturbation_configs=perturbation_configs) class FilterbankFeatures(nn.Module): """Featurizer that converts wavs to Mel Spectrograms. See AudioToMelSpectrogramPreprocessor for args. """ + def __init__( - self, *, - sample_rate=16000, - n_window_size=320, - n_window_stride=160, - window="hann", - normalize="per_feature", - n_fft=None, - preemph=0.97, - nfilt=64, - lowfreq=0, - highfreq=None, - log=True, - log_zero_guard_type="add", - log_zero_guard_value=2**-24, - dither=CONSTANT, - pad_to=16, - max_duration=16.7, - frame_splicing=1, - stft_conv=False, - pad_value=0, - mag_power=2., + self, + *, + sample_rate=16000, + n_window_size=320, + n_window_stride=160, + window="hann", + normalize="per_feature", + n_fft=None, + preemph=0.97, + nfilt=64, + lowfreq=0, + highfreq=None, + log=True, + log_zero_guard_type="add", + log_zero_guard_value=2 ** -24, + dither=CONSTANT, + pad_to=16, + max_duration=16.7, + frame_splicing=1, + stft_conv=False, + pad_value=0, + mag_power=2.0, ): super(FilterbankFeatures, self).__init__() - if (n_window_size is None or n_window_stride is None - or not isinstance(n_window_size, int) - or not isinstance(n_window_stride, int) - or n_window_size <= 0 or n_window_stride <= 0): + if ( + n_window_size is None + or n_window_stride is None + or not isinstance(n_window_size, int) + or not isinstance(n_window_stride, int) + or n_window_size <= 0 + or n_window_stride <= 0 + ): raise ValueError( f"{self} got an invalid value for either n_window_size or " - f"n_window_stride. Both must be positive ints.") + f"n_window_stride. Both must be positive ints." 
+ ) nemo.logging.info(f"PADDING: {pad_to}") self.win_length = n_window_size @@ -156,8 +161,7 @@ def __init__(self, *params, **kw_params): def forward(self, input_data): return super(STFTPatch, self).transform(input_data)[0] - self.stft = STFTPatch(self.n_fft, self.hop_length, - self.win_length, window) + self.stft = STFTPatch(self.n_fft, self.hop_length, self.win_length, window) else: print("STFT using torch") @@ -169,15 +173,16 @@ def forward(self, input_data): 'none': None, } window_fn = torch_windows.get(window, None) - window_tensor = window_fn(self.win_length, - periodic=False) if window_fn else None + window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None self.register_buffer("window", window_tensor) self.stft = lambda x: torch.stft( - x, n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=self.win_length, - center=True, - window=self.window.to(dtype=torch.float)) + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + center=True, + window=self.window.to(dtype=torch.float), + ) self.normalize = normalize self.log = log @@ -189,16 +194,15 @@ def forward(self, input_data): highfreq = highfreq or sample_rate / 2 filterbanks = torch.tensor( - librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, - fmin=lowfreq, fmax=highfreq), - dtype=torch.float).unsqueeze(0) + librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq,), + dtype=torch.float, + ).unsqueeze(0) # self.fb = filterbanks # self.window = window_tensor self.register_buffer("fb", filterbanks) # Calculate maximum sequence length - max_length = self.get_seq_len( - torch.tensor(max_duration * sample_rate, dtype=torch.float)) + max_length = self.get_seq_len(torch.tensor(max_duration * sample_rate, dtype=torch.float)) max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0 self.max_length = max_length + max_pad self.pad_value = pad_value @@ -210,7 +214,8 @@ def forward(self, input_data): raise ValueError( f"{self} received {log_zero_guard_type} for the " f"log_zero_guard_type parameter. It must be either 'add' or " - f"'clamp'.") + f"'clamp'." + ) # log_zero_guard_value is the the small we want to use, we support # an actual number, or "tiny", or "eps" self.log_zero_guard_value = lambda _: log_zero_guard_value @@ -223,7 +228,8 @@ def forward(self, input_data): raise ValueError( f"{self} received {log_zero_guard_value} for the " f"log_zero_guard_type parameter. 
It must be either a " - f"number, 'tiny', or 'eps'") + f"number, 'tiny', or 'eps'" + ) self.log_zero_guard_type = log_zero_guard_type def get_seq_len(self, seq_len): @@ -243,14 +249,12 @@ def forward(self, x, seq_len): # do preemphasis if self.preemph is not None: - x = torch.cat( - (x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), - dim=1) + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1,) x = self.stft(x) # get power spectrum - if self.mag_power != 1.: + if self.mag_power != 1.0: x = x.pow(self.mag_power) if not self.stft_conv: x = x.sum(-1) @@ -280,19 +284,15 @@ def forward(self, x, seq_len): max_len = x.size(-1) mask = torch.arange(max_len).to(x.device) mask = mask.expand(x.size(0), max_len) >= seq_len.unsqueeze(1) - x = x.masked_fill( - mask.unsqueeze(1).type(torch.bool).to(device=x.device), - self.pad_value) + x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value,) del mask pad_to = self.pad_to if not self.training: pad_to = 16 if pad_to == "max": - x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), - value=self.pad_value) + x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), value=self.pad_value) elif pad_to > 0: pad_amt = x.size(-1) % pad_to if pad_amt != 0: - x = nn.functional.pad(x, (0, pad_to - pad_amt), - value=self.pad_value) + x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value) return x diff --git a/nemo/collections/asr/parts/jasper.py b/nemo/collections/asr/parts/jasper.py index 8e367bc598e1..574926db4356 100644 --- a/nemo/collections/asr/parts/jasper.py +++ b/nemo/collections/asr/parts/jasper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple, Optional, List +from typing import List, Optional, Tuple import torch import torch.nn as nn @@ -60,9 +60,19 @@ def get_same_padding(kernel_size, stride, dilation): class MaskedConv1d(nn.Module): __constants__ = ["use_conv_mask", "real_out_channels", "heads"] - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, heads=-1, bias=False, - use_mask=True): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + heads=-1, + bias=False, + use_mask=True, + ): super(MaskedConv1d, self).__init__() if not (heads == -1 or groups == in_channels): @@ -74,27 +84,30 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, out_channels = heads groups = heads - self.conv = nn.Conv1d(in_channels, out_channels, - kernel_size, - stride=stride, - padding=padding, dilation=dilation, - groups=groups, bias=bias) + self.conv = nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) self.use_mask = use_mask self.heads = heads def get_seq_len(self, lens): - return ((lens + 2 * self.conv.padding[0] - self.conv.dilation[0] * ( - self.conv.kernel_size[0] - 1) - 1) / self.conv.stride[0] + 1) + return ( + lens + 2 * self.conv.padding[0] - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1 + ) / self.conv.stride[0] + 1 def forward(self, x, lens): if self.use_mask: lens = lens.to(dtype=torch.long) max_len = x.size(2) - mask = torch.arange(max_len).to(lens.device) \ - .expand(len(lens), max_len) >= lens.unsqueeze(1) - x = x.masked_fill( - mask.unsqueeze(1).to(device=x.device), 0 - ) + mask = 
torch.arange(max_len).to(lens.device).expand(len(lens), max_len) >= lens.unsqueeze(1) + x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) # del mask lens = self.get_seq_len(lens) @@ -111,7 +124,6 @@ def forward(self, x, lens): class GroupShuffle(nn.Module): - def __init__(self, groups, channels): super(GroupShuffle, self).__init__() @@ -133,12 +145,27 @@ def forward(self, x): class JasperBlock(nn.Module): __constants__ = ["conv_mask", "separable", "residual_mode", "res", "mconv"] - def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1, - dilation=1, padding='same', dropout=0.2, activation=None, - residual=True, groups=1, separable=False, - heads=-1, normalization="batch", - norm_groups=1, residual_mode='add', - residual_panes=[], conv_mask=False): + def __init__( + self, + inplanes, + planes, + repeat=3, + kernel_size=11, + stride=1, + dilation=1, + padding='same', + dropout=0.2, + activation=None, + residual=True, + groups=1, + separable=False, + heads=-1, + normalization="batch", + norm_groups=1, + residual_mode='add', + residual_panes=[], + conv_mask=False, + ): super(JasperBlock, self).__init__() if padding != "same": @@ -153,7 +180,28 @@ def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1, conv = nn.ModuleList() for _ in range(repeat - 1): - conv.extend(self._get_conv_bn_layer( + conv.extend( + self._get_conv_bn_layer( + inplanes_loop, + planes, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding_val, + groups=groups, + heads=heads, + separable=separable, + normalization=normalization, + norm_groups=norm_groups, + ) + ) + + conv.extend(self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) + + inplanes_loop = planes + + conv.extend( + self._get_conv_bn_layer( inplanes_loop, planes, kernel_size=kernel_size, @@ -164,26 +212,9 @@ def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1, heads=heads, separable=separable, normalization=normalization, - norm_groups=norm_groups)) - - conv.extend(self._get_act_dropout_layer( - drop_prob=dropout, - activation=activation)) - - inplanes_loop = planes - - conv.extend(self._get_conv_bn_layer( - inplanes_loop, - planes, - kernel_size=kernel_size, - stride=stride, - dilation=dilation, - padding=padding_val, - groups=groups, - heads=heads, - separable=separable, - normalization=normalization, - norm_groups=norm_groups)) + norm_groups=norm_groups, + ) + ) self.mconv = conv @@ -196,78 +227,126 @@ def __init__(self, inplanes, planes, repeat=3, kernel_size=11, stride=1, res_panes = [inplanes] self.dense_residual = False for ip in res_panes: - res_list.append(nn.ModuleList(self._get_conv_bn_layer( - ip, - planes, - kernel_size=1, - normalization=normalization, - norm_groups=norm_groups))) + res_list.append( + nn.ModuleList( + self._get_conv_bn_layer( + ip, planes, kernel_size=1, normalization=normalization, norm_groups=norm_groups, + ) + ) + ) self.res = res_list else: self.res = None - self.mout = nn.Sequential( - *self._get_act_dropout_layer( - drop_prob=dropout, - activation=activation) - ) - - def _get_conv(self, in_channels, out_channels, kernel_size=11, - stride=1, dilation=1, padding=0, bias=False, - groups=1, heads=-1, separable=False): + self.mout = nn.Sequential(*self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) + + def _get_conv( + self, + in_channels, + out_channels, + kernel_size=11, + stride=1, + dilation=1, + padding=0, + bias=False, + groups=1, + heads=-1, + separable=False, + ): use_mask = self.conv_mask if 
use_mask: - return MaskedConv1d(in_channels, out_channels, kernel_size, - stride=stride, - dilation=dilation, padding=padding, bias=bias, - groups=groups, heads=heads, - use_mask=use_mask) + return MaskedConv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias, + groups=groups, + heads=heads, + use_mask=use_mask, + ) else: - return nn.Conv1d(in_channels, out_channels, kernel_size, - stride=stride, - dilation=dilation, padding=padding, bias=bias, - groups=groups) - - def _get_conv_bn_layer(self, in_channels, out_channels, kernel_size=11, - stride=1, dilation=1, padding=0, bias=False, - groups=1, heads=-1, separable=False, - normalization="batch", norm_groups=1): + return nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias, + groups=groups, + ) + + def _get_conv_bn_layer( + self, + in_channels, + out_channels, + kernel_size=11, + stride=1, + dilation=1, + padding=0, + bias=False, + groups=1, + heads=-1, + separable=False, + normalization="batch", + norm_groups=1, + ): if norm_groups == -1: norm_groups = out_channels if separable: layers = [ - self._get_conv(in_channels, in_channels, kernel_size, - stride=stride, - dilation=dilation, padding=padding, bias=bias, - groups=in_channels, heads=heads), - self._get_conv(in_channels, out_channels, kernel_size=1, - stride=1, - dilation=1, padding=0, bias=bias, groups=groups) + self._get_conv( + in_channels, + in_channels, + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias, + groups=in_channels, + heads=heads, + ), + self._get_conv( + in_channels, + out_channels, + kernel_size=1, + stride=1, + dilation=1, + padding=0, + bias=bias, + groups=groups, + ), ] else: layers = [ - self._get_conv(in_channels, out_channels, kernel_size, - stride=stride, - dilation=dilation, padding=padding, bias=bias, - groups=groups) + self._get_conv( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias, + groups=groups, + ) ] if normalization == "group": - layers.append(nn.GroupNorm( - num_groups=norm_groups, num_channels=out_channels)) + layers.append(nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels)) elif normalization == "instance": - layers.append(nn.GroupNorm( - num_groups=out_channels, num_channels=out_channels)) + layers.append(nn.GroupNorm(num_groups=out_channels, num_channels=out_channels)) elif normalization == "layer": - layers.append(nn.GroupNorm( - num_groups=1, num_channels=out_channels)) + layers.append(nn.GroupNorm(num_groups=1, num_channels=out_channels)) elif normalization == "batch": layers.append(nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1)) else: raise ValueError( - f"Normalization method ({normalization}) does not match" - f" one of [batch, layer, group, instance].") + f"Normalization method ({normalization}) does not match" f" one of [batch, layer, group, instance]." 
+ ) if groups > 1: layers.append(GroupShuffle(groups, out_channels)) @@ -276,10 +355,7 @@ def _get_conv_bn_layer(self, in_channels, out_channels, kernel_size=11, def _get_act_dropout_layer(self, drop_prob=0.2, activation=None): if activation is None: activation = nn.Hardtanh(min_val=0.0, max_val=20.0) - layers = [ - activation, - nn.Dropout(p=drop_prob) - ] + layers = [activation, nn.Dropout(p=drop_prob)] return layers def forward(self, input_: Tuple[List[Tensor], Optional[Tensor]]): diff --git a/nemo/collections/asr/parts/manifest.py b/nemo/collections/asr/parts/manifest.py index 665494c15217..d91b2f80ed28 100644 --- a/nemo/collections/asr/parts/manifest.py +++ b/nemo/collections/asr/parts/manifest.py @@ -1,29 +1,23 @@ # Copyright (c) 2019 NVIDIA Corporation import json -from typing import Union, Iterator, Dict, Any, List +from typing import Any, Dict, Iterator, List, Union class ManifestBase: def __init__(self, *args, **kwargs): raise ValueError( - "This class is deprecated, look at " - "https://github.com/NVIDIA/NeMo/pull/284 for " - "correct behaviour." + "This class is deprecated, look at " "https://github.com/NVIDIA/NeMo/pull/284 for " "correct behaviour." ) class ManifestEN: def __init__(self, *args, **kwargs): raise ValueError( - "This class is deprecated, look at " - "https://github.com/NVIDIA/NeMo/pull/284 for " - "correct behaviour." + "This class is deprecated, look at " "https://github.com/NVIDIA/NeMo/pull/284 for " "correct behaviour." ) -def item_iter( - manifests_files: Union[str, List[str]] -) -> Iterator[Dict[str, Any]]: +def item_iter(manifests_files: Union[str, List[str]]) -> Iterator[Dict[str, Any]]: """Iterate through json lines of provided manifests. NeMo ASR pipelines often assume certain manifest files structure. In @@ -65,15 +59,13 @@ def __parse_item(line: str) -> Dict[str, Any]: item['audio_file'] = item.pop('audio_filepath') else: raise ValueError( - f"Manifest file {manifest_file} has invalid json line " - f"structure: {line} without proper audio file key." + f"Manifest file {manifest_file} has invalid json line " f"structure: {line} without proper audio file key." ) # Duration. if 'duration' not in item: raise ValueError( - f"Manifest file {manifest_file} has invalid json line " - f"structure: {line} without proper duration key." + f"Manifest file {manifest_file} has invalid json line " f"structure: {line} without proper duration key." ) # Text. @@ -84,15 +76,11 @@ def __parse_item(line: str) -> Dict[str, Any]: item['text'] = f.read().replace('\n', '') else: raise ValueError( - f"Manifest file {manifest_file} has invalid json line " - f"structure: {line} without proper text key." + f"Manifest file {manifest_file} has invalid json line " f"structure: {line} without proper text key." 
) item = dict( - audio_file=item['audio_file'], - duration=item['duration'], - text=item['text'], - offset=item.get('offset', None), + audio_file=item['audio_file'], duration=item['duration'], text=item['text'], offset=item.get('offset', None), ) return item diff --git a/nemo/collections/asr/parts/parsers.py b/nemo/collections/asr/parts/parsers.py index 1a139728e966..8ce1a2c9932b 100644 --- a/nemo/collections/asr/parts/parsers.py +++ b/nemo/collections/asr/parts/parsers.py @@ -1,8 +1,9 @@ # Copyright (c) 2019 NVIDIA Corporation import string -from typing import Optional, List +from typing import List, Optional import frozendict + from nemo.collections.asr.parts import cleaners @@ -45,9 +46,7 @@ def __init__( self._do_lowercase = do_lowercase self._labels_map = {label: index for index, label in enumerate(labels)} - self._special_labels = set( - [label for label in labels if len(label) > 1] - ) + self._special_labels = set([label for label in labels if len(label) > 1]) def __call__(self, text: str) -> Optional[List[int]]: if self._do_normalize: @@ -90,9 +89,7 @@ def _tokenize(self, text: str) -> List[int]: class ENCharParser(CharParser): """Incorporates english-specific parsing logic.""" - PUNCTUATION_TO_REPLACE = frozendict.frozendict( - {'+': 'plus', '&': 'and', '%': 'percent'} - ) + PUNCTUATION_TO_REPLACE = frozendict.frozendict({'+': 'plus', '&': 'and', '%': 'percent'}) def __init__(self, *args, **kwargs): """Creates english-specific mapping char parser. @@ -125,9 +122,7 @@ def _normalize(self, text: str) -> Optional[str]: # noinspection PyBroadException try: text = cleaners.clean_text( - string=text, - table=self._table, - punctuation_to_replace=self.PUNCTUATION_TO_REPLACE, + string=text, table=self._table, punctuation_to_replace=self.PUNCTUATION_TO_REPLACE, ) except Exception: return None @@ -135,14 +130,10 @@ def _normalize(self, text: str) -> Optional[str]: return text -NAME_TO_PARSER = frozendict.frozendict( - {'base': CharParser, 'en': ENCharParser} -) +NAME_TO_PARSER = frozendict.frozendict({'base': CharParser, 'en': ENCharParser}) -def make_parser( - labels: Optional[List[str]] = None, name: str = 'base', **kwargs, -) -> CharParser: +def make_parser(labels: Optional[List[str]] = None, name: str = 'base', **kwargs,) -> CharParser: """Creates parser from labels, set of arguments and concise parser name. 
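
    For example, make_parser(labels=[' ', 'a', 'b', 'c'], name='en') would
    return an ENCharParser that normalizes English text (numbers,
    abbreviations, punctuation) before mapping characters to label indices;
    the label list here is illustrative only.
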
Args: diff --git a/nemo/collections/asr/parts/perturb.py b/nemo/collections/asr/parts/perturb.py index fb8ff48b05fd..63c16f53baad 100644 --- a/nemo/collections/asr/parts/perturb.py +++ b/nemo/collections/asr/parts/perturb.py @@ -5,8 +5,7 @@ import librosa from scipy import signal -from nemo.collections.asr.parts import collections -from nemo.collections.asr.parts import parsers +from nemo.collections.asr.parts import collections, parsers from .segment import AudioSegment @@ -50,20 +49,14 @@ def perturb(self, data): class ImpulsePerturbation(Perturbation): def __init__(self, manifest_path=None, rng=None): - self._manifest = collections.ASRAudioText( - manifest_path, parser=parsers.make_parser([]) - ) + self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([])) self._rng = random.Random() if rng is None else rng def perturb(self, data): impulse_record = self._rng.sample(self._manifest.data, 1)[0] - impulse = AudioSegment.from_file( - impulse_record['audio_filepath'], target_sr=data.sample_rate - ) + impulse = AudioSegment.from_file(impulse_record['audio_filepath'], target_sr=data.sample_rate) # print("DEBUG: impulse:", impulse_record['audio_filepath']) - data._samples = signal.fftconvolve( - data.samples, impulse.samples, "full" - ) + data._samples = signal.fftconvolve(data.samples, impulse.samples, "full") class ShiftPerturbation(Perturbation): @@ -89,16 +82,9 @@ def perturb(self, data): class NoisePerturbation(Perturbation): def __init__( - self, - manifest_path=None, - min_snr_db=40, - max_snr_db=50, - max_gain_db=300.0, - rng=None, + self, manifest_path=None, min_snr_db=40, max_snr_db=50, max_gain_db=300.0, rng=None, ): - self._manifest = collections.ASRAudioText( - manifest_path, parser=parsers.make_parser([]) - ) + self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([])) self._rng = random.Random() if rng is None else rng self._min_snr_db = min_snr_db self._max_snr_db = max_snr_db @@ -107,20 +93,14 @@ def __init__( def perturb(self, data): snr_db = self._rng.uniform(self._min_snr_db, self._max_snr_db) noise_record = self._rng.sample(self._manifest.data, 1)[0] - noise = AudioSegment.from_file( - noise_record['audio_filepath'], target_sr=data.sample_rate - ) - noise_gain_db = min( - data.rms_db - noise.rms_db - snr_db, self._max_gain_db - ) + noise = AudioSegment.from_file(noise_record['audio_filepath'], target_sr=data.sample_rate) + noise_gain_db = min(data.rms_db - noise.rms_db - snr_db, self._max_gain_db) # print("DEBUG: noise:", snr_db, noise_gain_db, noise_record[ # 'audio_filepath']) # calculate noise segment to use start_time = self._rng.uniform(0.0, noise.duration - data.duration) - noise.subsegment( - start_time=start_time, end_time=start_time + data.duration - ) + noise.subsegment(start_time=start_time, end_time=start_time + data.duration) # adjust gain for snr purposes and superimpose noise.gain_db(noise_gain_db) diff --git a/nemo/collections/asr/parts/segment.py b/nemo/collections/asr/parts/segment.py index 9f25f1b7c938..fba34b196f78 100644 --- a/nemo/collections/asr/parts/segment.py +++ b/nemo/collections/asr/parts/segment.py @@ -16,8 +16,7 @@ class AudioSegment(object): :raises TypeError: If the sample data type is not float or int. """ - def __init__(self, samples, sample_rate, target_sr=None, trim=False, - trim_db=60): + def __init__(self, samples, sample_rate, target_sr=None, trim=False, trim_db=60): """Create audio segment from samples. Samples are convert float32 internally, with int scaled to [-1, 1]. 
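        For instance, int16 input is scaled by 1 / 2**15, so a full-scale
        PCM sample maps to roughly +/-1.0.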
""" @@ -50,9 +49,13 @@ def __ne__(self, other): def __str__(self): """Return human-readable representation of segment.""" - return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " - "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, - self.duration, self.rms_db)) + return "%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " "rms=%.2fdB" % ( + type(self), + self.num_samples, + self.sample_rate, + self.duration, + self.rms_db, + ) @staticmethod def _convert_samples_to_float32(samples): @@ -63,7 +66,7 @@ def _convert_samples_to_float32(samples): float32_samples = samples.astype('float32') if samples.dtype in np.sctypes['int']: bits = np.iinfo(samples.dtype).bits - float32_samples *= (1. / 2 ** (bits - 1)) + float32_samples *= 1.0 / 2 ** (bits - 1) elif samples.dtype in np.sctypes['float']: pass else: @@ -71,8 +74,9 @@ def _convert_samples_to_float32(samples): return float32_samples @classmethod - def from_file(cls, filename, target_sr=None, int_values=False, offset=0, - duration=0, trim=False): + def from_file( + cls, filename, target_sr=None, int_values=False, offset=0, duration=0, trim=False, + ): """ Load a file supported by librosa and return as an AudioSegment. :param filename: path of file to load @@ -96,11 +100,7 @@ def from_file(cls, filename, target_sr=None, int_values=False, offset=0, return cls(samples, sample_rate, target_sr=target_sr, trim=trim) @classmethod - def segment_from_file(cls, - filename, - target_sr=None, - n_segments=0, - trim=False): + def segment_from_file(cls, filename, target_sr=None, n_segments=0, trim=False): """Grabs n_segments number of samples from filename randomly from the file as opposed to at a specified offset. """ @@ -139,7 +139,7 @@ def rms_db(self): return 10 * np.log10(mean_square) def gain_db(self, gain): - self._samples *= 10. ** (gain / 20.) + self._samples *= 10.0 ** (gain / 20.0) def pad(self, pad_size, symmetric=False): """Add zero padding to the sample. The pad size is given in number @@ -148,9 +148,7 @@ def pad(self, pad_size, symmetric=False): `pad_size` zeros will be added only to the end. """ - self._samples = np.pad(self._samples, - (pad_size if symmetric else 0, pad_size), - mode='constant') + self._samples = np.pad(self._samples, (pad_size if symmetric else 0, pad_size), mode='constant',) def subsegment(self, start_time=None, end_time=None): """Cut the AudioSegment between given boundaries. @@ -170,19 +168,15 @@ def subsegment(self, start_time=None, end_time=None): if end_time < 0.0: end_time = self.duration + end_time if start_time < 0.0: - raise ValueError("The slice start position (%f s) is out of " - "bounds." % start_time) + raise ValueError("The slice start position (%f s) is out of " "bounds." % start_time) if end_time < 0.0: - raise ValueError( - "The slice end position (%f s) is out of bounds." % - end_time) + raise ValueError("The slice end position (%f s) is out of bounds." % end_time) if start_time > end_time: - raise ValueError("The slice start position (%f s) is later than " - "the end position (%f s)." % ( - start_time, end_time)) + raise ValueError( + "The slice start position (%f s) is later than " "the end position (%f s)." 
% (start_time, end_time)
+        )
         if end_time > self.duration:
-            raise ValueError("The slice end position (%f s) is out of bounds "
-                             "(> %f s)" % (end_time, self.duration))
+            raise ValueError("The slice end position (%f s) is out of bounds " "(> %f s)" % (end_time, self.duration))
         start_sample = int(round(start_time * self._sample_rate))
         end_sample = int(round(end_time * self._sample_rate))
         self._samples = self._samples[start_sample:end_sample]
diff --git a/nemo/collections/asr/parts/spectr_augment.py b/nemo/collections/asr/parts/spectr_augment.py
index 6f079137ff2a..ff733cc2f352 100755
--- a/nemo/collections/asr/parts/spectr_augment.py
+++ b/nemo/collections/asr/parts/spectr_augment.py
@@ -16,13 +16,9 @@ class SpecAugment(nn.Module):
     freq_width - maximum number of frequencies to be cut in one segment
     time_width - maximum number of time steps to be cut in one segment
     """
+
     def __init__(
-        self,
-        freq_masks=0,
-        time_masks=0,
-        freq_width=10,
-        time_width=10,
-        rng=None
+        self, freq_masks=0, time_masks=0, freq_width=10, time_width=10, rng=None,
     ):
         super(SpecAugment, self).__init__()
@@ -42,21 +38,18 @@ def forward(self, x):
         for idx in range(sh[0]):
             for i in range(self.freq_masks):
-                x_left = int(self._rng.uniform(
-                    0, sh[1] - self.freq_width))
+                x_left = int(self._rng.uniform(0, sh[1] - self.freq_width))
                 w = int(self._rng.uniform(0, self.freq_width))
-                mask[idx, x_left:x_left + w, :] = 1
+                mask[idx, x_left : x_left + w, :] = 1
             for i in range(self.time_masks):
-                y_left = int(self._rng.uniform(
-                    0, sh[2] - self.time_width))
+                y_left = int(self._rng.uniform(0, sh[2] - self.time_width))
                 w = int(self._rng.uniform(0, self.time_width))
-                mask[idx, :,
-                     y_left:y_left + w] = 1
+                mask[idx, :, y_left : y_left + w] = 1
         x = x.masked_fill(mask.type(torch.bool).to(device=x.device), 0)
@@ -73,13 +66,8 @@ class SpecCutout(nn.Module):
     rect_freq - maximum size of cut rectangles along the frequency dimension
     rect_time - maximum size of cut rectangles along the time dimension
     """
-    def __init__(
-        self,
-        rect_masks=0,
-        rect_time=5,
-        rect_freq=20,
-        rng=None
-    ):
+
+    def __init__(self, rect_masks=0, rect_time=5, rect_freq=20, rng=None):
         super(SpecCutout, self).__init__()
         self._rng = random.Random() if rng is None else rng
@@ -96,16 +84,13 @@ def forward(self, x):
         for idx in range(sh[0]):
             for i in range(self.rect_masks):
-                rect_x = int(self._rng.uniform(
-                    0, sh[1] - self.rect_freq))
-                rect_y = int(self._rng.uniform(
-                    0, sh[2] - self.rect_time))
+                rect_x = int(self._rng.uniform(0, sh[1] - self.rect_freq))
+                rect_y = int(self._rng.uniform(0, sh[2] - self.rect_time))
                 w_x = int(self._rng.uniform(0, self.rect_time))
                 w_y = int(self._rng.uniform(0, self.rect_freq))
-                mask[idx, rect_x:rect_x + w_x,
-                     rect_y:rect_y + w_y] = 1
+                mask[idx, rect_x : rect_x + w_x, rect_y : rect_y + w_y] = 1
         x = x.masked_fill(mask.type(torch.bool).to(device=x.device), 0)
diff --git a/nemo/collections/nlp/__init__.py b/nemo/collections/nlp/__init__.py
index ceadeb76ce19..a11dc8708e2a 100644
--- a/nemo/collections/nlp/__init__.py
+++ b/nemo/collections/nlp/__init__.py
@@ -13,13 +13,11 @@
 # limitations under the License.
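
A minimal usage sketch of the SpecAugment and SpecCutout modules reformatted above, handy for sanity-checking the new slice syntax. The (batch, freq, time) input layout is inferred from the sh[1]/sh[2] indexing in forward(), and the import path is assumed from the file being patched; neither is asserted by the diff itself.

import torch

from nemo.collections.asr.parts.spectr_augment import SpecAugment, SpecCutout

# Assumed layout: (batch, freq_bins, time_steps).
spec = torch.randn(4, 64, 200)

# Zero out up to 2 frequency bands and 2 time segments per example...
augment = SpecAugment(freq_masks=2, time_masks=2, freq_width=10, time_width=10)
# ...plus up to 5 random time-frequency rectangles.
cutout = SpecCutout(rect_masks=5, rect_time=5, rect_freq=20)

masked = cutout(augment(spec))  # same shape; masked regions become zeros
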
# ============================================================================= +import nemo + from .data import * from .huggingface import * from .modules import * from .transformer import * - -import nemo - - backend = nemo.core.Backend.PyTorch diff --git a/nemo/collections/nlp/data/data_layers.py b/nemo/collections/nlp/data/data_layers.py index 4f9bd15df33c..4a656cd2b89e 100644 --- a/nemo/collections/nlp/data/data_layers.py +++ b/nemo/collections/nlp/data/data_layers.py @@ -4,21 +4,23 @@ # __all__ so that it can be imported with 'from text_data_layers import *' -__all__ = ['GlueDataLayerClassification', - 'GlueDataLayerRegression', - 'BertJointIntentSlotDataLayer', - 'BertJointIntentSlotInferDataLayer', - 'BertPunctuationCapitalizationDataLayer', - 'BertPunctuationCapitalizationInferDataLayer', - 'BertPretrainingDataLayer', - 'BertPretrainingPreprocessedDataLayer', - 'BertSentenceClassificationDataLayer', - 'BertTokenClassificationDataLayer', - 'BertTokenClassificationInferDataLayer', - 'BertQuestionAnsweringDataLayer', - 'LanguageModelingDataLayer', - 'TextDataLayer', - 'TranslationDataLayer'] +__all__ = [ + 'GlueDataLayerClassification', + 'GlueDataLayerRegression', + 'BertJointIntentSlotDataLayer', + 'BertJointIntentSlotInferDataLayer', + 'BertPunctuationCapitalizationDataLayer', + 'BertPunctuationCapitalizationInferDataLayer', + 'BertPretrainingDataLayer', + 'BertPretrainingPreprocessedDataLayer', + 'BertSentenceClassificationDataLayer', + 'BertTokenClassificationDataLayer', + 'BertTokenClassificationInferDataLayer', + 'BertQuestionAnsweringDataLayer', + 'LanguageModelingDataLayer', + 'TextDataLayer', + 'TranslationDataLayer', +] import os import random import sys @@ -28,11 +30,12 @@ import torch from torch.utils import data as pt_data -from .datasets import * import nemo from nemo.backends.pytorch.nm import DataLayerNM from nemo.core.neural_types import * +from .datasets import * + class TextDataLayer(DataLayerNM): """ @@ -97,38 +100,31 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "labels": NeuralType({ - 0: AxisType(BatchTag), - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag),}), } - def __init__(self, - input_file, - tokenizer, - max_seq_length, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=BertSentenceClassificationDataset, - **kwargs): + def __init__( + self, + input_file, + tokenizer, + max_seq_length, + num_samples=-1, + shuffle=False, + batch_size=64, + dataset_type=BertSentenceClassificationDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'input_file': input_file, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'num_samples': num_samples, - 'shuffle': shuffle} + dataset_params = { + 'input_file': input_file, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'num_samples': num_samples, + 'shuffle': shuffle, + } super().__init__(dataset_type, dataset_params, **kwargs) @@ -189,58 +185,42 @@ def output_ports(self): 1: AxisType(TimeTag) """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), 
- "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "subtokens_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "intents": NeuralType({ - 0: AxisType(BatchTag), - }), - "slots": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "intents": NeuralType({0: AxisType(BatchTag),}), + "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - input_file, - slot_file, - pad_label, - tokenizer, - max_seq_length, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - dataset_type=BertJointIntentSlotDataset, - **kwargs): + def __init__( + self, + input_file, + slot_file, + pad_label, + tokenizer, + max_seq_length, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + dataset_type=BertJointIntentSlotDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'input_file': input_file, - 'slot_file': slot_file, - 'pad_label': pad_label, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end} + dataset_params = { + 'input_file': input_file, + 'slot_file': slot_file, + 'pad_label': pad_label, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + } super().__init__(dataset_type, dataset_params, **kwargs) @@ -294,39 +274,22 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "subtokens_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - queries, - tokenizer, - max_seq_length, - batch_size=1, - dataset_type=BertJointIntentSlotInferDataset, - **kwargs): + def __init__( + self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset, **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length} + dataset_params = { + 'queries': queries, + 'tokenizer': tokenizer, + 'max_seq_length': 
max_seq_length, + } super().__init__(dataset_type, dataset_params, **kwargs) @@ -364,39 +327,24 @@ def output_ports(self): 1: AxisType(TimeTag) """ return { - "input_ids": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "labels": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - dataset, - tokenizer, - max_seq_length, - batch_step=128, - dataset_type=LanguageModelingDataset, - **kwargs): - dataset_params = {'dataset': dataset, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'batch_step': batch_step} + def __init__( + self, dataset, tokenizer, max_seq_length, batch_step=128, dataset_type=LanguageModelingDataset, **kwargs + ): + dataset_params = { + 'dataset': dataset, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'batch_step': batch_step, + } super().__init__(dataset_type, dataset_params, **kwargs) class BertTokenClassificationDataLayer(TextDataLayer): - @property def output_ports(self): """Returns definitions of module output ports. @@ -432,65 +380,50 @@ def output_ports(self): 1: AxisType(TimeTag) """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "subtokens_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "labels": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - text_file, - label_file, - tokenizer, - max_seq_length, - pad_label='O', - label_ids=None, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False, - dataset_type=BertTokenClassificationDataset, - **kwargs): + def __init__( + self, + text_file, + label_file, + tokenizer, + max_seq_length, + pad_label='O', + label_ids=None, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + dataset_type=BertTokenClassificationDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'text_file': text_file, - 'label_file': label_file, - 'max_seq_length': max_seq_length, - 'tokenizer': tokenizer, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'pad_label': pad_label, - 'label_ids': label_ids, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - 'use_cache': use_cache} + dataset_params = { + 'text_file': text_file, + 'label_file': label_file, + 'max_seq_length': max_seq_length, + 'tokenizer': tokenizer, + 'num_samples': num_samples, + 'shuffle': 
shuffle, + 'pad_label': pad_label, + 'label_ids': label_ids, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + 'use_cache': use_cache, + } super().__init__(dataset_type, dataset_params, **kwargs) class BertTokenClassificationInferDataLayer(TextDataLayer): - @property def output_ports(self): """Returns definitions of module output ports. @@ -522,44 +455,32 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "subtokens_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - queries, - tokenizer, - max_seq_length, - batch_size=1, - dataset_type=BertTokenClassificationInferDataset, - **kwargs): + def __init__( + self, + queries, + tokenizer, + max_seq_length, + batch_size=1, + dataset_type=BertTokenClassificationInferDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length} + dataset_params = { + 'queries': queries, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } super().__init__(dataset_type, dataset_params, **kwargs) class BertPunctuationCapitalizationDataLayer(TextDataLayer): - @property def output_ports(self): """Returns definitions of module output ports. 
@@ -601,71 +522,53 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "subtokens_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "punct_labels": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "capit_labels": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - text_file, - label_file, - tokenizer, - max_seq_length, - pad_label='O', - punct_label_ids=None, - capit_label_ids=None, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False, - dataset_type=BertPunctuationCapitalizationDataset, - **kwargs): + def __init__( + self, + text_file, + label_file, + tokenizer, + max_seq_length, + pad_label='O', + punct_label_ids=None, + capit_label_ids=None, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + dataset_type=BertPunctuationCapitalizationDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'text_file': text_file, - 'label_file': label_file, - 'max_seq_length': max_seq_length, - 'tokenizer': tokenizer, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'pad_label': pad_label, - 'punct_label_ids': punct_label_ids, - 'capit_label_ids': capit_label_ids, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - 'use_cache': use_cache} + dataset_params = { + 'text_file': text_file, + 'label_file': label_file, + 'max_seq_length': max_seq_length, + 'tokenizer': tokenizer, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'pad_label': pad_label, + 'punct_label_ids': punct_label_ids, + 'capit_label_ids': capit_label_ids, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + 'use_cache': use_cache, + } super().__init__(dataset_type, dataset_params, **kwargs) class BertPunctuationCapitalizationInferDataLayer(TextDataLayer): - @property def output_ports(self): """Returns definitions of module output ports. 
@@ -697,39 +600,28 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "subtokens_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } - def __init__(self, - queries, - tokenizer, - max_seq_length, - batch_size=1, - dataset_type=BertTokenClassificationInferDataset, - **kwargs): + def __init__( + self, + queries, + tokenizer, + max_seq_length, + batch_size=1, + dataset_type=BertTokenClassificationInferDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length} + dataset_params = { + 'queries': queries, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } super().__init__(dataset_type, dataset_params, **kwargs) @@ -787,47 +679,37 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "start_positions": NeuralType({ - 0: AxisType(BatchTag) - }), - "end_positions": NeuralType({ - 0: AxisType(BatchTag) - }), - "unique_ids": NeuralType({ - 0: AxisType(BatchTag)}) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "start_positions": NeuralType({0: AxisType(BatchTag)}), + "end_positions": NeuralType({0: AxisType(BatchTag)}), + "unique_ids": NeuralType({0: AxisType(BatchTag)}), } - def __init__(self, - data_dir, - tokenizer, - version_2_with_negative, - doc_stride, - max_query_length, - max_seq_length, - mode="train", - batch_size=64, - dataset_type=SquadDataset, - **kwargs): + def __init__( + self, + data_dir, + tokenizer, + version_2_with_negative, + doc_stride, + max_query_length, + max_seq_length, + mode="train", + batch_size=64, + dataset_type=SquadDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'data_dir': data_dir, - 'mode': mode, - 'tokenizer': tokenizer, - 'version_2_with_negative': version_2_with_negative, - 'max_query_length': max_query_length, - 'max_seq_length': max_seq_length, - 'doc_stride': doc_stride} + dataset_params = { + 'data_dir': data_dir, + 'mode': mode, + 'tokenizer': tokenizer, + 'version_2_with_negative': version_2_with_negative, + 'max_query_length': max_query_length, + 'max_seq_length': max_seq_length, + 'doc_stride': doc_stride, + } super().__init__(dataset_type, dataset_params, **kwargs) @@ -883,43 +765,25 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: 
AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "output_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "output_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "labels": NeuralType({0: AxisType(BatchTag)}), } - def __init__(self, - tokenizer, - dataset, - max_seq_length, - mask_probability, - short_seq_prob=0.1, - batch_size=64, - **kwargs): + def __init__( + self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64, **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'tokenizer': tokenizer, - 'dataset': dataset, - 'max_seq_length': max_seq_length, - 'mask_probability': mask_probability, - 'short_seq_prob': short_seq_prob} + dataset_params = { + 'tokenizer': tokenizer, + 'dataset': dataset, + 'max_seq_length': max_seq_length, + 'mask_probability': mask_probability, + 'short_seq_prob': short_seq_prob, + } super().__init__(BertPretrainingDataset, dataset_params, **kwargs) @@ -974,40 +838,20 @@ def output_ports(self): """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "output_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "output_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "labels": NeuralType({0: AxisType(BatchTag)}), } - def __init__(self, - dataset, - max_pred_length, - batch_size=64, - training=True, - **kwargs): + def __init__(self, dataset, max_pred_length, batch_size=64, training=True, **kwargs): if os.path.isdir(dataset): - self.files = [os.path.join(dataset, f) - for f in os.listdir(dataset) - if os.path.isfile(os.path.join(dataset, f))] + self.files = [ + os.path.join(dataset, f) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f)) + ] else: self.files = [dataset] self.files.sort() @@ -1030,8 +874,7 @@ def _collate_fn(self, x): for i in range(batch_size): for j in range(num_components): components[j].append(x[i][j]) - src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids = \ - [np.stack(x, axis=0) for x in components] + src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids = [np.stack(x, axis=0) for x in components] src_ids = torch.Tensor(src_ids).long().to(self._device) src_segment_ids = torch.Tensor(src_segment_ids).long().to(self._device) src_mask = torch.Tensor(src_mask).long().to(self._device) @@ -1055,15 +898,16 @@ def data_iterator(self): for f_id in range(self.num_files): data_file = self.files[f_id] train_data = BertPretrainingPreprocessedDataset( - 
input_file=data_file, - max_pred_length=self.max_pred_length) + input_file=data_file, max_pred_length=self.max_pred_length + ) train_sampler = pt_data.RandomSampler(train_data) train_dataloader = pt_data.DataLoader( dataset=train_data, batch_size=self.batch_size, collate_fn=self._collate_fn, shuffle=train_sampler is None, - sampler=train_sampler) + sampler=train_sampler, + ) for x in train_dataloader: yield x @@ -1123,46 +967,33 @@ def output_ports(self): """ return { - "src_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "src_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "tgt_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "tgt_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "labels": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "sent_ids": NeuralType({ - 0: AxisType(BatchTag) - }) + "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "sent_ids": NeuralType({0: AxisType(BatchTag)}), } - def __init__(self, - tokenizer_src, - tokenizer_tgt, - dataset_src, - dataset_tgt, - tokens_in_batch=1024, - clean=False, - dataset_type=TranslationDataset, - **kwargs): - dataset_params = {'tokenizer_src': tokenizer_src, - 'tokenizer_tgt': tokenizer_tgt, - 'dataset_src': dataset_src, - 'dataset_tgt': dataset_tgt, - 'tokens_in_batch': tokens_in_batch, - 'clean': clean} + def __init__( + self, + tokenizer_src, + tokenizer_tgt, + dataset_src, + dataset_tgt, + tokens_in_batch=1024, + clean=False, + dataset_type=TranslationDataset, + **kwargs + ): + dataset_params = { + 'tokenizer_src': tokenizer_src, + 'tokenizer_tgt': tokenizer_tgt, + 'dataset_src': dataset_src, + 'dataset_tgt': dataset_tgt, + 'tokens_in_batch': tokens_in_batch, + 'clean': clean, + } super().__init__(dataset_type, dataset_params, **kwargs) if self._placement == nemo.core.DeviceType.AllGpu: @@ -1170,11 +1001,9 @@ def __init__(self, else: sampler = None - self._dataloader = pt_data.DataLoader(dataset=self._dataset, - batch_size=1, - collate_fn=self._collate_fn, - shuffle=sampler is None, - sampler=sampler) + self._dataloader = pt_data.DataLoader( + dataset=self._dataset, batch_size=1, collate_fn=self._collate_fn, shuffle=sampler is None, sampler=sampler, + ) def _collate_fn(self, x): src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = x[0] @@ -1230,44 +1059,37 @@ def output_ports(self): 0: AxisType(CategoricalTag) """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "labels": NeuralType({ - 0: AxisType(CategoricalTag), - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(CategoricalTag),}), } - def __init__(self, - data_dir, - tokenizer, - max_seq_length, - processor, - evaluate=False, - token_params={}, - num_samples=-1, - shuffle=False, - batch_size=64, - 
dataset_type=GLUEDataset, - **kwargs): + def __init__( + self, + data_dir, + tokenizer, + max_seq_length, + processor, + evaluate=False, + token_params={}, + num_samples=-1, + shuffle=False, + batch_size=64, + dataset_type=GLUEDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'data_dir': data_dir, - 'output_mode': 'classification', - 'processor': processor, - 'evaluate': evaluate, - 'token_params': token_params, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length} + dataset_params = { + 'data_dir': data_dir, + 'output_mode': 'classification', + 'processor': processor, + 'evaluate': evaluate, + 'token_params': token_params, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } super().__init__(dataset_type, dataset_params, **kwargs) @@ -1307,43 +1129,36 @@ def output_ports(self): 0: AxisType(RegressionTag) """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "labels": NeuralType({ - 0: AxisType(RegressionTag), - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(RegressionTag),}), } - def __init__(self, - data_dir, - tokenizer, - max_seq_length, - processor, - evaluate=False, - token_params={}, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=GLUEDataset, - **kwargs): + def __init__( + self, + data_dir, + tokenizer, + max_seq_length, + processor, + evaluate=False, + token_params={}, + num_samples=-1, + shuffle=False, + batch_size=64, + dataset_type=GLUEDataset, + **kwargs + ): kwargs['batch_size'] = batch_size - dataset_params = {'data_dir': data_dir, - 'output_mode': 'regression', - 'processor': processor, - 'evaluate': evaluate, - 'token_params': token_params, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length} + dataset_params = { + 'data_dir': data_dir, + 'output_mode': 'regression', + 'processor': processor, + 'evaluate': evaluate, + 'token_params': token_params, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } super().__init__(dataset_type, dataset_params, **kwargs) diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py index b4f59ccfc39d..3244c1266b19 100644 --- a/nemo/collections/nlp/data/datasets/__init__.py +++ b/nemo/collections/nlp/data/datasets/__init__.py @@ -1,14 +1,9 @@ -from .bert_pretraining import (BertPretrainingDataset, - BertPretrainingPreprocessedDataset) +from .bert_pretraining import BertPretrainingDataset, BertPretrainingPreprocessedDataset from .glue import GLUEDataset -from .joint_intent_slot import (BertJointIntentSlotDataset, - BertJointIntentSlotInferDataset) +from .joint_intent_slot import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset from .language_modeling import LanguageModelingDataset -from .punctuation_capitalization import ( - BertPunctuationCapitalizationDataset, - BertPunctuationCapitalizationInferDataset) +from .punctuation_capitalization import BertPunctuationCapitalizationDataset, BertPunctuationCapitalizationInferDataset from .sentence_classification import BertSentenceClassificationDataset from .squad import SquadDataset -from .token_classification import 
(BertTokenClassificationDataset,
-                                   BertTokenClassificationInferDataset)
+from .token_classification import BertTokenClassificationDataset, BertTokenClassificationInferDataset
 from .translation import TranslationDataset
diff --git a/nemo/collections/nlp/data/datasets/bert_pretraining.py b/nemo/collections/nlp/data/datasets/bert_pretraining.py
index 3cf8b8fed5f5..c32ca7271654 100644
--- a/nemo/collections/nlp/data/datasets/bert_pretraining.py
+++ b/nemo/collections/nlp/data/datasets/bert_pretraining.py
@@ -20,6 +20,7 @@
 import os
 import pickle
 import random
+
 import h5py
 import numpy as np
 from torch.utils.data import Dataset
@@ -27,14 +28,16 @@ class BertPretrainingDataset(Dataset):
-    def __init__(self,
-                 tokenizer,
-                 dataset,
-                 max_seq_length=128,
-                 mask_probability=0.15,
-                 short_seq_prob=0.1,
-                 seq_a_ratio=0.6,
-                 sentence_idx_file=None):
+    def __init__(
+        self,
+        tokenizer,
+        dataset,
+        max_seq_length=128,
+        mask_probability=0.15,
+        short_seq_prob=0.1,
+        seq_a_ratio=0.6,
+        sentence_idx_file=None,
+    ):
         self.tokenizer = tokenizer
         self.cls_id = tokenizer.token_to_id("[CLS]")
         self.sep_id = tokenizer.token_to_id("[SEP]")
@@ -47,8 +50,8 @@ def __init__(self,
         # from main memory when needed during training.
         if sentence_idx_file is None:
-            data_dir = dataset[:dataset.rfind('/')]
-            mode = dataset[dataset.rfind('/') + 1:dataset.rfind('.')]
+            data_dir = dataset[: dataset.rfind('/')]
+            mode = dataset[dataset.rfind('/') + 1 : dataset.rfind('.')]
             sentence_idx_file = f"{data_dir}/{mode}_sentence_indices.pkl"
         if os.path.isfile(sentence_idx_file):
@@ -67,10 +70,12 @@ def find_newlines(contents):
                 try:
                     # index and split are much faster than Python for loops
                     new_start = contents.index(b"\n", start)
-                    line = contents[start:new_start] \
-                        .replace(b"\xc2\x99", b" ") \
-                        .replace(b"\xc2\xa0", b" ") \
+                    line = (
+                        contents[start:new_start]
+                        .replace(b"\xc2\x99", b" ")
+                        .replace(b"\xc2\xa0", b" ")
                         .decode("utf-8", errors="ignore")
+                    )
                     if len(line.split()) > 0:
                         yield start
@@ -157,8 +162,7 @@ def get_document(filepath, offset):
             return document
-        def match_target_seq_length(document, target_seq_length, filename,
-                                    line_idx, sentence_indices):
+        def match_target_seq_length(document, target_seq_length, filename, line_idx, sentence_indices):
             # If document is shorter than target sequence length,
             # append the next line or take a random line as replacement.
             num_lines = len(sentence_indices[filename])
@@ -183,11 +187,10 @@ def match_target_seq_length(document, target_seq_length, filename,
         a_line_offset = self.sentence_indices[a_filename][a_line_idx]
         a_document = get_document(a_filename, a_line_offset)
         a_document, a_line_idx = match_target_seq_length(
-            a_document, target_seq_length_a, a_filename, a_line_idx,
-            self.sentence_indices)
+            a_document, target_seq_length_a, a_filename, a_line_idx, self.sentence_indices,
+        )
-        is_last_line = \
-            a_line_idx >= (len(self.sentence_indices[a_filename]) - 1)
+        is_last_line = a_line_idx >= (len(self.sentence_indices[a_filename]) - 1)
         # About 50% of the time, B is a random sentence from the corpus
         take_random_b = (random.random() < 0.5) or is_last_line
@@ -198,8 +201,7 @@ def match_target_seq_length(document, target_seq_length, filename,
             # we're processing.
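
The hunks above center on BertPretrainingDataset's sentence index: a per-file list of byte offsets of non-empty lines (built by find_newlines and cached via pickle), which lets the dataset fetch individual sentences with seek() instead of holding the whole corpus in memory. Below is a hedged sketch of that random-access pattern; read_sentence is a hypothetical helper (the dataset's own get_document additionally tokenizes the line), and the pickle filename is illustrative only.

import pickle
import random

def read_sentence(filename, offset):
    # Jump straight to a recorded sentence start and read one line.
    with open(filename, "rb") as f:
        f.seek(offset)
        return f.readline().decode("utf-8", errors="ignore").strip()

# sentence_indices maps filename -> list of byte offsets, cf. sentence_idx_file above.
with open("train_sentence_indices.pkl", "rb") as f:
    sentence_indices = pickle.load(f)

filename = random.choice(list(sentence_indices))
print(read_sentence(filename, random.choice(sentence_indices[filename])))
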
            for _ in range(10):
                b_filename = random.choice(self.filenames)
-                b_line_idx = random.choice(
-                    range(len(self.sentence_indices[b_filename])))
+                b_line_idx = random.choice(range(len(self.sentence_indices[b_filename])))
                if b_filename != a_filename:
                    break
                else:
@@ -207,7 +209,7 @@ def match_target_seq_length(document, target_seq_length, filename,
                    b_line_pos = self.sentence_indices[b_filename][b_line_idx]
                    a_line_pos = self.sentence_indices[a_filename][a_line_idx]
                    # TODO unclear about the following check
-                    if (abs(b_line_pos - a_line_pos) > max_num_tokens):
+                    if abs(b_line_pos - a_line_pos) > max_num_tokens:
                        break
                    else:
                        pass
@@ -219,8 +221,8 @@ def match_target_seq_length(document, target_seq_length, filename,
            b_line_pos = self.sentence_indices[b_filename][b_line_idx]
            b_document = get_document(b_filename, b_line_pos)
            b_document, b_line_idx = match_target_seq_length(
-                b_document, target_seq_length_b, b_filename, b_line_idx,
-                self.sentence_indices)
+                b_document, target_seq_length_b, b_filename, b_line_idx, self.sentence_indices,
+            )
        def truncate_seq_pair(a, b, max_num_tokens):
            # Truncates a pair of sequences to a maximum sequence length
@@ -232,9 +234,11 @@ def truncate_seq_pair(a, b, max_num_tokens):
                    trunc_document = b
                if len(trunc_document) <= 1:
-                    raise ValueError("Input text corpora probably too small. "
-                                     "Failed to truncate sequence pair to "
-                                     "maximum sequence legnth.")
+                    raise ValueError(
+                        "Input text corpora probably too small. "
+                        "Failed to truncate sequence pair to "
+                        "maximum sequence length."
+                    )
                # Randomly truncate from the front or the back
                if random.random() < 0.5:
@@ -244,16 +248,15 @@ def truncate_seq_pair(a, b, max_num_tokens):
        truncate_seq_pair(a_document, b_document, max_num_tokens)
-        output_ids = [self.cls_id] + a_document + \
-            [self.sep_id] + b_document + [self.sep_id]
+        output_ids = [self.cls_id] + a_document + [self.sep_id] + b_document + [self.sep_id]
        input_ids, output_mask = self.mask_ids(output_ids)
        input_mask = np.zeros(self.max_seq_length, dtype=np.long)
-        input_mask[:len(input_ids)] = 1
+        input_mask[: len(input_ids)] = 1
        input_type_ids = np.zeros(self.max_seq_length, dtype=np.int)
-        input_type_ids[len(a_document) + 2:len(output_ids) + 1] = 1
+        input_type_ids[len(a_document) + 2 : len(output_ids) + 1] = 1
        padding_length = max(0, self.max_seq_length - len(input_ids))
        if padding_length > 0:
@@ -262,9 +265,14 @@ def truncate_seq_pair(a, b, max_num_tokens):
            input_ids.extend([0] * padding_length)
            output_ids.extend([0] * padding_length)
            output_mask.extend([0] * padding_length)
        # TODO: wrap the return value with () for consistent style.
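
Since the hunk above collapses the [CLS]/[SEP] pair assembly onto one line, here is a toy illustration of what output_ids holds before masking and padding; all ids below are invented, and the real values come from the tokenizer.

cls_id, sep_id = 2, 3        # invented special-token ids
a_document = [10, 11, 12]    # sentence-A subtoken ids (toy values)
b_document = [20, 21]        # sentence-B subtoken ids (toy values)

output_ids = [cls_id] + a_document + [sep_id] + b_document + [sep_id]
assert output_ids == [2, 10, 11, 12, 3, 20, 21, 3]
# input_mask flags these 8 real positions as 1, and input_type_ids switches from
# segment 0 to segment 1 immediately after the first [SEP], which is where the
# slice len(a_document) + 2 : len(output_ids) + 1 in the hunk above begins.
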
- return np.array(input_ids), input_type_ids,\ - np.array(input_mask, dtype=np.long), np.array(output_ids),\ - np.array(output_mask, dtype=np.float32), is_next + return ( + np.array(input_ids), + input_type_ids, + np.array(input_mask, dtype=np.long), + np.array(output_ids), + np.array(output_mask, dtype=np.float32), + is_next, + ) def mask_ids(self, ids): """ @@ -296,8 +304,7 @@ def mask_ids(self, ids): mask_id = self.tokenizer.token_to_id("[MASK]") for word_ids in cand_indexes: - is_special = (word_ids[0] == self.cls_id) or \ - (word_ids[0] == self.sep_id) + is_special = (word_ids[0] == self.cls_id) or (word_ids[0] == self.sep_id) if is_special or (random.random() > self.mask_probability): output_mask.extend([0] * len(word_ids)) masked_ids.extend(word_ids) @@ -323,15 +330,18 @@ def mask_ids(self, ids): class BertPretrainingPreprocessedDataset(Dataset): - def __init__(self, input_file, max_pred_length): self.input_file = input_file self.max_pred_length = max_pred_length f = h5py.File(input_file, "r") - keys = ['input_ids', 'input_mask', - 'segment_ids', 'masked_lm_positions', - 'masked_lm_ids', - 'next_sentence_labels'] + keys = [ + 'input_ids', + 'input_mask', + 'segment_ids', + 'masked_lm_positions', + 'masked_lm_ids', + 'next_sentence_labels', + ] self.inputs = [np.asarray(f[key][:]) for key in keys] f.close() @@ -341,11 +351,9 @@ def __len__(self): def __getitem__(self, index): - [input_ids, input_mask, segment_ids, - masked_lm_positions, masked_lm_ids, - next_sentence_labels] = \ - [input[index].astype(np.int64) - for input in self.inputs] + [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels,] = [ + input[index].astype(np.int64) for input in self.inputs + ] output_mask = np.zeros_like(input_ids) output_ids = input_ids.copy() @@ -355,10 +363,16 @@ def __getitem__(self, index): if len(padded_mask_indices[0]) != 0: index = padded_mask_indices[0][0] - output_mask[masked_lm_positions[:index]] = 1. 
+ output_mask[masked_lm_positions[:index]] = 1.0 output_ids[masked_lm_positions[:index]] = masked_lm_ids[:index] input_mask = np.asarray(input_mask, dtype=np.float32) output_mask = np.asarray(output_mask, dtype=np.float32) - return input_ids, segment_ids, input_mask,\ - output_ids, output_mask, next_sentence_labels + return ( + input_ids, + segment_ids, + input_mask, + output_ids, + output_mask, + next_sentence_labels, + ) diff --git a/nemo/collections/nlp/data/datasets/glue.py b/nemo/collections/nlp/data/datasets/glue.py index fa70f776b184..8893c5747c45 100644 --- a/nemo/collections/nlp/data/datasets/glue.py +++ b/nemo/collections/nlp/data/datasets/glue.py @@ -20,59 +20,55 @@ https://github.com/huggingface/transformers """ -import nemo import numpy as np from torch.utils.data import Dataset +import nemo + class GLUEDataset(Dataset): - def __init__(self, - data_dir, - tokenizer, - max_seq_length, - processor, - output_mode, - evaluate, - token_params): + def __init__( + self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params, + ): self.tokenizer = tokenizer self.label_list = processor.get_labels() - self.examples = processor.get_dev_examples(data_dir) if evaluate \ - else processor.get_train_examples(data_dir) - self.features = convert_examples_to_features(self.examples, - self.label_list, - max_seq_length, - tokenizer, - output_mode, - **token_params) + self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = convert_examples_to_features( + self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params + ) def __len__(self): return len(self.features) def __getitem__(self, idx): feature = self.features[idx] - return (np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id)) - - -def convert_examples_to_features(examples, - label_list, - max_seq_length, - tokenizer, - output_mode, - bos_token=None, - eos_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - sep_token_extra=None, - cls_token_at_end=False, - cls_token_segment_id=0, - pad_token_segment_id=0, - pad_on_left=False, - mask_padding_with_zero=True, - sequence_a_segment_id=0, - sequence_b_segment_id=1): + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + np.array(feature.label_id), + ) + + +def convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + output_mode, + bos_token=None, + eos_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + sep_token_extra=None, + cls_token_at_end=False, + cls_token_segment_id=0, + pad_token_segment_id=0, + pad_on_left=False, + mask_padding_with_zero=True, + sequence_a_segment_id=0, + sequence_b_segment_id=1, +): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] @@ -109,8 +105,7 @@ def convert_examples_to_features(examples, features = [] for ex_index, example in enumerate(examples): if ex_index % 10000 == 0: - nemo.logging.info( - "Writing example %d of %d" % (ex_index, len(examples))) + nemo.logging.info("Writing example %d of %d" % (ex_index, len(examples))) tokens_a = tokenizer.text_to_tokens(example.text_a) @@ -122,14 +117,13 @@ def convert_examples_to_features(examples, special_tokens_count += 1 if sep_token_extra else 0 special_tokens_count += 2 if 
bos_token else 0 special_tokens_count += 1 if cls_token else 0 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - - special_tokens_count) + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) else: special_tokens_count = 1 if eos_token else 0 special_tokens_count += 1 if sep_token_extra else 0 special_tokens_count += 1 if bos_token else 0 if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:max_seq_length - special_tokens_count] + tokens_a = tokens_a[: max_seq_length - special_tokens_count] # Add special tokens to sequence_a tokens = tokens_a if bos_token: @@ -173,16 +167,12 @@ def convert_examples_to_features(examples, pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] if pad_on_left: input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * - padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + \ - segment_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids else: input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + \ - ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + \ - ([pad_token_segment_id] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) if len(input_ids) != max_seq_length: raise ValueError("input_ids must be of length max_seq_length") if len(input_mask) != max_seq_length: @@ -199,22 +189,15 @@ def convert_examples_to_features(examples, if ex_index < 5: nemo.logging.info("*** Example ***") nemo.logging.info("guid: %s" % (example.guid)) - nemo.logging.info( - "tokens: %s" % " ".join(list(map(str, tokens)))) - nemo.logging.info( - "input_ids: %s" % " ".join(list(map(str, input_ids)))) - nemo.logging.info( - "input_mask: %s" % " ".join(list(map(str, input_mask)))) - nemo.logging.info( - "segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - nemo.logging.info( - "label: %s (id = %d)" % (example.label, label_id)) + nemo.logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) + nemo.logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) + nemo.logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) + nemo.logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) + nemo.logging.info("label: %s (id = %d)" % (example.label, label_id)) features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id)) + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id,) + ) return features diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot.py b/nemo/collections/nlp/data/datasets/joint_intent_slot.py index 0b028a0e20e7..f7778177c5f3 100644 --- a/nemo/collections/nlp/data/datasets/joint_intent_slot.py +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot.py @@ -28,13 +28,15 @@ from . 
import utils -def get_features(queries, - max_seq_length, - tokenizer, - pad_label=128, - raw_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False): +def get_features( + queries, + max_seq_length, + tokenizer, + pad_label=128, + raw_slots=None, + ignore_extra_tokens=False, + ignore_start_end=False, +): all_subtokens = [] all_loss_mask = [] all_subtokens_mask = [] @@ -61,8 +63,7 @@ def get_features(queries, subtokens.extend(word_tokens) loss_mask.append(1) - loss_mask.extend([not ignore_extra_tokens] * - (len(word_tokens) - 1)) + loss_mask.extend([not ignore_extra_tokens] * (len(word_tokens) - 1)) subtokens_mask.append(1) subtokens_mask.extend([0] * (len(word_tokens) - 1)) @@ -89,22 +90,19 @@ def get_features(queries, for i, subtokens in enumerate(all_subtokens): if len(subtokens) > max_seq_length: - subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1:] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1:] - all_loss_mask[i] = [1 - ignore_start_end] + \ - all_loss_mask[i][-max_seq_length + 1:] - all_subtokens_mask[i] = [0] + \ - all_subtokens_mask[i][-max_seq_length + 1:] + subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :] + all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] + all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] + all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1:] + all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] too_long_count += 1 - all_input_ids.append([tokenizer._convert_token_to_id(t) - for t in subtokens]) + all_input_ids.append([tokenizer._convert_token_to_id(t) for t in subtokens]) if len(subtokens) < max_seq_length: - extra = (max_seq_length - len(subtokens)) + extra = max_seq_length - len(subtokens) all_input_ids[i] = all_input_ids[i] + [0] * extra all_loss_mask[i] = all_loss_mask[i] + [0] * extra all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra @@ -117,12 +115,14 @@ def get_features(queries, nemo.logging.info(f'{too_long_count} are longer than {max_seq_length}') - return (all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_slots) + return ( + all_input_ids, + all_segment_ids, + all_input_mask, + all_loss_mask, + all_subtokens_mask, + all_slots, + ) class BertJointIntentSlotDataset(Dataset): @@ -152,17 +152,18 @@ class BertJointIntentSlotDataset(Dataset): """ - def __init__(self, - input_file, - slot_file, - max_seq_length, - tokenizer, - num_samples=-1, - shuffle=True, - pad_label=128, - ignore_extra_tokens=False, - ignore_start_end=False - ): + def __init__( + self, + input_file, + slot_file, + max_seq_length, + tokenizer, + num_samples=-1, + shuffle=True, + pad_label=128, + ignore_extra_tokens=False, + ignore_start_end=False, + ): if num_samples == 0: raise ValueError("num_samples has to be positive", num_samples) @@ -188,13 +189,15 @@ def __init__(self, raw_intents.append(int(parts[-1])) queries.append(' '.join(parts[:-1])) - features = get_features(queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end) + features = get_features( + queries, + max_seq_length, + tokenizer, + pad_label=pad_label, + raw_slots=raw_slots, + ignore_extra_tokens=ignore_extra_tokens, + ignore_start_end=ignore_start_end, + ) self.all_input_ids = features[0] self.all_segment_ids = features[1] self.all_input_mask = 
features[2] @@ -207,13 +210,15 @@ def __len__(self): return len(self.all_input_ids) def __getitem__(self, idx): - return (np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx])) + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + self.all_intents[idx], + np.array(self.all_slots[idx]), + ) class BertJointIntentSlotInferDataset(Dataset): @@ -237,14 +242,9 @@ class BertJointIntentSlotInferDataset(Dataset): """ - def __init__(self, - queries, - max_seq_length, - tokenizer): + def __init__(self, queries, max_seq_length, tokenizer): - features = get_features(queries, - max_seq_length, - tokenizer) + features = get_features(queries, max_seq_length, tokenizer) self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -256,8 +256,10 @@ def __len__(self): return len(self.all_input_ids) def __getitem__(self, idx): - return (np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx])) + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + ) diff --git a/nemo/collections/nlp/data/datasets/language_modeling.py b/nemo/collections/nlp/data/datasets/language_modeling.py index 551c791c3fb7..d8912da7f891 100644 --- a/nemo/collections/nlp/data/datasets/language_modeling.py +++ b/nemo/collections/nlp/data/datasets/language_modeling.py @@ -21,11 +21,7 @@ class LanguageModelingDataset(Dataset): - def __init__(self, - tokenizer, - dataset, - max_seq_length=512, - batch_step=None): + def __init__(self, tokenizer, dataset, max_seq_length=512, batch_step=None): self.tokenizer = tokenizer self.max_seq_length = max_seq_length self.batch_step = batch_step or self.max_seq_length @@ -39,6 +35,6 @@ def __getitem__(self, idx): left = idx * self.batch_step right = left + self.max_seq_length src_ids = self.ids[left:right] - labels = self.ids[left + 1:right + 1] + labels = self.ids[left + 1 : right + 1] src_mask = (src_ids != self.tokenizer.pad_id()).astype(np.float32) return src_ids, src_mask, labels diff --git a/nemo/collections/nlp/data/datasets/punctuation_capitalization.py b/nemo/collections/nlp/data/datasets/punctuation_capitalization.py index 78dd8e2e7e5f..3efbdb453277 100644 --- a/nemo/collections/nlp/data/datasets/punctuation_capitalization.py +++ b/nemo/collections/nlp/data/datasets/punctuation_capitalization.py @@ -24,23 +24,26 @@ import pickle import random -import nemo import numpy as np from torch.utils.data import Dataset +import nemo + from . 
import utils -def get_features(queries, - max_seq_length, - tokenizer, - punct_label_ids=None, - capit_label_ids=None, - pad_label='O', - punct_labels_lines=None, - capit_labels_lines=None, - ignore_extra_tokens=False, - ignore_start_end=False): +def get_features( + queries, + max_seq_length, + tokenizer, + punct_label_ids=None, + capit_label_ids=None, + pad_label='O', + punct_labels_lines=None, + capit_labels_lines=None, + ignore_extra_tokens=False, + ignore_start_end=False, +): """ Args: queries (list of str): text sequences @@ -85,20 +88,17 @@ def get_features(queries, if with_label: pad_id = punct_label_ids[pad_label] punct_labels = [pad_id] - punct_query_labels = \ - [punct_label_ids[lab] for lab in punct_labels_lines[i]] + punct_query_labels = [punct_label_ids[lab] for lab in punct_labels_lines[i]] capit_labels = [pad_id] - capit_query_labels = \ - [capit_label_ids[lab] for lab in capit_labels_lines[i]] + capit_query_labels = [capit_label_ids[lab] for lab in capit_labels_lines[i]] for j, word in enumerate(words): word_tokens = tokenizer.text_to_tokens(word) subtokens.extend(word_tokens) loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * - (len(word_tokens) - 1)) + loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) subtokens_mask.append(1) subtokens_mask.extend([0] * (len(word_tokens) - 1)) @@ -130,25 +130,20 @@ def get_features(queries, for i, subtokens in enumerate(all_subtokens): if len(subtokens) > max_seq_length: - subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1:] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1:] - all_loss_mask[i] = [int(not ignore_start_end)] + \ - all_loss_mask[i][-max_seq_length + 1:] - all_subtokens_mask[i] = [0] + \ - all_subtokens_mask[i][-max_seq_length + 1:] + subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :] + all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] + all_loss_mask[i] = [int(not ignore_start_end)] + all_loss_mask[i][-max_seq_length + 1 :] + all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] if with_label: - punct_all_labels[i] = \ - [pad_id] + punct_all_labels[i][-max_seq_length + 1:] - capit_all_labels[i] = \ - [pad_id] + capit_all_labels[i][-max_seq_length + 1:] + punct_all_labels[i] = [pad_id] + punct_all_labels[i][-max_seq_length + 1 :] + capit_all_labels[i] = [pad_id] + capit_all_labels[i][-max_seq_length + 1 :] too_long_count += 1 - all_input_ids.append([tokenizer.tokens_to_ids(t) - for t in subtokens]) + all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) if len(subtokens) < max_seq_length: - extra = (max_seq_length - len(subtokens)) + extra = max_seq_length - len(subtokens) all_input_ids[i] = all_input_ids[i] + [0] * extra all_loss_mask[i] = all_loss_mask[i] + [0] * extra all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra @@ -165,30 +160,25 @@ def get_features(queries, for i in range(min(len(all_input_ids), 5)): nemo.logging.info("*** Example ***") nemo.logging.info("i: %s" % (i)) - nemo.logging.info( - "subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - nemo.logging.info( - "loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - nemo.logging.info( - "input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - nemo.logging.info( - "subtokens_mask: %s" % " ".join(list(map( - str, all_subtokens_mask[i])))) + nemo.logging.info("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) + nemo.logging.info("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) + 
nemo.logging.info("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) + nemo.logging.info("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) if with_label: - nemo.logging.info("punct_labels: %s" % - " ".join(list(map(str, punct_all_labels[i])))) - nemo.logging.info("capit_labels: %s" % - " ".join(list(map(str, capit_all_labels[i])))) - - return (all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - punct_all_labels, - capit_all_labels, - punct_label_ids, - capit_label_ids) + nemo.logging.info("punct_labels: %s" % " ".join(list(map(str, punct_all_labels[i])))) + nemo.logging.info("capit_labels: %s" % " ".join(list(map(str, capit_all_labels[i])))) + + return ( + all_input_ids, + all_segment_ids, + all_input_mask, + all_loss_mask, + all_subtokens_mask, + punct_all_labels, + capit_all_labels, + punct_label_ids, + capit_label_ids, + ) class BertPunctuationCapitalizationDataset(Dataset): @@ -226,19 +216,21 @@ class BertPunctuationCapitalizationDataset(Dataset): the loss_mask """ - def __init__(self, - text_file, - label_file, - max_seq_length, - tokenizer, - num_samples=-1, - shuffle=False, - pad_label='O', - punct_label_ids=None, - capit_label_ids=None, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False): + def __init__( + self, + text_file, + label_file, + max_seq_length, + tokenizer, + num_samples=-1, + shuffle=False, + pad_label='O', + punct_label_ids=None, + capit_label_ids=None, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + ): if use_cache: # Cache features @@ -280,13 +272,10 @@ def __init__(self, capit_unique_labels.update(capit_line) if len(punct_labels_lines) != len(text_lines): - raise ValueError( - "Labels file should contain labels for every word") + raise ValueError("Labels file should contain labels for every word") if shuffle or num_samples > 0: - dataset = list(zip(text_lines, - punct_labels_lines, - capit_labels_lines)) + dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) random.shuffle(dataset) if num_samples > 0: @@ -301,21 +290,22 @@ def __init__(self, if punct_label_ids: if len(punct_label_ids) != len(punct_unique_labels): nemo.logging.info( - 'Not all labels from the specified' + - 'label_ids dictionary are present in the' + - 'current dataset. Using the provided' + - 'label_ids dictionary.') + 'Not all labels from the specified' + + 'label_ids dictionary are present in the' + + 'current dataset. Using the provided' + + 'label_ids dictionary.' + ) else: - nemo.logging.info( - 'Using the provided label_ids dictionary.') + nemo.logging.info('Using the provided label_ids dictionary.') else: nemo.logging.info( - 'Creating a new label to label_id dictionary.' + - ' It\'s recommended to use label_ids generated' + - ' during training for dev/test sets to avoid' + - ' errors if some labels are not' + - ' present in the dev/test sets.' + - ' For training set label_ids should be None.') + 'Creating a new label to label_id dictionary.' + + ' It\'s recommended to use label_ids generated' + + ' during training for dev/test sets to avoid' + + ' errors if some labels are not' + + ' present in the dev/test sets.' + + ' For training set label_ids should be None.' 
+ ) def create_label_ids(unique_labels, pad_label=pad_label): label_ids = {pad_label: 0} @@ -328,16 +318,18 @@ def create_label_ids(unique_labels, pad_label=pad_label): punct_label_ids = create_label_ids(punct_unique_labels) capit_label_ids = create_label_ids(capit_unique_labels) - features = get_features(text_lines, - max_seq_length, - tokenizer, - pad_label=pad_label, - punct_labels_lines=punct_labels_lines, - capit_labels_lines=capit_labels_lines, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end) + features = get_features( + text_lines, + max_seq_length, + tokenizer, + pad_label=pad_label, + punct_labels_lines=punct_labels_lines, + capit_labels_lines=capit_labels_lines, + punct_label_ids=punct_label_ids, + capit_label_ids=capit_label_ids, + ignore_extra_tokens=ignore_extra_tokens, + ignore_start_end=ignore_start_end, + ) if use_cache: pickle.dump(features, open(features_pkl, "wb")) @@ -355,41 +347,35 @@ def create_label_ids(unique_labels, pad_label=pad_label): # save label_ids def get_stats_and_save(all_labels, label_ids, name): - infold = text_file[:text_file.rfind('/')] + infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(all_labels) nemo.logging.info('Three most popular labels') - _, label_frequencies = \ - utils.get_label_stats(merged_labels, - infold + '/label_count_' + name + '.tsv') + _, label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') out = open(os.path.join(infold, name + '_label_ids.csv'), 'w') - labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) + labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) nemo.logging.info(f'Labels: {label_ids}') nemo.logging.info(f'Labels mapping saved to : {out.name}') return label_frequencies - self.punct_label_frequencies = \ - get_stats_and_save(self.punct_all_labels, - self.punct_label_ids, - 'punct') - self.capit_label_frequencies = \ - get_stats_and_save(self.capit_all_labels, - self.capit_label_ids, - 'capit') + self.punct_label_frequencies = get_stats_and_save(self.punct_all_labels, self.punct_label_ids, 'punct') + self.capit_label_frequencies = get_stats_and_save(self.capit_all_labels, self.capit_label_ids, 'capit') def __len__(self): return len(self.all_input_ids) def __getitem__(self, idx): - return (np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - np.array(self.punct_all_labels[idx]), - np.array(self.capit_all_labels[idx])) + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + np.array(self.punct_all_labels[idx]), + np.array(self.capit_all_labels[idx]), + ) class BertPunctuationCapitalizationInferDataset(Dataset): @@ -409,14 +395,9 @@ class BertPunctuationCapitalizationInferDataset(Dataset): tokenizer (Tokenizer): such as NemoBertTokenizer """ - def __init__(self, - queries, - max_seq_length, - tokenizer): + def __init__(self, queries, max_seq_length, tokenizer): - features = get_features(queries, - max_seq_length, - tokenizer) + features = get_features(queries, max_seq_length, tokenizer) self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -428,8 +409,10 @@ 
def __len__(self): return len(self.all_input_ids) def __getitem__(self, idx): - return (np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.float32), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx])) + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.float32), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + ) diff --git a/nemo/collections/nlp/data/datasets/sentence_classification.py b/nemo/collections/nlp/data/datasets/sentence_classification.py index 87e71be853e7..215fa140ca50 100644 --- a/nemo/collections/nlp/data/datasets/sentence_classification.py +++ b/nemo/collections/nlp/data/datasets/sentence_classification.py @@ -22,10 +22,11 @@ import random -import nemo import numpy as np from torch.utils.data import Dataset +import nemo + from . import utils @@ -44,12 +45,9 @@ class BertSentenceClassificationDataset(Dataset): shuffle (bool): whether to shuffle your data. """ - def __init__(self, - input_file, - max_seq_length, - tokenizer, - num_samples=-1, - shuffle=True): + def __init__( + self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True, + ): with open(input_file, "r") as f: sent_labels, all_sent_subtokens = [], [] sent_lengths = [] @@ -87,17 +85,16 @@ def __init__(self, for i in range(len(all_sent_subtokens)): if len(all_sent_subtokens[i]) > self.max_seq_length: - shorten_sent = all_sent_subtokens[i][-self.max_seq_length+1:] + shorten_sent = all_sent_subtokens[i][-self.max_seq_length + 1 :] all_sent_subtokens[i] = ['[CLS]'] + shorten_sent too_long_count += 1 - nemo.logging.info(f'{too_long_count} out of {len(sent_lengths)} \ - sentencess with more than {max_seq_length} subtokens.') + nemo.logging.info( + f'{too_long_count} out of {len(sent_lengths)} \ + sentencess with more than {max_seq_length} subtokens.' + ) - self.convert_sequences_to_features(all_sent_subtokens, - sent_labels, - tokenizer, - self.max_seq_length) + self.convert_sequences_to_features(all_sent_subtokens, sent_labels, tokenizer, self.max_seq_length) self.tokenizer = tokenizer self.vocab_size = self.tokenizer.vocab_size @@ -109,16 +106,14 @@ def __getitem__(self, idx): feature = self.features[idx] - return (np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - feature.sent_label) + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + feature.sent_label, + ) - def convert_sequences_to_features(self, - all_sent_subtokens, - sent_labels, - tokenizer, - max_seq_length): + def convert_sequences_to_features(self, all_sent_subtokens, sent_labels, tokenizer, max_seq_length): """Loads a data file into a list of `InputBatch`s. """ @@ -128,8 +123,7 @@ def convert_sequences_to_features(self, sent_label = sent_labels[sent_id] word_count = 0 # input_ids = tokenizer.tokens_to_ids(sent_subtokens) - input_ids = [tokenizer._convert_token_to_id( - t) for t in sent_subtokens] + input_ids = [tokenizer._convert_token_to_id(t) for t in sent_subtokens] # The mask has 1 for real tokens and 0 for padding tokens. # Only real tokens are attended to. 
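Review note: the truncation-and-masking convention being reformatted here is shared by all of these dataset classes: keep `[CLS]`, keep the last `max_seq_length - 1` subtokens, right-pad, and mark real tokens with 1 in the input mask so only they are attended to. A minimal self-contained sketch of that convention (the helper name and the `'[PAD]'` token are illustrative, not NeMo's actual API):

def truncate_and_pad(subtokens, max_seq_length, pad_token='[PAD]'):
    # Keep the sentence tail and re-attach [CLS], matching the
    # `['[CLS]'] + subtokens[-max_seq_length + 1 :]` slices above.
    if len(subtokens) > max_seq_length:
        subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1:]
    # The mask has 1 for real tokens and 0 for padding tokens.
    input_mask = [1] * len(subtokens) + [0] * (max_seq_length - len(subtokens))
    subtokens = subtokens + [pad_token] * (max_seq_length - len(subtokens))
    return subtokens, input_mask

tokens, mask = truncate_and_pad(['[CLS]', 'hello', 'world'], 5)
assert tokens == ['[CLS]', 'hello', 'world', '[PAD]', '[PAD]']
assert mask == [1, 1, 1, 0, 0]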
@@ -150,26 +144,23 @@ def convert_sequences_to_features(self, nemo.logging.info("subtokens: %s" % " ".join(sent_subtokens)) nemo.logging.info("sent_label: %s" % sent_label) nemo.logging.info("input_ids: %s" % utils.list2str(input_ids)) - nemo.logging.info( - "input_mask: %s" % utils.list2str(input_mask)) + nemo.logging.info("input_mask: %s" % utils.list2str(input_mask)) - self.features.append(InputFeatures( - sent_id=sent_id, - sent_label=sent_label, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids)) + self.features.append( + InputFeatures( + sent_id=sent_id, + sent_label=sent_label, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + ) + ) class InputFeatures(object): """A single set of features of data.""" - def __init__(self, - sent_id, - sent_label, - input_ids, - input_mask, - segment_ids): + def __init__(self, sent_id, sent_label, input_ids, input_mask, segment_ids): self.sent_id = sent_id self.sent_label = sent_label self.input_ids = input_ids diff --git a/nemo/collections/nlp/data/datasets/squad.py b/nemo/collections/nlp/data/datasets/squad.py index f734535e9de9..615a205b55bf 100644 --- a/nemo/collections/nlp/data/datasets/squad.py +++ b/nemo/collections/nlp/data/datasets/squad.py @@ -22,23 +22,26 @@ import sys import numpy as np -from tqdm import tqdm import torch from torch.utils.data import Dataset +from tqdm import tqdm import nemo from nemo.collections.nlp.utils.nlp_utils import _is_whitespace -from .utils import DataProcessor + from ...utils.metrics.squad_metrics import ( _compute_softmax, _get_best_indexes, apply_no_ans_threshold, exact_match_score, + f1_score, + find_all_best_thresh, + get_final_text, make_eval_dict, - f1_score, get_final_text, - normalize_answer, merge_eval, - find_all_best_thresh) + normalize_answer, +) +from .utils import DataProcessor """ @@ -68,65 +71,57 @@ class SquadDataset(Dataset): mode (str): Use "train" or "dev" to define between training and evaluation. 
""" + def __init__( - self, - data_dir, - tokenizer, - doc_stride, - max_query_length, - max_seq_length, - version_2_with_negative, - mode): + self, data_dir, tokenizer, doc_stride, max_query_length, max_seq_length, version_2_with_negative, mode, + ): self.tokenizer = tokenizer if not version_2_with_negative: processor_name = 'SquadV1Processor' else: processor_name = 'SquadV2Processor' - self.processor = getattr(sys.modules[__name__], - processor_name)() + self.processor = getattr(sys.modules[__name__], processor_name)() if mode == "dev": - self.examples = self.processor.get_dev_examples( - data_dir=data_dir) + self.examples = self.processor.get_dev_examples(data_dir=data_dir) elif mode == "train": - self.examples = self.processor.get_train_examples( - data_dir=data_dir) + self.examples = self.processor.get_train_examples(data_dir=data_dir) else: raise Exception if mode == "train": - cached_train_features_file = data_dir + '/cache' + \ - '_{0}_{1}_{2}_{3}'.format( - mode, - str(max_seq_length), - str(doc_stride), - str(max_query_length)) + cached_train_features_file = ( + data_dir + + '/cache' + + '_{0}_{1}_{2}_{3}'.format(mode, str(max_seq_length), str(doc_stride), str(max_query_length),) + ) if os.path.exists(cached_train_features_file): with open(cached_train_features_file, "rb") as reader: self.features = pickle.load(reader) else: self.features = convert_examples_to_features( - examples=self.examples, - tokenizer=tokenizer, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - has_groundtruth=True) - master_device = not torch.distributed.is_initialized() \ - or torch.distributed.get_rank() == 0 + examples=self.examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + has_groundtruth=True, + ) + master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 if master_device: nemo.logging.info( - " Saving train features into cached file %s", - cached_train_features_file) + " Saving train features into cached file %s", cached_train_features_file, + ) with open(cached_train_features_file, "wb") as writer: pickle.dump(self.features, writer) elif mode == "dev": self.features = convert_examples_to_features( - examples=self.examples, - tokenizer=tokenizer, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - has_groundtruth=True) + examples=self.examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + has_groundtruth=True, + ) else: raise Exception @@ -135,23 +130,26 @@ def __len__(self): def __getitem__(self, idx): feature = self.features[idx] - return (np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask), - np.array(feature.start_position), - np.array(feature.end_position), - np.array(feature.unique_id)) + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask), + np.array(feature.start_position), + np.array(feature.end_position), + np.array(feature.unique_id), + ) def get_predictions( - self, - unique_ids, - start_logits, - end_logits, - n_best_size, - max_answer_length, - do_lower_case, - version_2_with_negative, - null_score_diff_threshold): + self, + unique_ids, + start_logits, + end_logits, + n_best_size, + max_answer_length, + do_lower_case, + version_2_with_negative, + null_score_diff_threshold, + ): example_index_to_features = 
collections.defaultdict(list) unique_id_to_pos = {} @@ -162,10 +160,8 @@ def get_predictions( example_index_to_features[feature.example_index].append(feature) _PrelimPrediction = collections.namedtuple( - "PrelimPrediction", [ - "feature_index", "start_index", "end_index", "start_logit", - "end_logit" - ]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit",], + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -187,14 +183,12 @@ def get_predictions( null_end_logit = 0 for (feature_index, feature) in enumerate(features): pos = unique_id_to_pos[feature.unique_id] - start_indexes = _get_best_indexes(start_logits[pos], - n_best_size) + start_indexes = _get_best_indexes(start_logits[pos], n_best_size) end_indexes = _get_best_indexes(end_logits[pos], n_best_size) # if we could have irrelevant answers, # get the min score of irrelevant if version_2_with_negative: - feature_null_score = start_logits[pos][0] + end_logits[ - pos][0] + feature_null_score = start_logits[pos][0] + end_logits[pos][0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index @@ -213,8 +207,7 @@ def get_predictions( continue if end_index not in feature.token_to_orig_map: continue - if not feature.token_is_max_context.get( - start_index, False): + if not feature.token_is_max_context.get(start_index, False): continue if end_index < start_index: continue @@ -227,22 +220,23 @@ def get_predictions( start_index=start_index, end_index=end_index, start_logit=start_logits[pos][start_index], - end_logit=end_logits[pos][end_index])) + end_logit=end_logits[pos][end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( - _PrelimPrediction(feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = collections.namedtuple( - "NbestPrediction", ["text", "start_logit", "end_logit"]) + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,) + + _NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"]) seen_predictions = {} nbest = [] @@ -251,13 +245,10 @@ def get_predictions( break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:( - pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[ - pred.start_index] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:( - orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. 
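Review note: the step referenced by the comment above re-joins WordPieces before get_final_text projects the predicted span back onto the original text. A minimal sketch of that join, assuming the standard BERT "##" continuation marker (a standalone helper for illustration, not the exact code in this file):

def detokenize_wordpieces(tok_tokens):
    # "##" marks a continuation piece: drop the marker and glue the
    # piece onto the previous token, then normalize whitespace.
    tok_text = " ".join(tok_tokens).replace(" ##", "").strip()
    return " ".join(tok_text.split())

assert detokenize_wordpieces(["un", "##afford", "##able"]) == "unaffordable"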
@@ -269,8 +260,7 @@ def get_predictions( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, - do_lower_case) + final_text = get_final_text(tok_text, orig_text, do_lower_case) if final_text in seen_predictions: continue @@ -280,34 +270,25 @@ def get_predictions( seen_predictions[final_text] = True nbest.append( - _NbestPrediction(text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) + _NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,) + ) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append( - _NbestPrediction(text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit,)) # In very rare edge cases we could only # have single null pred. We just create a nonce prediction # in this case to avoid failure. if len(nbest) == 1: nbest.insert( - 0, - _NbestPrediction(text="empty", - start_logit=0.0, - end_logit=0.0)) + 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0), + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="empty", - start_logit=0.0, - end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -337,8 +318,7 @@ def get_predictions( else: # predict "" iff the null score - # the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -349,59 +329,41 @@ def get_predictions( return all_predictions, all_nbest_json, scores_diff_json def evaluate_predictions( - self, - all_predictions, - no_answer_probs=None, - no_answer_probability_threshold=1.0): - qas_id_to_has_answer = {example.qas_id: - bool(example.answers) for - example in self.examples} - has_answer_qids = [qas_id for qas_id, has_answer in - qas_id_to_has_answer.items() if has_answer] - no_answer_qids = [qas_id for qas_id, has_answer in - qas_id_to_has_answer.items() if not has_answer] + self, all_predictions, no_answer_probs=None, no_answer_probability_threshold=1.0, + ): + qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in self.examples} + has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] + no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] if no_answer_probs is None: no_answer_probs = {k: 0.0 for k in all_predictions} exact, f1 = self.get_raw_scores(all_predictions) exact_threshold = apply_no_ans_threshold( - exact, - no_answer_probs, - qas_id_to_has_answer, - no_answer_probability_threshold) - f1_threshold = apply_no_ans_threshold(f1, - no_answer_probs, - qas_id_to_has_answer, - no_answer_probability_threshold) + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold, + ) + f1_threshold = apply_no_ans_threshold( + f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold, + ) evaluation = make_eval_dict(exact_threshold, f1_threshold) if 
has_answer_qids: - has_ans_eval = make_eval_dict(exact_threshold, - f1_threshold, - qid_list=has_answer_qids) + has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) merge_eval(evaluation, has_ans_eval, "HasAns") if no_answer_qids: - no_ans_eval = make_eval_dict(exact_threshold, - f1_threshold, - qid_list=no_answer_qids) + no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: - find_all_best_thresh(evaluation, - all_predictions, - exact, - f1, - no_answer_probs, - qas_id_to_has_answer) + find_all_best_thresh( + evaluation, all_predictions, exact, f1, no_answer_probs, qas_id_to_has_answer, + ) return evaluation["best_exact"], evaluation["best_f1"] - def get_raw_scores( - self, - preds): + def get_raw_scores(self, preds): """ Computes the exact and f1 scores from the examples and the model predictions @@ -411,8 +373,7 @@ def get_raw_scores( for example in self.examples: qas_id = example.qas_id - gold_answers = [answer["text"] for answer in example.answers - if normalize_answer(answer["text"])] + gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] if not gold_answers: # For unanswerable questions, @@ -424,15 +385,24 @@ def get_raw_scores( continue prediction = preds[qas_id] - exact_scores[qas_id] = \ - max(exact_match_score(a, prediction) for a in gold_answers) - f1_scores[qas_id] = \ - max(f1_score(a, prediction) for a in gold_answers) + exact_scores[qas_id] = max(exact_match_score(a, prediction) for a in gold_answers) + f1_scores[qas_id] = max(f1_score(a, prediction) for a in gold_answers) return exact_scores, f1_scores def evaluate( - self, + self, + unique_ids, + start_logits, + end_logits, + n_best_size, + max_answer_length, + do_lower_case, + version_2_with_negative, + null_score_diff_threshold, + ): + + (all_predictions, all_nbest_json, scores_diff_json,) = self.get_predictions( unique_ids, start_logits, end_logits, @@ -440,14 +410,8 @@ def evaluate( max_answer_length, do_lower_case, version_2_with_negative, - null_score_diff_threshold): - - all_predictions, all_nbest_json, scores_diff_json = \ - self.get_predictions(unique_ids, start_logits, - end_logits, n_best_size, - max_answer_length, do_lower_case, - version_2_with_negative, - null_score_diff_threshold) + null_score_diff_threshold, + ) exact_match, f1 = self.evaluate_predictions(all_predictions) @@ -455,11 +419,8 @@ def evaluate( def convert_examples_to_features( - examples, - tokenizer, - max_seq_length, - doc_stride, max_query_length, - has_groundtruth): + examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, +): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -494,20 +455,18 @@ def convert_examples_to_features( if has_groundtruth and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[ - example.end_position + 1] - 1 + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, - tokenizer, example.answer_text) + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text, + ) # The -3 accounts for [CLS], [SEP] and [SEP] # doc_spans contains all possible contexts 
options of given length max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - _DocSpan = collections.namedtuple( - "DocSpan", ["start", "length"]) + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): @@ -535,12 +494,9 @@ def convert_examples_to_features( for i in range(doc_span.length): split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = \ - tok_to_orig_index[split_token_index] + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context(doc_spans, - doc_span_index, - split_token_index) + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) @@ -572,16 +528,14 @@ def convert_examples_to_features( doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = 0 end_position = 0 else: doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start \ - + doc_offset + start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset if has_groundtruth and example.is_impossible: # if our document chunk does not contain @@ -596,30 +550,23 @@ def convert_examples_to_features( nemo.logging.info("example_index: %s" % (example_index)) nemo.logging.info("doc_span_index: %s" % (doc_span_index)) nemo.logging.info("tokens: %s" % " ".join(tokens)) - nemo.logging.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) - in token_to_orig_map.items()])) - nemo.logging.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) - in token_is_max_context.items() - ])) - nemo.logging.info("input_ids: %s" % " ".join( - [str(x) for x in input_ids])) nemo.logging.info( - "input_mask: %s" % " ".join( - [str(x) for x in input_mask])) + "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) + ) nemo.logging.info( - "segment_ids: %s" % " ".join( - [str(x) for x in segment_ids])) + "token_is_max_context: %s" + % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) + ) + nemo.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + nemo.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + nemo.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if has_groundtruth and example.is_impossible: nemo.logging.info("impossible example") if has_groundtruth and not example.is_impossible: - answer_text = " ".join( - tokens[start_position:(end_position + 1)]) + answer_text = " ".join(tokens[start_position : (end_position + 1)]) nemo.logging.info("start_position: %d" % (start_position)) nemo.logging.info("end_position: %d" % (end_position)) - nemo.logging.info( - "answer: %s" % (answer_text)) + nemo.logging.info("answer: %s" % (answer_text)) features.append( InputFeatures( @@ -634,7 +581,9 @@ def convert_examples_to_features( segment_ids=segment_ids, start_position=start_position, end_position=end_position, - is_impossible=example.is_impossible)) + is_impossible=example.is_impossible, + ) + ) unique_id += 1 return features @@ -644,19 +593,20 @@ class 
InputFeatures(object): """A single set of features of data.""" def __init__( - self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None, + ): self.unique_id = unique_id self.example_index = example_index self.doc_span_index = doc_span_index @@ -681,10 +631,7 @@ class SquadProcessor(DataProcessor): train_file = None dev_file = None - def get_train_examples( - self, - data_dir, - filename=None): + def get_train_examples(self, data_dir, filename=None): """ Returns the training examples from the data directory. Args: @@ -699,21 +646,18 @@ def get_train_examples( data_dir = "" if self.train_file is None: - raise ValueError("SquadProcessor should be instantiated via \ - SquadV1Processor or SquadV2Processor") + raise ValueError( + "SquadProcessor should be instantiated via \ + SquadV1Processor or SquadV2Processor" + ) with open( - os.path.join(data_dir, - self.train_file if filename is None else filename), - "r", encoding="utf-8" + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8", ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") - def get_dev_examples( - self, - data_dir, - filename=None): + def get_dev_examples(self, data_dir, filename=None): """ Returns the evaluation example from the data directory. Args: @@ -728,20 +672,17 @@ def get_dev_examples( data_dir = "" if self.dev_file is None: - raise ValueError("SquadProcessor should be instantiated via \ - SquadV1Processor or SquadV2Processor") + raise ValueError( + "SquadProcessor should be instantiated via \ + SquadV1Processor or SquadV2Processor" + ) with open( - os.path.join(data_dir, - self.dev_file if filename is None else filename), - "r", encoding="utf-8" + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8", ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") - def _create_examples( - self, - input_data, - set_type): + def _create_examples(self, input_data, set_type): examples = [] for entry in tqdm(input_data): title = entry["title"] @@ -857,34 +798,25 @@ def __init__( # start_position is index of word, end_position inclusive self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[ - min(start_position_character + len(answer_text) - 1, - len(char_to_word_offset) - 1) + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1,) ] -def _improve_answer_span( - doc_tokens, - input_start, - input_end, - tokenizer, - orig_answer_text): +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) return (input_start, input_end) -def 
_check_is_max_context( - doc_spans, - cur_span_index, - position): +def _check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" best_score = None best_span_index = None @@ -896,8 +828,7 @@ def _check_is_max_context( continue num_left_context = position - doc_span.start num_right_context = end - position - score = min(num_left_context, num_right_context) + \ - 0.01 * doc_span.length + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index diff --git a/nemo/collections/nlp/data/datasets/token_classification.py b/nemo/collections/nlp/data/datasets/token_classification.py index de3688b4b5cf..477f5058e28c 100644 --- a/nemo/collections/nlp/data/datasets/token_classification.py +++ b/nemo/collections/nlp/data/datasets/token_classification.py @@ -24,21 +24,24 @@ import pickle import random -import nemo import numpy as np from torch.utils.data import Dataset +import nemo + from . import utils -def get_features(queries, - max_seq_length, - tokenizer, - label_ids=None, - pad_label='O', - raw_labels=None, - ignore_extra_tokens=False, - ignore_start_end=False): +def get_features( + queries, + max_seq_length, + tokenizer, + label_ids=None, + pad_label='O', + raw_labels=None, + ignore_extra_tokens=False, + ignore_start_end=False, +): """ Args: queries (list of str): text sequences @@ -85,8 +88,7 @@ def get_features(queries, subtokens.extend(word_tokens) loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * - (len(word_tokens) - 1)) + loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) subtokens_mask.append(1) subtokens_mask.extend([0] * (len(word_tokens) - 1)) @@ -114,22 +116,19 @@ def get_features(queries, for i, subtokens in enumerate(all_subtokens): if len(subtokens) > max_seq_length: - subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1:] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1:] - all_loss_mask[i] = [int(not ignore_start_end)] + \ - all_loss_mask[i][-max_seq_length + 1:] - all_subtokens_mask[i] = [0] + \ - all_subtokens_mask[i][-max_seq_length + 1:] + subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :] + all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] + all_loss_mask[i] = [int(not ignore_start_end)] + all_loss_mask[i][-max_seq_length + 1 :] + all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] if with_label: - all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1:] + all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1 :] too_long_count += 1 - all_input_ids.append([tokenizer.tokens_to_ids(t) - for t in subtokens]) + all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) if len(subtokens) < max_seq_length: - extra = (max_seq_length - len(subtokens)) + extra = max_seq_length - len(subtokens) all_input_ids[i] = all_input_ids[i] + [0] * extra all_loss_mask[i] = all_loss_mask[i] + [0] * extra all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra @@ -145,24 +144,22 @@ def get_features(queries, for i in range(min(len(all_input_ids), 5)): nemo.logging.debug("*** Example ***") nemo.logging.debug("i: %s", i) + nemo.logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) + nemo.logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) + nemo.logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) nemo.logging.debug( - 
"subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) - nemo.logging.debug( - "loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) - nemo.logging.debug( - "input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) - nemo.logging.debug( - "subtokens_mask: %s", " ".join(list(map( - str, all_subtokens_mask[i])))) + "subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i]))), + ) if with_label: - nemo.logging.debug( - "labels: %s", " ".join(list(map(str, all_labels[i])))) - return (all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_labels) + nemo.logging.debug("labels: %s", " ".join(list(map(str, all_labels[i])))) + return ( + all_input_ids, + all_segment_ids, + all_input_mask, + all_loss_mask, + all_subtokens_mask, + all_labels, + ) class BertTokenClassificationDataset(Dataset): @@ -199,18 +196,20 @@ class BertTokenClassificationDataset(Dataset): the loss_mask """ - def __init__(self, - text_file, - label_file, - max_seq_length, - tokenizer, - num_samples=-1, - shuffle=False, - pad_label='O', - label_ids=None, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False): + def __init__( + self, + text_file, + label_file, + max_seq_length, + tokenizer, + num_samples=-1, + shuffle=False, + pad_label='O', + label_ids=None, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + ): if use_cache: # Cache features @@ -220,19 +219,16 @@ def __init__(self, if not filename.endswith('.txt'): raise ValueError("{text_file} should have extension .txt") - features_pkl = os.path.join(data_dir, - filename[:-4] + "_features.pkl") + features_pkl = os.path.join(data_dir, filename[:-4] + "_features.pkl") label_ids_pkl = os.path.join(data_dir, "label_ids.pkl") - if use_cache and \ - os.path.exists(features_pkl) and os.path.exists(label_ids_pkl): + if use_cache and os.path.exists(features_pkl) and os.path.exists(label_ids_pkl): # If text_file was already processed, load from pickle features = pickle.load(open(features_pkl, 'rb')) nemo.logging.info(f'features restored from {features_pkl}') label_ids = pickle.load(open(label_ids_pkl, 'rb')) - nemo.logging.info( - f'Labels to ids dict restored from {label_ids_pkl}') + nemo.logging.info(f'Labels to ids dict restored from {label_ids_pkl}') else: if num_samples == 0: raise ValueError("num_samples has to be positive", num_samples) @@ -250,8 +246,7 @@ def __init__(self, unique_labels.update(line) if len(labels_lines) != len(text_lines): - raise ValueError( - "Labels file should contain labels for every word") + raise ValueError("Labels file should contain labels for every word") if shuffle or num_samples > 0: dataset = list(zip(text_lines, labels_lines)) @@ -268,21 +263,22 @@ def __init__(self, if label_ids: if len(label_ids) != len(unique_labels): nemo.logging.warning( - f'Not all labels from the specified' + - ' label_ids dictionary are present in the' + - ' current dataset. Using the provided' + - ' label_ids dictionary.') + f'Not all labels from the specified' + + ' label_ids dictionary are present in the' + + ' current dataset. Using the provided' + + ' label_ids dictionary.' + ) else: - nemo.logging.info( - f'Using the provided label_ids dictionary.') + nemo.logging.info(f'Using the provided label_ids dictionary.') else: nemo.logging.info( - f'Creating a new label to label_id dictionary.' 
+ - ' It\'s recommended to use label_ids generated' + - ' during training for dev/test sets to avoid' + - ' errors if some labels are not' + - ' present in the dev/test sets.' + - ' For training set label_ids should be None.') + f'Creating a new label to label_id dictionary.' + + ' It\'s recommended to use label_ids generated' + + ' during training for dev/test sets to avoid' + + ' errors if some labels are not' + + ' present in the dev/test sets.' + + ' For training set label_ids should be None.' + ) label_ids = {pad_label: 0} if pad_label in unique_labels: @@ -290,22 +286,23 @@ def __init__(self, for label in sorted(unique_labels): label_ids[label] = len(label_ids) - features = get_features(text_lines, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_labels=labels_lines, - label_ids=label_ids, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end) + features = get_features( + text_lines, + max_seq_length, + tokenizer, + pad_label=pad_label, + raw_labels=labels_lines, + label_ids=label_ids, + ignore_extra_tokens=ignore_extra_tokens, + ignore_start_end=ignore_start_end, + ) if use_cache: pickle.dump(features, open(features_pkl, "wb")) nemo.logging.info(f'features saved to {features_pkl}') pickle.dump(label_ids, open(label_ids_pkl, "wb")) - nemo.logging.info( - f'labels to ids dict saved to {label_ids_pkl}') + nemo.logging.info(f'labels to ids dict saved to {label_ids_pkl}') self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -315,15 +312,14 @@ def __init__(self, self.all_labels = features[5] self.label_ids = label_ids - infold = text_file[:text_file.rfind('/')] + infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(self.all_labels) nemo.logging.info('Three most popular labels') - _, self.label_frequencies = \ - utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') + _, self.label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') # save label_ids out = open(infold + '/label_ids.csv', 'w') - labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1])) + labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) nemo.logging.info(f'Labels: {self.label_ids}') nemo.logging.info(f'Labels mapping saved to : {out.name}') @@ -332,12 +328,14 @@ def __len__(self): return len(self.all_input_ids) def __getitem__(self, idx): - return (np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - np.array(self.all_labels[idx])) + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + np.array(self.all_labels[idx]), + ) class BertTokenClassificationInferDataset(Dataset): @@ -357,14 +355,9 @@ class BertTokenClassificationInferDataset(Dataset): tokenizer (Tokenizer): such as NemoBertTokenizer """ - def __init__(self, - queries, - max_seq_length, - tokenizer): + def __init__(self, queries, max_seq_length, tokenizer): - features = get_features(queries, - max_seq_length, - tokenizer) + features = get_features(queries, max_seq_length, tokenizer) self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -376,8 +369,10 @@ def __len__(self): return len(self.all_input_ids) def __getitem__(self, idx): - return 
(np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx])) + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + ) diff --git a/nemo/collections/nlp/data/datasets/translation.py b/nemo/collections/nlp/data/datasets/translation.py index f284d3234bcc..e9c1134e70e0 100644 --- a/nemo/collections/nlp/data/datasets/translation.py +++ b/nemo/collections/nlp/data/datasets/translation.py @@ -18,17 +18,13 @@ import numpy as np from torch.utils.data import Dataset -from ..utils import dataset_to_ids, clean_src_and_target +from ..utils import clean_src_and_target, dataset_to_ids class TranslationDataset(Dataset): - def __init__(self, - tokenizer_src, - tokenizer_tgt, - dataset_src, - dataset_tgt, - tokens_in_batch=1024, - clean=False): + def __init__( + self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, tokens_in_batch=1024, clean=False, + ): self.src_tokenizer = tokenizer_src self.tgt_tokenizer = tokenizer_tgt @@ -64,15 +60,11 @@ def pad_batches(self, src_ids, tgt_ids, batch_indices): for batch_idx, b in enumerate(batch_indices): src_len = max([len(src_ids[i]) for i in b]) tgt_len = max([len(tgt_ids[i]) for i in b]) - src_ids_ = self.src_tokenizer.pad_id() * np.ones( - (len(b), src_len), dtype=np.int) - tgt_ids_ = self.tgt_tokenizer.pad_id() * np.ones( - (len(b), tgt_len), dtype=np.int) + src_ids_ = self.src_tokenizer.pad_id() * np.ones((len(b), src_len), dtype=np.int) + tgt_ids_ = self.tgt_tokenizer.pad_id() * np.ones((len(b), tgt_len), dtype=np.int) for i, sentence_idx in enumerate(b): - src_ids_[i][:len(src_ids[sentence_idx] - )] = src_ids[sentence_idx] - tgt_ids_[i][:len(tgt_ids[sentence_idx] - )] = tgt_ids[sentence_idx] + src_ids_[i][: len(src_ids[sentence_idx])] = src_ids[sentence_idx] + tgt_ids_[i][: len(tgt_ids[sentence_idx])] = tgt_ids[sentence_idx] batches[batch_idx] = {"src": src_ids_, "tgt": tgt_ids_} return batches @@ -141,16 +133,13 @@ def pack_data_into_batches(self, src_ids, tgt_ids): batches_to_evict = num_examples_to_split batches.append(batches[num_batches][batches_to_evict:]) - batches[num_batches] = \ - batches[num_batches][:batches_to_evict] + batches[num_batches] = batches[num_batches][:batches_to_evict] batch_size = num_examples_to_split - batches_to_evict num_batches += 1 if batch_size > 0: - src_len = max( - [len(src_ids[j]) for j in batches[num_batches]]) - tgt_len = max( - [len(tgt_ids[j]) for j in batches[num_batches]]) + src_len = max([len(src_ids[j]) for j in batches[num_batches]]) + tgt_len = max([len(tgt_ids[j]) for j in batches[num_batches]]) else: src_len = 0 tgt_len = 0 diff --git a/nemo/collections/nlp/data/datasets/utils.py b/nemo/collections/nlp/data/datasets/utils.py index c53365867770..2236bf46a7e1 100644 --- a/nemo/collections/nlp/data/datasets/utils.py +++ b/nemo/collections/nlp/data/datasets/utils.py @@ -1,4 +1,3 @@ -from collections import Counter import csv import glob import itertools @@ -8,29 +7,28 @@ import re import shutil import subprocess +from collections import Counter -import nemo import numpy as np from sentencepiece import SentencePieceTrainer as SPT from tqdm import tqdm -from ...utils.nlp_utils import (get_vocab, - write_vocab, - write_vocab_in_order, - label2idx) +import nemo +from ...utils.nlp_utils import get_vocab, 
label2idx, write_vocab, write_vocab_in_order DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' -MODE_EXISTS_TMP = \ - '{} mode of {} dataset has already been processed and stored at {}' +MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' def get_stats(lengths): lengths = np.asarray(lengths) - nemo.logging.info(f'Min: {np.min(lengths)} | \ + nemo.logging.info( + f'Min: {np.min(lengths)} | \ Max: {np.max(lengths)} | \ Mean: {np.mean(lengths)} | \ - Median: {np.median(lengths)}') + Median: {np.median(lengths)}' + ) nemo.logging.info(f'75 percentile: {np.percentile(lengths, 75)}') nemo.logging.info(f'99 percentile: {np.percentile(lengths, 99)}') @@ -69,18 +67,15 @@ def if_exist(outfold, files): def process_sst_2(data_dir): if not os.path.exists(data_dir): link = 'https://gluebenchmark.com/tasks' - raise ValueError(f'Data not found at {data_dir}. ' - f'Please download SST-2 from {link}.') - nemo.logging.info( - 'Keep in mind that SST-2 is only available in lower case.') + raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') + nemo.logging.info('Keep in mind that SST-2 is only available in lower case.') return data_dir def process_imdb(data_dir, uncased, modes=['train', 'test']): if not os.path.exists(data_dir): link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' - raise ValueError(f'Data not found at {data_dir}. ' - f'Please download IMDB from {link}.') + raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') outfold = f'{data_dir}/nemo-processed' @@ -123,8 +118,7 @@ def process_thucnews(data_dir): train_size = 0.8 if not os.path.exists(data_dir): link = 'thuctc.thunlp.org/' - raise ValueError(f'Data not found at {data_dir}. ' - f'Please download THUCNews from {link}.') + raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') outfold = f'{data_dir}/nemo-processed-thucnews' @@ -138,11 +132,24 @@ def process_thucnews(data_dir): outfiles = {} for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', - encoding='utf-8') + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') outfiles[mode].write('sentence\tlabel\n') - categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', - '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] + categories = [ + '体育', + '娱乐', + '家居', + '彩票', + '房产', + '教育', + '时尚', + '时政', + '星座', + '游戏', + '社会', + '科技', + '股票', + '财经', + ] for category in categories: label = categories.index(category) category_files = glob.glob(f'{data_dir}/{category}/*.txt') @@ -150,8 +157,7 @@ def process_thucnews(data_dir): test_files = category_files[:test_num] train_files = category_files[test_num:] for mode in modes: - nemo.logging.info( - f'Processing {mode} data of the category {category}') + nemo.logging.info(f'Processing {mode} data of the category {category}') if mode == 'test': files = test_files else: @@ -167,10 +173,7 @@ def process_thucnews(data_dir): return outfold -def process_nlu(filename, - uncased, - modes=['train', 'test'], - dataset_name='nlu-ubuntu'): +def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): """ Dataset has to be of: - ubuntu - chat @@ -179,38 +182,40 @@ def process_nlu(filename, if not os.path.exists(filename): link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' - raise ValueError(f'Data not found at {filename}. 
' - 'Please download IMDB from {link}.') + raise ValueError(f'Data not found at {filename}. ' 'Please download IMDB from {link}.') if dataset_name == 'nlu-ubuntu': - INTENT = {'makeupdate': 1, - 'setupprinter': 2, - 'shutdowncomputer': 3, - 'softwarerecommendation': 4, - 'none': 0} + INTENT = { + 'makeupdate': 1, + 'setupprinter': 2, + 'shutdowncomputer': 3, + 'softwarerecommendation': 4, + 'none': 0, + } elif dataset_name == 'nlu-chat': INTENT = {'departuretime': 0, 'findconnection': 1} elif dataset_name == 'nlu-web': - INTENT = {'changepassword': 1, - 'deleteaccount': 2, - 'downloadvideo': 3, - 'exportdata': 4, - 'filterspam': 5, - 'findalternative': 6, - 'syncaccounts': 7, - 'none': 0} + INTENT = { + 'changepassword': 1, + 'deleteaccount': 2, + 'downloadvideo': 3, + 'exportdata': 4, + 'filterspam': 5, + 'findalternative': 6, + 'syncaccounts': 7, + 'none': 0, + } else: raise ValueError(f'{dataset_name}: Invalid dataset name') - infold = filename[:filename.rfind('/')] + infold = filename[: filename.rfind('/')] outfold = f'{infold}/{dataset_name}-nemo-processed' if uncased: outfold = f'{outfold}_uncased' if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info( - DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) + nemo.logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) return outfold nemo.logging.info(f'Processing data and store at {outfold}') @@ -297,19 +302,17 @@ def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): slot = ' '.join(slots[i].strip().split()[1:-1]) outfiles[mode + '_slots'].write(slot + '\n') - shutil.copyfile(f'{infold}/atis.dict.intent.csv', - f'{outfold}/dict.intents.csv') - shutil.copyfile(f'{infold}/atis.dict.slots.csv', - f'{outfold}/dict.slots.csv') + shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') + shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') for mode in modes: outfiles[mode].close() return outfold -def process_jarvis_datasets(infold, uncased, dataset_name, - modes=['train', 'test', 'eval'], - ignore_prev_intent=False): +def process_jarvis_datasets( + infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False, +): """ process and convert Jarvis datasets into NeMo's BIO format """ outfold = f'{infold}/{dataset_name}-nemo-processed' @@ -322,8 +325,7 @@ def process_jarvis_datasets(infold, uncased, dataset_name, nemo.logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) return outfold - nemo.logging.info( - f'Processing {dataset_name} dataset and store at {outfold}') + nemo.logging.info(f'Processing {dataset_name} dataset and store at {outfold}') os.makedirs(outfold, exist_ok=True) @@ -341,13 +343,11 @@ def process_jarvis_datasets(infold, uncased, dataset_name, for mode in modes: if if_exist(outfold, [f'{mode}.tsv']): - nemo.logging.info( - MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + nemo.logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) continue if not if_exist(infold, [f'{mode}.tsv']): - nemo.logging.info(f'{mode} mode of {dataset_name}' - f' is skipped as it was not found.') + nemo.logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') continue outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') @@ -373,8 +373,7 @@ def process_jarvis_datasets(infold, uncased, dataset_name, else: start_token = 1 sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) - outfiles[mode].write(f'{sentence_cld}\t' - 
f'{str(intents_list[intent_str])}\n') + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') slot_tags_list = [] if slot_tags_str.strip(): @@ -383,9 +382,7 @@ def process_jarvis_datasets(infold, uncased, dataset_name, if not st.strip(): continue [start_i, end_i, slot_name] = st.strip().split(":") - slot_tags_list.append([int(start_i), - int(end_i), - slot_name]) + slot_tags_list.append([int(start_i), int(end_i), slot_name]) if slot_name not in slots_list: slots_list[slot_name] = len(slots_list) slots_list_all[f'B-{slot_name}'] = len(slots_list_all) @@ -398,14 +395,11 @@ def process_jarvis_datasets(infold, uncased, dataset_name, processed_index = 0 for tag_start, tag_end, tag_str in slot_tags_list: if tag_start > processed_index: - words_list = \ - sentence[processed_index:tag_start].strip().split() - slots.extend([str(slots_list_all['O'])]*len(words_list)) + words_list = sentence[processed_index:tag_start].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) words_list = sentence[tag_start:tag_end].strip().split() slots.append(str(slots_list_all[f'B-{tag_str}'])) - slots.extend( - [str(slots_list_all[f'I-{tag_str}'])] * - (len(words_list) - 1)) + slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) processed_index = tag_end if processed_index < len(sentence): @@ -539,8 +533,7 @@ def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): if not os.path.exists(data_dir): link = 'www.github.com/snipsco/spoken-language' '-understanding-research-datasets' - raise ValueError(f'Data not found at {data_dir}. ' - 'Resquest to download the SNIPS dataset from {link}.') + raise ValueError(f'Data not found at {data_dir}. Request to download the SNIPS dataset from {link}.') outfold = f'{data_dir}/nemo-processed' @@ -550,8 +543,7 @@ def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): exist = True for dataset in ['light', 'speak', 'all']: if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format( - 'SNIPS-' + dataset.upper(), outfold)) + nemo.logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) else: exist = False if exist: @@ -568,18 +560,23 @@ def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') - light_train, light_dev, light_slots, light_intents = get_dataset( - light_files, dev_split) - speak_train, speak_dev, speak_slots, speak_intents = get_dataset( - speak_files) - - create_dataset(light_train, light_dev, light_slots, - light_intents, uncased, f'{outfold}/light') - create_dataset(speak_train, speak_dev, speak_slots, - speak_intents, uncased, f'{outfold}/speak') - create_dataset(light_train + speak_train, light_dev + speak_dev, - light_slots | speak_slots, light_intents | speak_intents, - uncased, f'{outfold}/all') + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset( + light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light', + ) + create_dataset( + speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak', + ) + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + uncased, + f'{outfold}/all', + ) return outfold
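Several of the download-instruction messages in this file join an f-string with a plain string literal (see the removed lines in the SNIPS hunk above): Python concatenates adjacent literals, but placeholders inside the non-f fragment are never interpolated, so such messages print a literal {link}. A minimal sketch of the pitfall, using a hypothetical link value rather than anything from this repository:

    # Only the first fragment carries the f prefix, so {link} survives verbatim.
    link = 'https://example.com'
    msg = f'Data not found. ' 'Request the dataset from {link}.'
    assert msg == 'Data not found. Request the dataset from {link}.'
    # Prefix every fragment with f, or merge them into a single f-string.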
@@ -687,9 +684,7 @@ def get_intents_slots_dialogflow(files, slot_labels): query_text = ''.join([query_text, segment['text']]) if 'alias' in segment: for _ in segment['text'].split(): - slots = ' '.join([ - slots, - slot_labels.get(segment['alias'])]) + slots = ' '.join([slots, slot_labels.get(segment['alias'])]) else: for _ in segment['text'].split(): slots = ' '.join([slots, slot_labels.get('O')]) @@ -743,16 +738,12 @@ def write_files(data, outfile): f.write(item) -def process_dialogflow( - data_dir, - uncased, - modes=['train', 'test'], - dev_split=0.1): +def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): if not os.path.exists(data_dir): link = 'www.dialogflow.com' - raise ValueError(f'Data not found at {data_dir}. ' - 'Export your dialogflow data from' - '{link} and unzip at {data_dir}.') + raise ValueError( + f'Data not found at {data_dir}. Export your dialogflow data from {link} and unzip at {data_dir}.' + ) outfold = f'{data_dir}/dialogflow/nemo-processed' @@ -765,12 +756,9 @@ def process_dialogflow( slot_labels = get_slots_dialogflow(files) - intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow( - files, - slot_labels) + intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) - train_queries, train_slots, test_queries, test_slots = \ partition_data(intent_queries, slot_tags, split=dev_split) + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) write_files(train_queries, f'{outfold}/train.tsv') write_files(train_slots, f'{outfold}/train_slots.tsv') @@ -842,8 +830,7 @@ def get_slot_labels(slot_annotations, task_name): return all_labels -def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, - task_name): +def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): slot_tags = [] inorder_utterances = [] @@ -854,8 +841,7 @@ def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, for annotation in slot_annotations[0:]: an = json.loads(annotation) utterance = an['source'] - if len(utterance) > 2 and utterance.startswith('"') \ - and utterance.endswith('"'): + if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): utterance = utterance[1:-1] if utterance in agreed_all: @@ -870,10 +856,10 @@ def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, for i in sorted(entities.keys()): annotated_entities = an[task_name]['annotations']['entities'] tags = annotated_entities[entities.get(i)] - untagged_words = utterance[lastptr:tags['startOffset']] + untagged_words = utterance[lastptr : tags['startOffset']] for _ in untagged_words.split(): slotlist.append(all_labels.get('O')) - anno_words = utterance[tags['startOffset']:tags['endOffset']] + anno_words = utterance[tags['startOffset'] : tags['endOffset']] # tagging with the IOB format.
for j, _ in enumerate(anno_words.split()): if j == 0: @@ -884,7 +870,7 @@ def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, slotlist.append(all_labels.get(i_slot)) lastptr = tags['endOffset'] - untagged_words = utterance[lastptr:len(utterance)] + untagged_words = utterance[lastptr : len(utterance)] for _ in untagged_words.split(): slotlist.append(all_labels.get('O')) @@ -903,16 +889,12 @@ def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, return all_labels, inorder_utterances, slot_tags -def process_mturk( - data_dir, - uncased, - modes=['train', 'test'], - dev_split=0.1): +def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): if not os.path.exists(data_dir): link = 'www.mturk.com' - raise ValueError(f'Data not found at {data_dir}. ' - 'Export your mturk data from' - '{link} and unzip at {data_dir}.') + raise ValueError( + f'Data not found at {data_dir}. Export your mturk data from {link} and unzip at {data_dir}.' + ) outfold = f'{data_dir}/nemo-processed' @@ -920,8 +902,7 @@ def process_mturk( nemo.logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) return outfold - nemo.logging.info( - f'Processing dataset from mturk and storing at {outfold}') + nemo.logging.info(f'Processing dataset from mturk and storing at {outfold}') os.makedirs(outfold, exist_ok=True) classification_data_file = f'{data_dir}/classification.csv' annotation_data_file = f'{data_dir}/annotation.manifest' if not os.path.exists(classification_data_file): - raise FileNotFoundError(f'File not found ' - f'at {classification_data_file}') + raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') if not os.path.exists(annotation_data_file): raise FileNotFoundError(f'File not found at {annotation_data_file}') @@ -952,21 +932,17 @@ def process_mturk( # It is assumed that every utterances will have corresponding # slot annotation information if len(slot_annotations) < len(agreed_all): - raise ValueError(f'Every utterance must have corresponding' - f'slot annotation information') + raise ValueError('Every utterance must have corresponding slot annotation information') slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( - slot_annotations, - agreed_all, - intent_names, - task_name) + slot_annotations, agreed_all, intent_names, task_name ) assert len(slot_tags) == len(intent_queries) dev_split = 0.1 - train_queries, train_slots, test_queries, test_slots = \ - partition_data(intent_queries, slot_tags, split=dev_split) + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) write_files(train_queries, f'{outfold}/train.tsv') write_files(train_slots, f'{outfold}/train_slots.tsv') @@ -1002,8 +978,7 @@ def calc_class_weights(label_freq): """ most_common_label_freq = label_freq[0] - weighted_slots = sorted([(index, most_common_label_freq[1]/freq) - for (index, freq) in label_freq]) + weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) return [weight for (_, weight) in weighted_slots] @@ -1045,20 +1020,15 @@ class JointIntentSlotDataDesc: """ - def __init__(self, - data_dir, - do_lower_case=False, - dataset_name='default', - none_slot_label='O', - pad_label=-1): + def __init__( + self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1, + ): if dataset_name == 'atis': self.data_dir = process_atis(data_dir, do_lower_case) elif dataset_name == 'snips-atis': self.data_dir, self.pad_label = merge( -
data_dir, - ['ATIS/nemo-processed-uncased', - 'snips/nemo-processed-uncased/all'], - dataset_name) + data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all',], dataset_name, + ) elif dataset_name == 'dialogflow': self.data_dir = process_dialogflow(data_dir, do_lower_case) elif dataset_name == 'mturk-processed': @@ -1072,19 +1042,16 @@ def __init__(self, elif dataset_name.endswith('all'): self.data_dir = f'{self.data_dir}/all' elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets(data_dir, - do_lower_case, - dataset_name, - modes=["train", - "test", - "eval"], - ignore_prev_intent=False) + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False, + ) else: if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): raise FileNotFoundError( "Make sure that your data follows the standard format " "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv.") + "contain dict.intents.csv and dict.slots.csv." + ) self.data_dir = data_dir self.intent_dict_file = self.data_dir + '/dict.intents.csv' @@ -1096,8 +1063,7 @@ def __init__(self, for mode in ['train', 'test', 'eval']: if not if_exist(self.data_dir, [f'{mode}.tsv']): - nemo.logging.info(f' Stats calculation for {mode} mode' - f' is skipped as {mode}.tsv was not found.') + nemo.logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') continue slot_file = f'{self.data_dir}/{mode}_slots.tsv' @@ -1112,7 +1078,8 @@ def __init__(self, raise ValueError( "Make sure that the number of slot lines match the " "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines.") + "correspondence between every slot and intent lines." 
+ ) dataset = list(zip(slot_lines, input_lines)) @@ -1124,16 +1091,14 @@ def __init__(self, raw_intents.append(int(parts[-1])) queries.append(' '.join(parts[:-1])) - infold = input_file[:input_file.rfind('/')] + infold = input_file[: input_file.rfind('/')] nemo.logging.info(f'Three most popular intents during {mode}ing') - total_intents, intent_label_freq = get_label_stats( - raw_intents, infold + f'/{mode}_intent_stats.tsv') + total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') merged_slots = itertools.chain.from_iterable(raw_slots) nemo.logging.info(f'Three most popular slots during {mode}ing') - slots_total, slots_label_freq = get_label_stats( - merged_slots, infold + f'/{mode}_slot_stats.tsv') + slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') if mode == 'train': @@ -1141,8 +1106,7 @@ def __init__(self, nemo.logging.info(f'Slot weights are - {self.slot_weights}') self.intent_weights = calc_class_weights(intent_label_freq) - nemo.logging.info( - f'Intent weights are - {self.intent_weights}') + nemo.logging.info(f'Intent weights are - {self.intent_weights}') nemo.logging.info(f'Total intents - {total_intents}') nemo.logging.info(f'Intent label frequency - {intent_label_freq}') @@ -1153,8 +1117,7 @@ def __init__(self, self.pad_label = pad_label else: if none_slot_label not in slots: - raise ValueError(f'none_slot_label {none_slot_label} not ' - f'found in {self.slot_dict_file}.') + raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') self.pad_label = slots[none_slot_label] @@ -1182,33 +1145,28 @@ def __init__(self, dataset_name, data_dir, do_lower_case): elif dataset_name.endswith('web'): data_dir = f'{data_dir}/WebApplicationsCorpus.json' self.num_labels = 8 - self.data_dir = process_nlu(data_dir, - do_lower_case, - dataset_name=dataset_name) + self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name) self.eval_file = self.data_dir + '/test.tsv' elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets(data_dir, - do_lower_case, - dataset_name, - modes=['train', - 'test', - 'eval'], - ignore_prev_intent=False) + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False, + ) intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv') self.num_labels = len(intents) else: - raise ValueError("Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure " - "that you build the preprocessing method for it.") + raise ValueError( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure " + "that you build the preprocessing method for it." 
+ ) self.train_file = self.data_dir + '/train.tsv' for mode in ['train', 'test', 'eval']: if not if_exist(self.data_dir, [f'{mode}.tsv']): - nemo.logging.info(f' Stats calculation for {mode} mode' - f' is skipped as {mode}.tsv was not found.') + nemo.logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') continue input_file = f'{self.data_dir}/{mode}.tsv' @@ -1221,11 +1179,10 @@ def __init__(self, dataset_name, data_dir, do_lower_case): raw_sentences.append(int(parts[-1])) queries.append(' '.join(parts[:-1])) - infold = input_file[:input_file.rfind('/')] + infold = input_file[: input_file.rfind('/')] nemo.logging.info(f'Three most popular classes during {mode}ing') - total_sents, sent_label_freq = get_label_stats( - raw_sentences, infold + f'/{mode}_sentence_stats.tsv') + total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv') if mode == 'train': @@ -1233,8 +1190,7 @@ def __init__(self, dataset_name, data_dir, do_lower_case): nemo.logging.info(f'Class weights are - {self.class_weights}') nemo.logging.info(f'Total Sentences - {total_sents}') - nemo.logging.info( - f'Sentence class frequencies - {sent_label_freq}') + nemo.logging.info(f'Sentence class frequencies - {sent_label_freq}') def create_vocab_lm(data_dir, do_lower_case): @@ -1271,8 +1227,7 @@ def create_vocab_lm(data_dir, do_lower_case): def download_wkt2(data_dir): os.makedirs('data/lm', exist_ok=True) - nemo.logging.warning(f'Data not found at {data_dir}. ' - f'Downloading wikitext-2 to data/lm') + nemo.logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') data_dir = 'data/lm/wikitext-2' subprocess.call('scripts/get_wkt2.sh') return data_dir @@ -1289,20 +1244,17 @@ def __init__(self, dataset_name, data_dir, do_lower_case): nemo.logging.warning( "Looks like you passed a dataset name that isn't " "already supported by NeMo. Please make sure that " - "you build the preprocessing method for it.") + "you build the preprocessing method for it." 
+ ) -def create_vocab_mlm(data_dir, - vocab_size, - sample_size, - special_tokens=['[PAD]', '[UNK]', - '[CLS]', '[SEP]', '[MASK]'], - train_file=''): +def create_vocab_mlm( + data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='', +): vocab = special_tokens[:] bert_dir = f'{data_dir}/bert' if if_exist(bert_dir, ['tokenizer.model']): - nemo.logging.info( - DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) + nemo.logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) return data_dir, f'{bert_dir}/tokenizer.model' nemo.logging.info(f'Processing WikiText dataset and store at {bert_dir}') os.makedirs(bert_dir, exist_ok=True) @@ -1320,11 +1272,13 @@ def create_vocab_mlm(data_dir, else: train_file = f'{data_dir}/{train_file}' - cmd = (f"--input={train_file} --model_prefix={bert_dir}/tokenizer " - f"--vocab_size={vocab_size - len(vocab)} " - f"--input_sentence_size={sample_size} " - f"--shuffle_input_sentence=true --hard_vocab_limit=false " - f"--bos_id=-1 --eos_id=-1") + cmd = ( + f"--input={train_file} --model_prefix={bert_dir}/tokenizer " + f"--vocab_size={vocab_size - len(vocab)} " + f"--input_sentence_size={sample_size} " + f"--shuffle_input_sentence=true --hard_vocab_limit=false " + f"--bos_id=-1 --eos_id=-1" + ) SPT.Train(cmd) # Add BERT control symbols @@ -1349,27 +1303,21 @@ def create_vocab_mlm(data_dir, class BERTPretrainingDataDesc: - def __init__(self, - dataset_name, - data_dir, - vocab_size, - sample_size, - special_tokens, - train_file=''): + def __init__( + self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file='', + ): if dataset_name == 'wikitext-2': if not os.path.exists(data_dir): data_dir = download_wkt2(data_dir) self.data_dir, self.tokenizer_model = create_vocab_mlm( - data_dir, - vocab_size, - sample_size, - special_tokens, - train_file) + data_dir, vocab_size, sample_size, special_tokens, train_file + ) else: nemo.logging.warning( "Looks like you passed a dataset name that isn't " "already supported by NeMo. Please make sure that " - "you build the preprocessing method for it.") + "you build the preprocessing method for it." 
+ ) self.train_file = f'{data_dir}/train.txt' self.eval_file = f'{data_dir}/valid.txt' @@ -1438,13 +1386,11 @@ class MrpcProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" nemo.logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1460,10 +1406,7 @@ def _create_examples(self, lines, set_type): text_a = line[3] text_b = line[4] label = line[0] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1472,14 +1415,11 @@ class MnliProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched",) def get_labels(self): """See base class.""" @@ -1495,10 +1435,7 @@ def _create_examples(self, lines, set_type): text_a = line[8] text_b = line[9] label = line[-1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1507,9 +1444,7 @@ class MnliMismatchedProcessor(MnliProcessor): def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched",) class ColaProcessor(DataProcessor): @@ -1517,13 +1452,11 @@ class ColaProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1536,10 +1469,7 @@ def _create_examples(self, lines, set_type): guid = "%s-%s" % (set_type, i) text_a = line[3] label = line[1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=None, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -1548,13 +1478,11 @@ class Sst2Processor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return 
self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1569,10 +1497,7 @@ def _create_examples(self, lines, set_type): guid = "%s-%s" % (set_type, i) text_a = line[0] label = line[1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=None, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -1581,13 +1506,11 @@ class StsbProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1603,10 +1526,7 @@ def _create_examples(self, lines, set_type): text_a = line[7] text_b = line[8] label = line[-1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1615,13 +1535,11 @@ class QqpProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1640,10 +1558,7 @@ def _create_examples(self, lines, set_type): label = line[5] except IndexError: continue - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1652,14 +1567,11 @@ class QnliProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") def get_labels(self): """See base class.""" @@ -1675,10 +1587,7 @@ def _create_examples(self, lines, set_type): text_a = line[1] text_b = line[2] label = line[-1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1687,13 +1596,11 @@ class RteProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - 
return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1709,10 +1616,7 @@ def _create_examples(self, lines, set_type): text_a = line[1] text_b = line[2] label = line[-1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1721,13 +1625,11 @@ class WnliProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -1743,10 +1645,7 @@ def _create_examples(self, lines, set_type): text_a = line[1] text_b = line[2] label = line[-1] - examples.append(InputExample(guid=guid, - text_a=text_a, - text_b=text_b, - label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -1760,7 +1659,7 @@ def _create_examples(self, lines, set_type): "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, - "wnli": WnliProcessor + "wnli": WnliProcessor, } output_modes = { diff --git a/nemo/collections/nlp/data/tokenizers/__init__.py b/nemo/collections/nlp/data/tokenizers/__init__.py index 41c6646c77e6..ba9baba6c89c 100644 --- a/nemo/collections/nlp/data/tokenizers/__init__.py +++ b/nemo/collections/nlp/data/tokenizers/__init__.py @@ -1,6 +1,6 @@ -from .spc_tokenizer import SentencePieceTokenizer from .bert_tokenizer import NemoBertTokenizer -from .yttm_tokenizer import YouTokenToMeTokenizer +from .char_tokenizer import CharTokenizer from .gpt2_tokenizer import NemoGPT2Tokenizer +from .spc_tokenizer import SentencePieceTokenizer from .word_tokenizer import WordTokenizer -from .char_tokenizer import CharTokenizer +from .yttm_tokenizer import YouTokenToMeTokenizer diff --git a/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py b/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py index d9c3124b6fb5..cc6b20e875a8 100644 --- a/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py @@ -1,7 +1,9 @@ -from .tokenizer_spec import TokenizerSpec -from transformers import BertTokenizer import re +from transformers import BertTokenizer + +from .tokenizer_spec import TokenizerSpec + def handle_quotes(text): text_ = "" @@ -40,20 +42,21 @@ def remove_spaces(text): class NemoBertTokenizer(TokenizerSpec): - def __init__(self, pretrained_model=None, - vocab_file=None, - do_lower_case=True, - max_len=None, - do_basic_tokenize=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + def __init__( + self, + pretrained_model=None, + vocab_file=None, + do_lower_case=True, + max_len=None, + do_basic_tokenize=True, + never_split=("[UNK]", 
"[SEP]", "[PAD]", "[CLS]", "[MASK]"), + ): if pretrained_model: self.tokenizer = BertTokenizer.from_pretrained(pretrained_model) if "uncased" not in pretrained_model: self.tokenizer.basic_tokenizer.do_lower_case = False else: - self.tokenizer = BertTokenizer(vocab_file, - do_lower_case, - do_basic_tokenize) + self.tokenizer = BertTokenizer(vocab_file, do_lower_case, do_basic_tokenize) self.vocab_size = len(self.tokenizer.vocab) self.never_split = never_split diff --git a/nemo/collections/nlp/data/tokenizers/char_tokenizer.py b/nemo/collections/nlp/data/tokenizers/char_tokenizer.py index 64c6b8cacc6b..d634277bd3d5 100644 --- a/nemo/collections/nlp/data/tokenizers/char_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/char_tokenizer.py @@ -11,8 +11,7 @@ def __init__(self, vocab_path): self.vocab[special_token] = len(self.vocab) self.inv_vocab = {v: k for k, v in self.vocab.items()} self.vocab_size = len(self.vocab) - self.special_tokens = self.tokens_to_ids( - ["", "", "", ""]) + self.special_tokens = self.tokens_to_ids(["", "", "", ""]) def text_to_tokens(self, text): token_candidates = [char for char in text] diff --git a/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py b/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py index f37be62a349e..7c7417c9f0c7 100644 --- a/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py @@ -1,13 +1,19 @@ -from .tokenizer_spec import TokenizerSpec from transformers import GPT2Tokenizer +from .tokenizer_spec import TokenizerSpec + class NemoGPT2Tokenizer(TokenizerSpec): - def __init__(self, pretrained_model=None, - vocab_file=None, merges_file=None, errors='replace', - bos_token="<|endoftext|>", - eos_token="<|endoftext|>", - **kwargs): + def __init__( + self, + pretrained_model=None, + vocab_file=None, + merges_file=None, + errors='replace', + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + **kwargs + ): if pretrained_model: self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) self.vocab_size = self.tokenizer.vocab_size diff --git a/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py b/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py index 7dc21f1efe9a..67a2c00bda3e 100644 --- a/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py @@ -1,4 +1,5 @@ import sentencepiece as spm + from .tokenizer_spec import TokenizerSpec diff --git a/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py b/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py index 687df89b2650..eeadf617c189 100644 --- a/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py +++ b/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py @@ -1,4 +1,4 @@ -from abc import abstractmethod, ABC +from abc import ABC, abstractmethod from typing import List diff --git a/nemo/collections/nlp/data/tokenizers/word_tokenizer.py b/nemo/collections/nlp/data/tokenizers/word_tokenizer.py index 04026454abe7..f45940f03c58 100644 --- a/nemo/collections/nlp/data/tokenizers/word_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/word_tokenizer.py @@ -11,8 +11,7 @@ def __init__(self, vocab_path): self.vocab[special_token] = len(self.vocab) self.inv_vocab = {v: k for k, v in self.vocab.items()} self.vocab_size = len(self.vocab) - self.special_tokens = self.tokens_to_ids( - ["", "", "", ""]) + self.special_tokens = self.tokens_to_ids(["", "", "", ""]) def text_to_tokens(self, text): token_candidates = text.strip().split() diff --git 
diff --git a/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py b/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py index 612aada2f76d..94acc3e4b1ae 100644 --- a/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py @@ -1,4 +1,5 @@ import youtokentome as yttm + from .tokenizer_spec import TokenizerSpec @@ -6,8 +7,7 @@ class YouTokenToMeTokenizer(TokenizerSpec): def __init__(self, model_path): self.tokenizer = yttm.BPE(model=model_path) self.vocab_size = len(self.tokenizer.vocab()) - self.special_tokens = self.tokens_to_ids( - ["<PAD>", "<UNK>", "<BOS>", "<EOS>"]) + self.special_tokens = self.tokens_to_ids(["<PAD>", "<UNK>", "<BOS>", "<EOS>"]) def text_to_tokens(self, text): return self.tokenizer.encode(text, output_type=yttm.OutputType.SUBWORD) diff --git a/nemo/collections/nlp/data/utils.py b/nemo/collections/nlp/data/utils.py index e0bab809c939..1119f48a91aa 100644 --- a/nemo/collections/nlp/data/utils.py +++ b/nemo/collections/nlp/data/utils.py @@ -3,9 +3,10 @@ import re import string -import nemo import numpy as np +import nemo + def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): """ @@ -33,8 +34,7 @@ def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): for sentence in data: sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) if add_bos_eos: - sent_ids = [tokenizer.bos_id()] + sent_ids + \ - [tokenizer.eos_id()] + sent_ids = [tokenizer.bos_id()] + sent_ids + [tokenizer.eos_id()] ids.append(sent_ids) if cache_ids: nemo.logging.info("Caching tokenized dataset ...") @@ -42,12 +42,9 @@ def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): return ids -def clean_src_and_target(src_ids, - tgt_ids, - max_tokens=128, - min_tokens=3, - max_tokens_diff=25, - max_tokens_ratio=2.5): +def clean_src_and_target( + src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5, +): """ Cleans source and target sentences to get rid of noisy data.
Specifically, a pair of sentences is removed if @@ -63,10 +60,14 @@ def clean_src_and_target(src_ids, src_ids_, tgt_ids_ = [], [] for i in range(len(src_ids)): src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) - if src_len > max_tokens or tgt_len > max_tokens or \ - src_len < min_tokens or tgt_len < min_tokens or \ - (src_ids[i] == tgt_ids[i]) or \ - np.abs(src_len - tgt_len) > max_tokens_diff: + if ( + src_len > max_tokens + or tgt_len > max_tokens + or src_len < min_tokens + or tgt_len < min_tokens + or (src_ids[i] == tgt_ids[i]) + or np.abs(src_len - tgt_len) > max_tokens_diff + ): continue ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): @@ -116,8 +117,7 @@ def check_is_max_context(doc_spans, cur_span_index, position): continue num_left_context = position - doc_span.start num_right_context = end - position - score = min(num_left_context, - num_right_context) + 0.01 * doc_span.length + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index diff --git a/nemo/collections/nlp/huggingface/bert.py b/nemo/collections/nlp/huggingface/bert.py index 858a0005ad4e..684b6d93048a 100644 --- a/nemo/collections/nlp/huggingface/bert.py +++ b/nemo/collections/nlp/huggingface/bert.py @@ -1,18 +1,11 @@ # Copyright (c) 2019 NVIDIA Corporation -from typing import Optional, List +from typing import List, Optional -from transformers import (BertConfig, - BertModel, - BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) +from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertConfig, BertModel from nemo.backends.pytorch.nm import TrainableNM from nemo.core.neural_modules import PretrainedModelInfo -from nemo.core.neural_types import (AxisType, - BatchTag, - ChannelTag, - NeuralType, - TimeTag) +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag class BERT(TrainableNM): @@ -55,18 +48,9 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "token_type_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "attention_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -80,25 +64,22 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } - - def __init__(self, *, - pretrained_model_name=None, - config_filename=None, - vocab_size=None, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - max_position_embeddings=512, - **kwargs): + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + + def __init__( + self, + *, + pretrained_model_name=None, + config_filename=None, + vocab_size=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + max_position_embeddings=512, + **kwargs + ): TrainableNM.__init__(self, **kwargs) # Check that only one of pretrained_model_name, 
config_filename, and @@ -112,9 +93,11 @@ def __init__(self, *, total += 1 if total != 1: - raise ValueError("Only one of pretrained_model_name, vocab_size, " - + "or config_filename should be passed into the " - + "BERT constructor.") + raise ValueError( + "Only one of pretrained_model_name, vocab_size, " + + "or config_filename should be passed into the " + + "BERT constructor." + ) if vocab_size is not None: config = BertConfig( @@ -125,7 +108,8 @@ def __init__(self, *, num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings) + max_position_embeddings=max_position_embeddings, + ) model = BertModel(config) elif pretrained_model_name is not None: model = BertModel.from_pretrained(pretrained_model_name) @@ -133,8 +117,9 @@ def __init__(self, *, config = BertConfig.from_json_file(config_filename) model = BertModel(config) else: - raise ValueError("Either pretrained_model_name or vocab_size must" - + " be passed into the BERT constructor") + raise ValueError( + "Either pretrained_model_name or vocab_size must" + " be passed into the BERT constructor" + ) model.to(self._device) @@ -151,10 +136,10 @@ def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]: pretrained_model_name=key, description="weights by HuggingFace", parameters=BERT_PRETRAINED_CONFIG_ARCHIVE_MAP[key], - location=value) + location=value, + ) pretrained_models.append(model_info) return pretrained_models def forward(self, input_ids, token_type_ids, attention_mask): - return self.bert(input_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask)[0] + return self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,)[0] diff --git a/nemo/collections/nlp/modules/classifiers.py b/nemo/collections/nlp/modules/classifiers.py index 3a6a276496de..ea606f4eb262 100644 --- a/nemo/collections/nlp/modules/classifiers.py +++ b/nemo/collections/nlp/modules/classifiers.py @@ -1,19 +1,20 @@ -__all__ = ['TokenClassifier', - 'BertTokenClassifier', - 'SequenceClassifier', - 'JointIntentSlotClassifier', - 'SequenceRegression'] +__all__ = [ + 'TokenClassifier', + 'BertTokenClassifier', + 'SequenceClassifier', + 'JointIntentSlotClassifier', + 'SequenceRegression', +] import torch.nn as nn from nemo.backends.pytorch.common import MultiLayerPerceptron -from nemo.backends.pytorch.nm import TrainableNM, LossNM -from nemo.core.neural_types import * +from nemo.backends.pytorch.nm import LossNM, TrainableNM from nemo.collections.nlp.transformer.utils import gelu +from nemo.core.neural_types import * from ..transformer.utils import transformer_weights_init - ACT2FN = {"gelu": gelu, "relu": nn.functional.relu} @@ -43,13 +44,7 @@ def input_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} @property def output_ports(self): @@ -62,37 +57,29 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } - - def __init__(self, - hidden_size, - num_classes, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True): + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + + def __init__( + self, + 
hidden_size, + num_classes, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): super().__init__() if activation not in ACT2FN: raise ValueError(f'activation "{activation}" not found') self.dense = nn.Linear(hidden_size, hidden_size) self.act = ACT2FN[activation] self.norm = nn.LayerNorm(hidden_size, eps=1e-12) - self.mlp = MultiLayerPerceptron(hidden_size, - num_classes, - self._device, - num_layers=1, - activation=activation, - log_softmax=log_softmax) + self.mlp = MultiLayerPerceptron( + hidden_size, num_classes, self._device, num_layers=1, activation=activation, log_softmax=log_softmax, + ) self.dropout = nn.Dropout(dropout) if use_transformer_pretrained: - self.apply( - lambda module: transformer_weights_init(module, xavier=False)) + self.apply(lambda module: transformer_weights_init(module, xavier=False)) self.to(self._device) def forward(self, hidden_states): @@ -130,13 +117,7 @@ def input_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} @property def output_ports(self): @@ -149,36 +130,26 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } - - def __init__(self, - hidden_size, - num_classes, - name=None, - num_layers=2, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True): + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + + def __init__( + self, + hidden_size, + num_classes, + name=None, + num_layers=2, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): super().__init__() self.name = name - self.mlp = MultiLayerPerceptron(hidden_size, - num_classes, - self._device, - num_layers, - activation, - log_softmax) + self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax,) self.dropout = nn.Dropout(dropout) if use_transformer_pretrained: - self.apply( - lambda module: transformer_weights_init(module, xavier=False)) + self.apply(lambda module: transformer_weights_init(module, xavier=False)) # self.to(self._device) # sometimes this is necessary def __str__(self): @@ -220,13 +191,7 @@ def input_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} @property def output_ports(self): @@ -237,32 +202,23 @@ def output_ports(self): 1: AxisType(ChannelTag) """ - return { - "logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag) - }) - } - - def __init__(self, - hidden_size, - num_classes, - num_layers=2, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True): + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + num_layers=2, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): super().__init__() - self.mlp = MultiLayerPerceptron(hidden_size, - num_classes, - self._device, - num_layers, - activation, - log_softmax) + 
self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax,) self.dropout = nn.Dropout(dropout) if use_transformer_pretrained: - self.apply( - lambda module: transformer_weights_init(module, xavier=False)) + self.apply(lambda module: transformer_weights_init(module, xavier=False)) # self.to(self._device) # sometimes this is necessary def forward(self, hidden_states, idx_conditioned_on=0): @@ -295,13 +251,7 @@ def input_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} @property def output_ports(self): @@ -320,41 +270,33 @@ def output_ports(self): 2: AxisType(ChannelTag) """ return { - "intent_logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag) - }), - "slot_logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) + "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), } - def __init__(self, - hidden_size, - num_intents, - num_slots, - dropout=0.0, - use_transformer_pretrained=True, - **kwargs): + def __init__( + self, hidden_size, num_intents, num_slots, dropout=0.0, use_transformer_pretrained=True, **kwargs, + ): super().__init__(**kwargs) self.dropout = nn.Dropout(dropout) - self.slot_mlp = MultiLayerPerceptron(hidden_size, - num_classes=num_slots, - device=self._device, - num_layers=2, - activation='relu', - log_softmax=False) - self.intent_mlp = MultiLayerPerceptron(hidden_size, - num_classes=num_intents, - device=self._device, - num_layers=2, - activation='relu', - log_softmax=False) + self.slot_mlp = MultiLayerPerceptron( + hidden_size, + num_classes=num_slots, + device=self._device, + num_layers=2, + activation='relu', + log_softmax=False, + ) + self.intent_mlp = MultiLayerPerceptron( + hidden_size, + num_classes=num_intents, + device=self._device, + num_layers=2, + activation='relu', + log_softmax=False, + ) if use_transformer_pretrained: - self.apply( - lambda module: transformer_weights_init(module, xavier=False)) + self.apply(lambda module: transformer_weights_init(module, xavier=False)) # self.to(self._device) def forward(self, hidden_states): @@ -388,13 +330,7 @@ def input_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} @property def output_ports(self): @@ -404,28 +340,24 @@ def output_ports(self): 0: AxisType(RegressionTag) """ return { - "preds": NeuralType({ - 0: AxisType(RegressionTag) - }), + "preds": NeuralType({0: AxisType(RegressionTag)}), } - def __init__(self, - hidden_size, - num_layers=2, - activation='relu', - dropout=0.0, - use_transformer_pretrained=True): + def __init__( + self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True, + ): super().__init__() - self.mlp = MultiLayerPerceptron(hidden_size, - num_classes=1, - device=self._device, - num_layers=num_layers, - activation=activation, - log_softmax=False) + self.mlp = MultiLayerPerceptron( + hidden_size, + num_classes=1, + device=self._device, + num_layers=num_layers, + 
activation=activation, + log_softmax=False, + ) self.dropout = nn.Dropout(dropout) if use_transformer_pretrained: - self.apply( - lambda module: transformer_weights_init(module, xavier=False)) + self.apply(lambda module: transformer_weights_init(module, xavier=False)) # self.to(self._device) # sometimes this is necessary def forward(self, hidden_states, idx_conditioned_on=0): diff --git a/nemo/collections/nlp/modules/losses.py b/nemo/collections/nlp/modules/losses.py index f96620b24f39..db5977440ca3 100644 --- a/nemo/collections/nlp/modules/losses.py +++ b/nemo/collections/nlp/modules/losses.py @@ -1,19 +1,20 @@ import torch from torch import nn - from nemo.backends.pytorch.nm import LossNM from nemo.core.neural_types import * -from .pytorch_utils import SmoothedCrossEntropyLoss -from ..utils.nlp_utils import mask_padded_tokens +from ..utils.nlp_utils import mask_padded_tokens +from .pytorch_utils import SmoothedCrossEntropyLoss -__all__ = ['JointIntentSlotLoss', - 'LossAggregatorNM', - 'MaskedLanguageModelingLossNM', - 'PaddedSmoothedCrossEntropyLossNM', - 'QuestionAnsweringLoss', - 'TokenClassificationLoss'] +__all__ = [ + 'JointIntentSlotLoss', + 'LossAggregatorNM', + 'MaskedLanguageModelingLossNM', + 'PaddedSmoothedCrossEntropyLossNM', + 'QuestionAnsweringLoss', + 'TokenClassificationLoss', +] class QuestionAnsweringLoss(LossNM): @@ -47,17 +48,9 @@ def input_ports(self): 0: AxisType(BatchTag) """ return { - "logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "start_positions": NeuralType({ - 0: AxisType(BatchTag) - }), - "end_positions": NeuralType({ - 0: AxisType(BatchTag) - }) + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "start_positions": NeuralType({0: AxisType(BatchTag)}), + "end_positions": NeuralType({0: AxisType(BatchTag)}), } @property @@ -79,16 +72,8 @@ def output_ports(self): """ return { "loss": NeuralType(None), - "start_logits": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "end_logits": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } def __init__(self, **kwargs): @@ -147,22 +132,9 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "logits": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "output_ids": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "output_mask": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -172,9 +144,7 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, label_smoothing=0.0, **kwargs): LossNM.__init__(self, **kwargs) @@ -211,9 +181,7 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, *, num_inputs=2, **kwargs): # Store number of inputs/losses. 
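PaddedSmoothedCrossEntropyLossNM (further down in this file) and MaskedLanguageModelingLossNM both expose a label_smoothing knob that is handed to SmoothedCrossEntropyLoss in pytorch_utils.py, where the loss is computed as (1 - s') * logp_gold + s' * mean(logp) with s' = vocab_size * label_smoothing / (vocab_size - 1). That closed form equals the expected log-likelihood under a smoothed target distribution; a small numeric sketch with an invented four-token vocabulary:

    import torch

    vocab_size, smoothing, gold = 4, 0.1, 2
    # Smoothed target: the gold id keeps 1 - smoothing, the rest share the remainder.
    target = torch.full((vocab_size,), smoothing / (vocab_size - 1))
    target[gold] = 1.0 - smoothing
    log_probs = torch.log_softmax(torch.randn(vocab_size), dim=-1)
    s_prime = vocab_size * smoothing / (vocab_size - 1)
    closed_form = (1 - s_prime) * log_probs[gold] + s_prime * log_probs.mean()
    # The dot product with the smoothed target matches the closed form above.
    assert torch.isclose((target * log_probs).sum(), closed_form)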
@@ -263,19 +231,9 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "labels": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -285,9 +243,7 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, num_classes, class_weights=None, **kwargs): LossNM.__init__(self, **kwargs) @@ -358,27 +314,11 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "intent_logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag) - }), - "slot_logits": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "loss_mask": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "intents": NeuralType({ - 0: AxisType(BatchTag), - }), - "slots": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), + "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "intents": NeuralType({0: AxisType(BatchTag),}), + "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -388,16 +328,16 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } - - def __init__(self, - num_slots, - slot_classes_loss_weights=None, - intent_classes_loss_weights=None, - intent_loss_weight=0.6, - **kwargs): + return {"loss": NeuralType(None)} + + def __init__( + self, + num_slots, + slot_classes_loss_weights=None, + intent_classes_loss_weights=None, + intent_loss_weight=0.6, + **kwargs + ): LossNM.__init__(self, **kwargs) self.num_slots = num_slots self.intent_loss_weight = intent_loss_weight @@ -406,24 +346,15 @@ def __init__(self, # For weighted loss to tackle class imbalance if slot_classes_loss_weights: - self.slot_classes_loss_weights = torch.FloatTensor( - slot_classes_loss_weights).to(self._device) + self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) if intent_classes_loss_weights: - self.intent_classes_loss_weights = torch.FloatTensor( - intent_classes_loss_weights).to(self._device) - - self._criterion_intent = nn.CrossEntropyLoss( - weight=self.intent_classes_loss_weights) - self._criterion_slot = nn.CrossEntropyLoss( - weight=self.slot_classes_loss_weights) - - def _loss_function(self, - intent_logits, - slot_logits, - loss_mask, - intents, - slots): + self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) + + self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) + self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) + + def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): intent_loss = self._criterion_intent(intent_logits, intents) active_loss = loss_mask.view(-1) > 0.5 @@ -435,8 +366,7 @@ def _loss_function(self, slot_loss = 0.0 else: slot_loss = 
self._criterion_slot(active_logits, active_labels) - loss = intent_loss * self.intent_loss_weight + \ - slot_loss * (1 - self.intent_loss_weight) + loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) return loss @@ -471,17 +401,8 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "logits": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "target_ids": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -491,22 +412,19 @@ def output_ports(self): loss: NeuralType(None) """ - return { - "loss": NeuralType(None) - } + return {"loss": NeuralType(None)} def __init__(self, **kwargs): LossNM.__init__(self, **kwargs) loss_params = { "label_smoothing": self.local_parameters.get("label_smoothing", 0), - "predict_last_k": self.local_parameters.get("predict_last_k", 0) + "predict_last_k": self.local_parameters.get("predict_last_k", 0), } self._loss_fn = SmoothedCrossEntropyLoss(**loss_params) self._pad_id = self.local_parameters['pad_id'] def _loss_function(self, logits, target_ids): - target_mask = mask_padded_tokens( - target_ids, self._pad_id).to(logits.dtype) + target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) loss = self._loss_fn(logits, target_ids, target_mask) return loss diff --git a/nemo/collections/nlp/modules/pytorch_utils.py b/nemo/collections/nlp/modules/pytorch_utils.py index a6ccbbe2f352..58af90a6b595 100644 --- a/nemo/collections/nlp/modules/pytorch_utils.py +++ b/nemo/collections/nlp/modules/pytorch_utils.py @@ -34,13 +34,11 @@ def forward(self, logits, output_ids, output_mask, eps=1e-6): """ batch_size, seq_len, vocab_size = logits.size() smoothing = vocab_size * self._smoothing / (vocab_size - 1) - target_logits = logits.gather( - 2, output_ids.unsqueeze(2)).squeeze(2) + target_logits = logits.gather(2, output_ids.unsqueeze(2)).squeeze(2) smoothing_logits = logits.mean(dim=-1) - neg_log_likelihood = (1.0 - smoothing) * target_logits + \ - smoothing * smoothing_logits - neg_log_likelihood = neg_log_likelihood[:, -self._predict_last_k:] - output_mask = output_mask[:, -self._predict_last_k:] + neg_log_likelihood = (1.0 - smoothing) * target_logits + smoothing * smoothing_logits + neg_log_likelihood = neg_log_likelihood[:, -self._predict_last_k :] + output_mask = output_mask[:, -self._predict_last_k :] neg_log_likelihood = -torch.sum(neg_log_likelihood * output_mask) neg_log_likelihood = neg_log_likelihood / (output_mask.sum() + eps) return neg_log_likelihood diff --git a/nemo/collections/nlp/modules/transformer_nm.py b/nemo/collections/nlp/modules/transformer_nm.py index d2d4b72ae4b5..2d78121ea5c0 100644 --- a/nemo/collections/nlp/modules/transformer_nm.py +++ b/nemo/collections/nlp/modules/transformer_nm.py @@ -2,21 +2,25 @@ """ This package contains Transformer for translation Neural Module """ -__all__ = ['TransformerEncoderNM', - 'TransformerDecoderNM', - 'GreedyLanguageGeneratorNM', - 'BeamSearchTranslatorNM'] +__all__ = [ + 'TransformerEncoderNM', + 'TransformerDecoderNM', + 'GreedyLanguageGeneratorNM', + 'BeamSearchTranslatorNM', +] import math -from nemo.backends.pytorch.nm import TrainableNM, LossNM +from nemo.backends.pytorch.nm import LossNM, TrainableNM from nemo.core.neural_types import * -from ..transformer import (TransformerEmbedding, - 
TransformerEncoder, - TransformerDecoder, - GreedySequenceGenerator, - BeamSearchSequenceGenerator) +from ..transformer import ( + BeamSearchSequenceGenerator, + GreedySequenceGenerator, + TransformerDecoder, + TransformerEmbedding, + TransformerEncoder, +) from ..transformer.utils import transformer_weights_init @@ -60,14 +64,8 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask_src": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -81,29 +79,25 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } - - def __init__(self, - vocab_size, - d_model, - d_inner, - max_seq_length, - num_layers, - num_attn_heads, - ffn_dropout=0.0, - embedding_dropout=0.0, - attn_score_dropout=0.0, - attn_layer_dropout=0.0, - learn_positional_encodings=False, - hidden_act='relu', - mask_future=False, - **kwargs): + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + + def __init__( + self, + vocab_size, + d_model, + d_inner, + max_seq_length, + num_layers, + num_attn_heads, + ffn_dropout=0.0, + embedding_dropout=0.0, + attn_score_dropout=0.0, + attn_layer_dropout=0.0, + learn_positional_encodings=False, + hidden_act='relu', + mask_future=False, + **kwargs + ): TrainableNM.__init__(self, **kwargs) self.embedding_layer = TransformerEmbedding( @@ -111,7 +105,8 @@ def __init__(self, hidden_size=d_model, max_sequence_length=max_seq_length, embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings) + learn_positional_encodings=learn_positional_encodings, + ) self.encoder = TransformerEncoder( num_layers=num_layers, hidden_size=d_model, @@ -121,11 +116,11 @@ def __init__(self, ffn_dropout=ffn_dropout, hidden_act=hidden_act, attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout) + attn_layer_dropout=attn_layer_dropout, + ) std_init_range = 1 / math.sqrt(d_model) - self.apply(lambda module: transformer_weights_init(module, - std_init_range)) + self.apply(lambda module: transformer_weights_init(module, std_init_range)) self.to(self._device) def forward(self, input_ids, input_mask_src): @@ -184,23 +179,10 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "input_ids_tgt": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "hidden_states_src": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "input_mask_src": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }), - "input_mask_tgt": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "input_ids_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -214,28 +196,24 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return { - "hidden_states": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }) - } - - def 
__init__(self, - vocab_size, - d_model, - d_inner, - num_layers, - max_seq_length, - num_attn_heads, - ffn_dropout=0.0, - embedding_dropout=0.0, - attn_score_dropout=0.0, - attn_layer_dropout=0.0, - learn_positional_encodings=False, - hidden_act='relu', - **kwargs): + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + + def __init__( + self, + vocab_size, + d_model, + d_inner, + num_layers, + max_seq_length, + num_attn_heads, + ffn_dropout=0.0, + embedding_dropout=0.0, + attn_score_dropout=0.0, + attn_layer_dropout=0.0, + learn_positional_encodings=False, + hidden_act='relu', + **kwargs + ): TrainableNM.__init__(self, **kwargs) self.embedding_layer = TransformerEmbedding( @@ -243,7 +221,7 @@ def __init__(self, hidden_size=d_model, max_sequence_length=max_seq_length, embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings + learn_positional_encodings=learn_positional_encodings, ) self.decoder = TransformerDecoder( num_layers=num_layers, @@ -253,24 +231,16 @@ def __init__(self, ffn_dropout=ffn_dropout, hidden_act=hidden_act, attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout + attn_layer_dropout=attn_layer_dropout, ) std_init_range = 1 / math.sqrt(d_model) - self.apply(lambda module: transformer_weights_init(module, - std_init_range)) + self.apply(lambda module: transformer_weights_init(module, std_init_range)) self.to(self._device) - def forward(self, - input_ids_tgt, - hidden_states_src, - input_mask_src, - input_mask_tgt): + def forward(self, input_ids_tgt, hidden_states_src, input_mask_src, input_mask_tgt): hidden_states_tgt = self.embedding_layer(input_ids_tgt) - hidden_states = self.decoder(hidden_states_tgt, - input_mask_tgt, - hidden_states_src, - input_mask_src) + hidden_states = self.decoder(hidden_states_tgt, input_mask_tgt, hidden_states_src, input_mask_src,) return hidden_states @@ -298,12 +268,7 @@ def input_ports(self): 1: AxisType(TimeTag) """ - return { - "input_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) - } + return {"input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} @property def output_ports(self): @@ -314,22 +279,9 @@ def output_ports(self): 1: AxisType(TimeTag) """ - return { - "output_ids": NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) - } + return {"output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} - def __init__(self, - decoder, - log_softmax, - max_seq_length, - pad_token, - bos_token, - eos_token, - batch_size=1, - **kwargs): + def __init__(self, decoder, log_softmax, max_seq_length, pad_token, bos_token, eos_token, batch_size=1, **kwargs): TrainableNM.__init__(self, **kwargs) self.generator = GreedySequenceGenerator( @@ -340,7 +292,7 @@ def __init__(self, pad=pad_token, bos=bos_token, eos=eos_token, - batch_size=batch_size + batch_size=batch_size, ) @property @@ -388,17 +340,8 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "hidden_states_src": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag), - 2: AxisType(ChannelTag) - }), - "input_mask_src": - NeuralType({ - 0: AxisType(BatchTag), - 1: AxisType(TimeTag) - }) + "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @property @@ -410,29 +353,26 @@ def output_ports(self): 1: AxisType(TimeTag) """ - return { - "output_ids": NeuralType({ - 0: 
AxisType(BatchTag), - 1: AxisType(TimeTag) - }) - } + return {"output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} @property def num_weights(self): return 0 - def __init__(self, - decoder, - log_softmax, - max_seq_length, - pad_token, - bos_token, - eos_token, - batch_size=1, - beam_size=4, - max_delta_length=50, - length_penalty=0, - **kwargs): + def __init__( + self, + decoder, + log_softmax, + max_seq_length, + pad_token, + bos_token, + eos_token, + batch_size=1, + beam_size=4, + max_delta_length=50, + length_penalty=0, + **kwargs + ): TrainableNM.__init__(self, **kwargs) self.generator = BeamSearchSequenceGenerator( @@ -446,10 +386,9 @@ def __init__(self, eos=eos_token, batch_size=batch_size, beam_size=beam_size, - len_pen=length_penalty + len_pen=length_penalty, ) def forward(self, hidden_states_src, input_mask_src): - output_ids = self.generator(encoder_hidden_states=hidden_states_src, - encoder_input_mask=input_mask_src) + output_ids = self.generator(encoder_hidden_states=hidden_states_src, encoder_input_mask=input_mask_src,) return output_ids diff --git a/nemo/collections/nlp/transformer/decoders.py b/nemo/collections/nlp/transformer/decoders.py index e9191b5e8fa2..ccd1b26d2f38 100644 --- a/nemo/collections/nlp/transformer/decoders.py +++ b/nemo/collections/nlp/transformer/decoders.py @@ -25,49 +25,52 @@ class TransformerDecoderBlock(nn.Module): hidden_act: activation function used between two linear layers in FFN """ - def __init__(self, hidden_size, inner_size, num_attention_heads=1, - attn_score_dropout=0, attn_layer_dropout=0, ffn_dropout=0, - hidden_act="relu"): + def __init__( + self, + hidden_size, + inner_size, + num_attention_heads=1, + attn_score_dropout=0, + attn_layer_dropout=0, + ffn_dropout=0, + hidden_act="relu", + ): super().__init__() self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, - attn_score_dropout, attn_layer_dropout) + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + ) self.second_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, - attn_score_dropout, attn_layer_dropout) - self.third_sub_layer = PositionWiseFF( - hidden_size, inner_size, ffn_dropout, hidden_act) - - def forward(self, decoder_query, decoder_mask, decoder_keys, - encoder_states, encoder_mask): - self_attn_output = self.first_sub_layer( - decoder_query, decoder_keys, decoder_keys, decoder_mask) - enc_dec_attn_output = self.second_sub_layer( - self_attn_output, encoder_states, encoder_states, encoder_mask) + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + ) + self.third_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) + + def forward( + self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask, + ): + self_attn_output = self.first_sub_layer(decoder_query, decoder_keys, decoder_keys, decoder_mask) + enc_dec_attn_output = self.second_sub_layer(self_attn_output, encoder_states, encoder_states, encoder_mask) output_states = self.third_sub_layer(enc_dec_attn_output) return output_states class TransformerDecoder(nn.Module): - def __init__(self, num_layers, hidden_size, **kwargs): super().__init__() layer = TransformerDecoderBlock(hidden_size, **kwargs) - self.layers = nn.ModuleList( - [copy.deepcopy(layer) for _ in range(num_layers)]) + self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)]) def _get_memory_states(self, decoder_states, decoder_mems_list=None, i=0): if decoder_mems_list is not None: - 
memory_states = torch.cat( - (decoder_mems_list[i], decoder_states), dim=1) + memory_states = torch.cat((decoder_mems_list[i], decoder_states), dim=1) else: memory_states = decoder_states return memory_states - def forward(self, decoder_states, decoder_mask, encoder_states, - encoder_mask, decoder_mems_list=None, return_mems=False): + def forward( + self, decoder_states, decoder_mask, encoder_states, encoder_mask, decoder_mems_list=None, return_mems=False, + ): """ Args: decoder_states: output of the embedding layer (B x L_dec x H) @@ -84,16 +87,14 @@ def forward(self, decoder_states, decoder_mask, encoder_states, decoder_attn_mask = form_attention_mask(decoder_mask, diagonal=0) encoder_attn_mask = form_attention_mask(encoder_mask) - memory_states = self._get_memory_states( - decoder_states, decoder_mems_list, 0) + memory_states = self._get_memory_states(decoder_states, decoder_mems_list, 0) cached_mems_list = [memory_states] for i, layer in enumerate(self.layers): decoder_states = layer( - decoder_states, decoder_attn_mask, memory_states, - encoder_states, encoder_attn_mask) - memory_states = self._get_memory_states(decoder_states, - decoder_mems_list, i + 1) + decoder_states, decoder_attn_mask, memory_states, encoder_states, encoder_attn_mask, + ) + memory_states = self._get_memory_states(decoder_states, decoder_mems_list, i + 1) cached_mems_list.append(memory_states) if return_mems: diff --git a/nemo/collections/nlp/transformer/encoders.py b/nemo/collections/nlp/transformer/encoders.py index c57e1f87ea0c..1eb63eb55124 100644 --- a/nemo/collections/nlp/transformer/encoders.py +++ b/nemo/collections/nlp/transformer/encoders.py @@ -1,7 +1,9 @@ -__all__ = ['TransformerEncoderBlock', - 'TransformerEncoder', - 'XLNetEncoderBlock', - 'XLNetEncoder'] +__all__ = [ + 'TransformerEncoderBlock', + 'TransformerEncoder', + 'XLNetEncoderBlock', + 'XLNetEncoder', +] import copy @@ -28,20 +30,25 @@ class TransformerEncoderBlock(nn.Module): hidden_act: activation function used between two linear layers in FFN """ - def __init__(self, hidden_size, inner_size, num_attention_heads=1, - attn_score_dropout=0, attn_layer_dropout=0, ffn_dropout=0, - hidden_act="relu"): + def __init__( + self, + hidden_size, + inner_size, + num_attention_heads=1, + attn_score_dropout=0, + attn_layer_dropout=0, + ffn_dropout=0, + hidden_act="relu", + ): super().__init__() self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, - attn_score_dropout, attn_layer_dropout) - self.second_sub_layer = PositionWiseFF( - hidden_size, inner_size, ffn_dropout, hidden_act) + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + ) + self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) def forward(self, encoder_query, encoder_mask, encoder_keys): - self_attn_output = self.first_sub_layer( - encoder_query, encoder_keys, encoder_keys, encoder_mask) + self_attn_output = self.first_sub_layer(encoder_query, encoder_keys, encoder_keys, encoder_mask) output_states = self.second_sub_layer(self_attn_output) return output_states @@ -51,20 +58,19 @@ def __init__(self, num_layers, hidden_size, mask_future=False, **kwargs): super().__init__() layer = TransformerEncoderBlock(hidden_size, **kwargs) - self.layers = nn.ModuleList( - [copy.deepcopy(layer) for _ in range(num_layers)]) + self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)]) self.diag = 0 if mask_future else None def _get_memory_states(self, encoder_states, encoder_mems_list=None, i=0): if 
encoder_mems_list is not None: - memory_states = torch.cat( - (encoder_mems_list[i], encoder_states), dim=1) + memory_states = torch.cat((encoder_mems_list[i], encoder_states), dim=1) else: memory_states = encoder_states return memory_states - def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, - return_mems=False): + def forward( + self, encoder_states, encoder_mask, encoder_mems_list=None, return_mems=False, + ): """ Args: encoder_states: output of the embedding_layer (B x L_enc x H) @@ -78,15 +84,12 @@ def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, encoder_attn_mask = form_attention_mask(encoder_mask, self.diag) - memory_states = self._get_memory_states( - encoder_states, encoder_mems_list, 0) + memory_states = self._get_memory_states(encoder_states, encoder_mems_list, 0) cached_mems_list = [memory_states] for i, layer in enumerate(self.layers): - encoder_states = layer( - encoder_states, encoder_attn_mask, memory_states) - memory_states = self._get_memory_states( - encoder_states, encoder_mems_list, i + 1) + encoder_states = layer(encoder_states, encoder_attn_mask, memory_states) + memory_states = self._get_memory_states(encoder_states, encoder_mems_list, i + 1) cached_mems_list.append(memory_states) if return_mems: @@ -96,40 +99,41 @@ def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, class XLNetEncoderBlock(nn.Module): - - def __init__(self, hidden_size, inner_size, num_attention_heads=1, - attn_score_dropout=0, attn_layer_dropout=0, ffn_dropout=0, - hidden_act="relu"): + def __init__( + self, + hidden_size, + inner_size, + num_attention_heads=1, + attn_score_dropout=0, + attn_layer_dropout=0, + ffn_dropout=0, + hidden_act="relu", + ): super().__init__() self.first_sub_layer = TwoStreamSelfAttention( - hidden_size, num_attention_heads, - attn_score_dropout, attn_layer_dropout) - self.second_sub_layer = PositionWiseFF( - hidden_size, inner_size, ffn_dropout, hidden_act) + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + ) + self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) - def forward(self, query_states, content_states, query_attn_mask, - content_attn_mask): + def forward(self, query_states, content_states, query_attn_mask, content_attn_mask): output_query_states, output_content_states = self.first_sub_layer( - query_states, content_states, query_attn_mask, content_attn_mask) + query_states, content_states, query_attn_mask, content_attn_mask + ) output_content_states = self.second_sub_layer(output_content_states) return output_query_states, output_content_states class XLNetEncoder(nn.Module): - def __init__(self, num_layers, hidden_size, **kwargs): super().__init__() layer = XLNetEncoderBlock(hidden_size, **kwargs) - self.layers = nn.ModuleList( - [copy.deepcopy(layer) for _ in range(num_layers)]) + self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)]) def forward(self, query_states, content_states, input_mask): query_attn_mask = form_attention_mask(input_mask, diagonal=-1) content_attn_mask = form_attention_mask(input_mask, diagonal=0) for layer in self.layers: - query_states, content_states = layer( - query_states, content_states, - query_attn_mask, content_attn_mask) + query_states, content_states = layer(query_states, content_states, query_attn_mask, content_attn_mask,) return query_states, content_states diff --git a/nemo/collections/nlp/transformer/generators.py b/nemo/collections/nlp/transformer/generators.py index 
2dbafe87870e..d1a4efbbd2c9 100644 --- a/nemo/collections/nlp/transformer/generators.py +++ b/nemo/collections/nlp/transformer/generators.py @@ -1,25 +1,29 @@ -__all__ = ['GreedySequenceGenerator', - 'TopKSequenceGenerator', - 'BeamSearchSequenceGenerator'] +__all__ = [ + 'GreedySequenceGenerator', + 'TopKSequenceGenerator', + 'BeamSearchSequenceGenerator', +] import torch import torch.nn as nn -from .utils import NEG_INF from ..utils.nlp_utils import mask_padded_tokens +from .utils import NEG_INF class GreedySequenceGenerator(nn.Module): - def __init__(self, - embedding, - decoder, - log_softmax, - pad=0, - bos=1, - eos=2, - max_sequence_length=512, - max_delta_length=20, - batch_size=1): + def __init__( + self, + embedding, + decoder, + log_softmax, + pad=0, + bos=1, + eos=2, + max_sequence_length=512, + max_delta_length=20, + batch_size=1, + ): """ Greedy sequence generator based on the decoder followed by log_softmax. @@ -50,12 +54,14 @@ def __init__(self, self.device = next(self.decoder.parameters()).device @torch.no_grad() - def _forward(self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0): + def _forward( + self, + decoder_input_ids=None, + encoder_hidden_states=None, + encoder_input_mask=None, + decoder_mems_list=None, + pos=0, + ): """ One step of autoregressive output generation. @@ -71,10 +77,8 @@ def _forward(self, pos: starting position in positional encoding """ - decoder_hidden_states = self.embedding.forward( - decoder_input_ids, start_pos=pos) - decoder_input_mask = mask_padded_tokens( - decoder_input_ids, self.pad).float() + decoder_hidden_states = self.embedding.forward(decoder_input_ids, start_pos=pos) + decoder_input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() # TODO: make sure float() work with mixed precision if encoder_hidden_states is not None: @@ -84,19 +88,16 @@ def _forward(self, encoder_hidden_states, encoder_input_mask, decoder_mems_list, - return_mems=True) + return_mems=True, + ) else: decoder_mems_list = self.decoder.forward( - decoder_hidden_states, - decoder_input_mask, - decoder_mems_list, - return_mems=True) + decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True, + ) log_probs = self.log_softmax.forward(decoder_mems_list[-1]) return log_probs, decoder_mems_list - def _prepare_for_search(self, - decoder_input_ids=None, - encoder_hidden_states=None): + def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None): """ Helper function which defines starting sequence to begin generating with and maximum allowed number of tokens to be generated. 
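
The hunks in this file only re-wrap the greedy search loop; the control flow is easier to see outside the diff. A minimal runnable sketch of that loop, assuming a hypothetical step_log_probs scorer in place of the real embedding/decoder/log_softmax stack and toy PAD/BOS/EOS ids (none of these names are in the diff):

import torch

PAD, BOS, EOS, VOCAB = 0, 1, 2, 8  # toy ids; the real module takes pad/bos/eos as arguments

def step_log_probs(tgt):
    # Hypothetical stand-in for embedding -> decoder -> log_softmax:
    # returns log-probs over the vocabulary for the last position.
    return torch.log_softmax(torch.randn(tgt.size(0), VOCAB), dim=-1)

def greedy_generate(batch_size=2, max_generation_length=5):
    tgt = torch.full((batch_size, 1), BOS, dtype=torch.long)
    # pad_profile flips to 1 once a sequence has produced <eos>;
    # from then on every new token is overwritten with <pad>.
    pad_profile = torch.zeros(batch_size, 1, dtype=torch.long)
    for _ in range(max_generation_length):
        next_tokens = step_log_probs(tgt).argmax(dim=-1, keepdim=True)
        next_tokens = PAD * pad_profile + next_tokens * (1 - pad_profile)
        pad_profile = torch.max(pad_profile, (next_tokens == EOS).long())
        tgt = torch.cat((tgt, next_tokens), dim=-1)
        if pad_profile.sum() == batch_size:  # all sequences hit <eos>
            break
    return tgt

print(greedy_generate())

In the module itself the same bookkeeping feeds only the last token (tgt[:, -1:]) back in and reuses decoder_mems_list, so each step is incremental rather than re-running the decoder over the whole prefix.
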
@@ -108,8 +109,7 @@ def _prepare_for_search(self, # is min(max_sequence_length, src_len + max_delta_length) if encoder_hidden_states is not None: batch_size, src_len, _ = encoder_hidden_states.size() - max_seq_length = min( - self.max_seq_length, src_len + self.max_delta_len) + max_seq_length = min(self.max_seq_length, src_len + self.max_delta_len) else: max_seq_length = self.max_seq_length @@ -118,20 +118,17 @@ def _prepare_for_search(self, tgt = decoder_input_ids batch_size, tgt_len = decoder_input_ids.size() else: - tgt = torch.zeros( - batch_size, 1).long().fill_(self.bos).to(self.device) + tgt = torch.zeros(batch_size, 1).long().fill_(self.bos).to(self.device) tgt_len = 1 max_generation_length = max_seq_length - tgt_len return tgt, batch_size, max_generation_length - def forward(self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None): + def forward( + self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, + ): - tgt, batch_size, max_generation_length = self._prepare_for_search( - decoder_input_ids, encoder_hidden_states) + tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) # pad profile tracks sequences ending with <eos> token to replace # everything after <eos> with <pad> token @@ -141,14 +138,12 @@ def forward(self, for i in range(max_generation_length): log_probs, decoder_mems_list = self._forward( - tgt[:, -1:], encoder_hidden_states, encoder_input_mask, - decoder_mems_list, i) + tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i, + ) next_tokens = torch.argmax(log_probs[:, -1], dim=-1, keepdim=True) - next_tokens = self.pad * pad_profile + \ - next_tokens * (1 - pad_profile) - pad_profile = torch.max( - pad_profile, (next_tokens == self.eos).long()) + next_tokens = self.pad * pad_profile + next_tokens * (1 - pad_profile) + pad_profile = torch.max(pad_profile, (next_tokens == self.eos).long()) tgt = torch.cat((tgt, next_tokens), dim=-1) # abort generation if all sequences end with <eos> @@ -159,13 +154,7 @@ class TopKSequenceGenerator(GreedySequenceGenerator): - def __init__(self, - embedding, - decoder, - log_softmax, - beam_size=1, - temperature=1.0, - **kwargs): + def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): """ Top-k sequence generator based on the decoder followed by log_softmax. @@ -184,22 +173,23 @@ def __init__(self, self.temp = temperature @torch.no_grad() - def _forward(self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0): + def _forward( + self, + decoder_input_ids=None, + encoder_hidden_states=None, + encoder_input_mask=None, + decoder_mems_list=None, + pos=0, + ): log_probs, decoder_mems_list = super()._forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, - decoder_mems_list, pos) + decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos, + ) batch_size, seq_len, vocab_size = log_probs.size() scores, indices = torch.topk(log_probs, self.beam_size, dim=-1) - rescaled_logexp = torch.zeros_like( - log_probs).scatter(-1, indices, scores.div(self.temp).exp()) + rescaled_logexp = torch.zeros_like(log_probs).scatter(-1, indices, scores.div(self.temp).exp()) probs = rescaled_logexp / rescaled_logexp.norm(1, -1, keepdim=True) # We randomly sample next tokens from rescaled probability distribution @@ -207,21 +197,14 @@ def _forward(self, # candidates that have been selected. 
We call this object # `pseudo_log_probs` as genuine log_probs should have -infs instead of # 0s and 0s instead of 1s. - ids = torch.multinomial( - probs.view(-1, vocab_size), 1).view(-1, seq_len, 1) + ids = torch.multinomial(probs.view(-1, vocab_size), 1).view(-1, seq_len, 1) pseudo_log_probs = torch.zeros_like(log_probs).scatter(-1, ids, 1.0) return pseudo_log_probs, decoder_mems_list class BeamSearchSequenceGenerator(GreedySequenceGenerator): - def __init__(self, - embedding, - decoder, - log_softmax, - beam_size=1, - len_pen=0, - **kwargs): + def __init__(self, embedding, decoder, log_softmax, beam_size=1, len_pen=0, **kwargs): """ Beam Search sequence generator based on the decoder followed by log_softmax. @@ -238,35 +221,29 @@ def __init__(self, self.beam_size = beam_size self.len_pen = len_pen - def forward(self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None): + def forward( + self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, + ): - tgt, batch_size, max_generation_length = self._prepare_for_search( - decoder_input_ids, encoder_hidden_states) + tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) # generate initial buffer of beam_size prefixes-hypotheses - log_probs, decoder_mems_list = self._forward( - tgt, encoder_hidden_states, encoder_input_mask, None, 0) - scores, prefixes = torch.topk( - log_probs.permute(0, 2, 1), self.beam_size, dim=1) + log_probs, decoder_mems_list = self._forward(tgt, encoder_hidden_states, encoder_input_mask, None, 0) + scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat( - (tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) + prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = \ - decoder_mems_list[j].repeat(self.beam_size, 1, 1) + decoder_mems_list[j] = decoder_mems_list[j].repeat(self.beam_size, 1, 1) # repeat source sequence beam_size times for beam search if encoder_hidden_states is not None: _, src_length, hidden_size = encoder_hidden_states.size() - encoder_input_mask = encoder_input_mask.repeat( - 1, self.beam_size).view(-1, src_length) - encoder_hidden_states = encoder_hidden_states.repeat( - 1, self.beam_size, 1).view(-1, src_length, hidden_size) + encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, src_length) + encoder_hidden_states = encoder_hidden_states.repeat(1, self.beam_size, 1).view( + -1, src_length, hidden_size + ) else: hidden_size = decoder_mems_list[0].size(2) @@ -285,10 +262,9 @@ def forward(self, # generate and score candidates for prefixes continuation log_probs, decoder_mems_list = self._forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, - decoder_mems_list, i+1) + prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1, + ) - scores_i, prefixes_i = torch.topk( - log_probs[:, -1, :], self.beam_size, dim=-1) + scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) # for all prefixes ending with <eos> or <pad> replace generated # continuations with <pad> @@ -302,31 +278,31 @@ def forward(self, # choose top-k hypotheses with length penalty applied scores = scores / prefixes_len.pow(self.len_pen) - scores, indices_i = torch.topk(scores.view( - -1, 
self.beam_size**2), self.beam_size, dim=1) + scores, indices_i = torch.topk(scores.view(-1, self.beam_size ** 2), self.beam_size, dim=1) scores = scores.view(-1, 1) * prefixes_len.pow(self.len_pen) # select prefixes which correspond to the chosen hypotheses prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size**2, -1) + prefixes = prefixes.view(batch_size, self.beam_size ** 2, -1) p_len = prefixes.size(2) prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len) prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) # reshuffle cached decoder memory states to restore the order # of hypotheses broken after top-k selection - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat( - 1, 1, p_len-1, hidden_size) // self.beam_size + mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = decoder_mems_list[j].view( - -1, self.beam_size, p_len-1, hidden_size).gather( - 1, mems_ids).view(-1, p_len-1, hidden_size) + decoder_mems_list[j] = ( + decoder_mems_list[j] + .view(-1, self.beam_size, p_len - 1, hidden_size) + .gather(1, mems_ids) + .view(-1, p_len - 1, hidden_size) + ) # update prefixes_len and pad_profile not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum( - dim=1, keepdim=True).to(scores.dtype) + prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) pad_profile = (~not_eos_pad[:, -1:]).long() # if all hypotheses end with <eos> or <pad>, interrupt search @@ -335,10 +311,9 @@ def forward(self, # select best performing hypotheses in each element of the batch scores = scores / prefixes_len.pow(self.len_pen) - best_guesses = torch.argmax( - scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat( - 1, prefixes.size(1)).unsqueeze(1) - tgt = prefixes.view( - batch_size, self.beam_size, -1).gather(1, best_guesses) + best_guesses = ( + torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) + ) + tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses) return tgt.squeeze(1) diff --git a/nemo/collections/nlp/transformer/modules.py b/nemo/collections/nlp/transformer/modules.py index 618c5900040c..e958c1951c6c 100644 --- a/nemo/collections/nlp/transformer/modules.py +++ b/nemo/collections/nlp/transformer/modules.py @@ -22,27 +22,29 @@ http://nlp.seas.harvard.edu/2018/04/03/attention.html Copyright by the HuggingFace and Annotated Transformer authors. """ -__all__ = ['FixedPositionalEncoding', - 'TransformerEmbedding', - 'MultiHeadAttention', - 'LightweightConv1d', - 'TwoStreamSelfAttention', - 'PositionWiseFF'] +__all__ = [ + 'FixedPositionalEncoding', + 'TransformerEmbedding', + 'MultiHeadAttention', + 'LightweightConv1d', + 'TwoStreamSelfAttention', + 'PositionWiseFF', +] import math -try: - from apex.normalization import FusedLayerNorm -except (AttributeError, ModuleNotFoundError): - # this is a lie - it isn't fused in this case - print("Unable to import APEX. Mixed precision, distributed training and " - "FusedLayerNorm are not available.") - from torch.nn import LayerNorm as FusedLayerNorm import torch from torch import nn from .utils import gelu +try: + from apex.normalization import FusedLayerNorm +except (AttributeError, ModuleNotFoundError): + # this is a lie - it isn't fused in this case + print("Unable to import APEX. 
Mixed precision, distributed training and " "FusedLayerNorm are not available.") + from torch.nn import LayerNorm as FusedLayerNorm + class FixedPositionalEncoding(nn.Module): """ @@ -84,20 +86,23 @@ class TransformerEmbedding(nn.Module): embedding_dropout: probability of dropout applied to embeddings """ - def __init__(self, vocab_size, hidden_size, max_sequence_length=512, - num_token_types=2, embedding_dropout=0.0, - learn_positional_encodings=False): + def __init__( + self, + vocab_size, + hidden_size, + max_sequence_length=512, + num_token_types=2, + embedding_dropout=0.0, + learn_positional_encodings=False, + ): super().__init__() self.max_sequence_length = max_sequence_length - self.token_embedding = nn.Embedding( - vocab_size, hidden_size, padding_idx=0) + self.token_embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0) if learn_positional_encodings: - self.position_embedding = nn.Embedding( - max_sequence_length, hidden_size) + self.position_embedding = nn.Embedding(max_sequence_length, hidden_size) else: - self.position_embedding = FixedPositionalEncoding( - hidden_size, max_sequence_length) + self.position_embedding = FixedPositionalEncoding(hidden_size, max_sequence_length) self.token_type_embedding = nn.Embedding(num_token_types, hidden_size) self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5) self.dropout = nn.Dropout(embedding_dropout) @@ -105,11 +110,12 @@ def __init__(self, vocab_size, hidden_size, max_sequence_length=512, def forward(self, input_ids, token_type_ids=None, start_pos=0): seq_length = input_ids.size(1) if seq_length > self.max_sequence_length: - raise ValueError("Input sequence is longer than maximum allowed" - " sequence length for positional encoding") + raise ValueError( + "Input sequence is longer than maximum allowed" " sequence length for positional encoding" + ) position_ids = torch.arange( - start=start_pos, end=start_pos+seq_length, - dtype=torch.long, device=input_ids.device) + start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device, + ) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) token_embeddings = self.token_embedding(input_ids) @@ -138,13 +144,15 @@ class MultiHeadAttention(nn.Module): whole layer, but before layer normalization """ - def __init__(self, hidden_size, num_attention_heads, - attn_score_dropout=0.0, attn_layer_dropout=0.0): + def __init__( + self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0, + ): super().__init__() if hidden_size % num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number " - "of attention heads (%d)" % (hidden_size, num_attention_heads)) + "of attention heads (%d)" % (hidden_size, num_attention_heads) + ) self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads self.attn_head_size = int(hidden_size / num_attention_heads) @@ -160,8 +168,7 @@ def __init__(self, hidden_size, num_attention_heads, self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + \ - (self.num_attention_heads, self.attn_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size,) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) @@ -187,7 +194,7 @@ def forward(self, queries, keys, values, attention_mask): context = torch.matmul(attention_probs, value) context = context.permute(0, 2, 1, 3).contiguous() - new_context_shape = context.size()[:-2] + (self.hidden_size, ) + new_context_shape = 
context.size()[:-2] + (self.hidden_size,) context = context.view(*new_context_shape) # output projection @@ -212,13 +219,13 @@ class LightweightConv1d(nn.Module): whole layer, but before layer normalization """ - def __init__(self, hidden_size, num_attention_heads, kernel_size, - conv_weight_dropout=0.0, conv_layer_dropout=0.0): + def __init__( + self, hidden_size, num_attention_heads, kernel_size, conv_weight_dropout=0.0, conv_layer_dropout=0.0, + ): super().__init__() self.num_heads = num_attention_heads self.kernel_size = kernel_size - self.weight = nn.Parameter( - torch.Tensor(num_attention_heads, 1, kernel_size)) + self.weight = nn.Parameter(torch.Tensor(num_attention_heads, 1, kernel_size)) self.in_projection = nn.Linear(hidden_size, hidden_size) self.out_projection = nn.Linear(hidden_size, hidden_size) @@ -238,12 +245,8 @@ def forward(self, hidden_states, attention_mask): pivot = self.kernel_size // 2 + 1 weight[:, :, pivot:] = 0 - output_states = output_states.contiguous().view( - -1, self.num_heads, seq_len) - output_states = torch.conv1d(output_states, - weight, - padding=self.kernel_size // 2, - groups=self.num_heads) + output_states = output_states.contiguous().view(-1, self.num_heads, seq_len) + output_states = torch.conv1d(output_states, weight, padding=self.kernel_size // 2, groups=self.num_heads,) output_states = output_states.view(batch_size, hidden_size, seq_len) output_states = output_states.permute(0, 2, 1) @@ -267,23 +270,24 @@ class TwoStreamSelfAttention(nn.Module): whole layer, but before layer normalization """ - def __init__(self, hidden_size, num_attention_heads, - attn_score_dropout=0.0, attn_layer_dropout=0.0): + def __init__( + self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0, + ): super().__init__() self.query_stream = MultiHeadAttention( - hidden_size, num_attention_heads, - attn_score_dropout, attn_layer_dropout) + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + ) self.content_stream = MultiHeadAttention( - hidden_size, num_attention_heads, - attn_score_dropout, attn_layer_dropout) + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + ) - def forward(self, query_states, content_states, - query_attention_mask, content_attention_mask): - output_query_states = self.query_stream( - query_states, content_states, content_states, query_attention_mask) + def forward( + self, query_states, content_states, query_attention_mask, content_attention_mask, + ): + output_query_states = self.query_stream(query_states, content_states, content_states, query_attention_mask) output_content_states = self.content_stream( - query_states, content_states, - content_states, content_attention_mask) + query_states, content_states, content_states, content_attention_mask, + ) return output_query_states, output_content_states @@ -299,8 +303,7 @@ class PositionWiseFF(nn.Module): hidden_act: activation function used between two linear layers """ - def __init__(self, hidden_size, inner_size, - ffn_dropout=0.0, hidden_act="relu"): + def __init__(self, hidden_size, inner_size, ffn_dropout=0.0, hidden_act="relu"): super().__init__() self.dense_in = nn.Linear(hidden_size, inner_size) self.dense_out = nn.Linear(inner_size, hidden_size) diff --git a/nemo/collections/nlp/transformer/utils.py b/nemo/collections/nlp/transformer/utils.py index e03554dc2daf..4f3f80ec670a 100644 --- a/nemo/collections/nlp/transformer/utils.py +++ b/nemo/collections/nlp/transformer/utils.py @@ -32,8 +32,7 @@ def 
form_attention_mask(input_mask, diagonal=None): attn_shape = (1, input_mask.shape[1], input_mask.shape[1]) attn_mask = input_mask.byte().unsqueeze(1) if diagonal is not None: - future_mask = torch.tril( - torch.ones(attn_shape).byte().to(input_mask.device), diagonal) + future_mask = torch.tril(torch.ones(attn_shape).byte().to(input_mask.device), diagonal) attn_mask = attn_mask & future_mask attention_mask = (1 - attn_mask.to(torch.float)) * NEG_INF return attention_mask.unsqueeze(1) diff --git a/nemo/collections/nlp/utils/__init__.py b/nemo/collections/nlp/utils/__init__.py index c7abd0235d7e..894348fc3114 100644 --- a/nemo/collections/nlp/utils/__init__.py +++ b/nemo/collections/nlp/utils/__init__.py @@ -1,3 +1 @@ -from . import nlp_utils -from . import callbacks -from . import metrics +from . import callbacks, metrics, nlp_utils diff --git a/nemo/collections/nlp/utils/callbacks/bert_pretraining.py b/nemo/collections/nlp/utils/callbacks/bert_pretraining.py index 1b0c1bcbf3b7..baeaabe2d701 100644 --- a/nemo/collections/nlp/utils/callbacks/bert_pretraining.py +++ b/nemo/collections/nlp/utils/callbacks/bert_pretraining.py @@ -1,9 +1,10 @@ # Copyright (c) 2019 NVIDIA Corporation __all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] -import nemo import numpy as np +import nemo + def eval_iter_callback(tensors, global_vars): if "dev_mlm_loss" not in global_vars.keys(): @@ -23,16 +24,14 @@ def eval_iter_callback(tensors, global_vars): def eval_epochs_done_callback(global_vars): if 'dev_mlm_loss' in global_vars: mlm_loss = np.mean(global_vars["dev_mlm_loss"]) - nemo.logging.info( - "Dev MLM perplexity: {0}".format(np.round(np.exp(mlm_loss), 3))) + nemo.logging.info("Dev MLM perplexity: {0}".format(np.round(np.exp(mlm_loss), 3))) global_vars["dev_mlm_loss"] = [] else: mlm_loss = -123.0 if 'dev_nsp_loss' in global_vars: nsp_loss = np.mean(global_vars["dev_nsp_loss"]) - nemo.logging.info( - "Dev NSP perplexity: {0}".format(np.round(np.exp(nsp_loss), 3))) + nemo.logging.info("Dev NSP perplexity: {0}".format(np.round(np.exp(nsp_loss), 3))) global_vars["dev_nsp_loss"] = [] else: nsp_loss = -123.0 diff --git a/nemo/collections/nlp/utils/callbacks/glue.py b/nemo/collections/nlp/utils/callbacks/glue.py index e1749f1279b5..3edb95fe6ea9 100644 --- a/nemo/collections/nlp/utils/callbacks/glue.py +++ b/nemo/collections/nlp/utils/callbacks/glue.py @@ -24,10 +24,11 @@ import os import random -import nemo import numpy as np from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score +from sklearn.metrics import f1_score, matthews_corrcoef + +import nemo def eval_iter_callback(tensors, global_vars): @@ -78,8 +79,8 @@ def eval_epochs_done_callback(global_vars, output_dir, task_name): i = random.randint(0, preds.shape[0] - 21) nemo.logging.info("Task name: %s" % task_name.upper()) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i:i+20])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i:i+20])) + nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + 20])) + nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + 20])) results = compute_metrics(task_name, preds, labels) @@ -100,9 +101,7 @@ def accuracy(preds, labels): def acc_and_f1(preds, labels): accuracy = (preds == labels).mean() f1 = f1_score(y_true=labels, y_pred=preds) - return {"acc": accuracy, - "f1": f1, - "acc_and_f1": (accuracy + f1) / 2} + return {"acc": accuracy, "f1": f1, "acc_and_f1": (accuracy + f1) / 2} def mcc(preds, labels): @@ -112,9 +111,11 @@ def 
mcc(preds, labels): def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] - return {"pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2} + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } def compute_metrics(task_name, preds, labels): diff --git a/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py b/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py index 4ada1d2ba5b8..79db8a709f20 100644 --- a/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py +++ b/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py @@ -5,10 +5,11 @@ import time import matplotlib +import numpy as np from matplotlib import pyplot as plt +from sklearn.metrics import classification_report, confusion_matrix + import nemo -import numpy as np -from sklearn.metrics import confusion_matrix, classification_report __all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] @@ -17,9 +18,7 @@ def tensor2list(tensor): return tensor.detach().cpu().tolist() -def eval_iter_callback(tensors, - global_vars, - eval_data_layer): +def eval_iter_callback(tensors, global_vars, eval_data_layer): if "all_intent_preds" not in global_vars.keys(): global_vars["all_intent_preds"] = [] if "all_intent_labels" not in global_vars.keys(): @@ -58,8 +57,7 @@ def eval_iter_callback(tensors, if kv.startswith('subtokens_mask'): for v_tensor in v: for subtokens_mask_tensor in v_tensor: - all_subtokens_mask.extend( - tensor2list(subtokens_mask_tensor)) + all_subtokens_mask.extend(tensor2list(subtokens_mask_tensor)) all_intent_preds = list(np.argmax(np.asarray(all_intent_logits), 1)) all_slot_preds = list(np.argmax(np.asarray(all_slot_logits), 2).flatten()) @@ -88,11 +86,10 @@ def eval_epochs_done_callback(global_vars, graph_fold): i = 0 if intent_preds.shape[0] > 21: i = random.randint(0, intent_preds.shape[0] - 21) - nemo.logging.info("Sampled i_preds: [%s]" % list2str(intent_preds[i:i+20])) - nemo.logging.info( - "Sampled intents: [%s]" % list2str(intent_labels[i:i+20])) - nemo.logging.info("Sampled s_preds: [%s]" % list2str(slot_preds[i:i+20])) - nemo.logging.info("Sampled slots: [%s]" % list2str(slot_labels[i:i+20])) + nemo.logging.info("Sampled i_preds: [%s]" % list2str(intent_preds[i : i + 20])) + nemo.logging.info("Sampled intents: [%s]" % list2str(intent_labels[i : i + 20])) + nemo.logging.info("Sampled s_preds: [%s]" % list2str(slot_preds[i : i + 20])) + nemo.logging.info("Sampled slots: [%s]" % list2str(slot_labels[i : i + 20])) cm = confusion_matrix(intent_labels, intent_preds) nemo.logging.info(f'Confusion matrix:\n{cm}') fig = plt.figure() @@ -109,14 +106,17 @@ def eval_epochs_done_callback(global_vars, graph_fold): correct_preds = sum(intent_labels == intent_preds) intent_accuracy = correct_preds / intent_labels.shape[0] nemo.logging.info(f'Intent accuracy: {intent_accuracy}') - nemo.logging.info(f'Classification report:\n \ - {classification_report(intent_labels, intent_preds)}') + nemo.logging.info( + f'Classification report:\n \ + {classification_report(intent_labels, intent_preds)}' + ) nemo.logging.info('Slot prediction results') slot_accuracy = sum(slot_labels == slot_preds) / slot_labels.shape[0] nemo.logging.info(f'Slot accuracy: {slot_accuracy}') - nemo.logging.info(f'Classification report:\n \ - {classification_report(slot_labels[:-2], slot_preds[:-2])}') + nemo.logging.info( + f'Classification report:\n \ + 
{classification_report(slot_labels[:-2], slot_preds[:-2])}' + ) - return dict({'intent_accuracy': intent_accuracy, - 'slot_accuracy': slot_accuracy}) + return dict({'intent_accuracy': intent_accuracy, 'slot_accuracy': slot_accuracy}) diff --git a/nemo/collections/nlp/utils/callbacks/language_modeling.py b/nemo/collections/nlp/utils/callbacks/language_modeling.py index 48ca507482f7..daffe2c64d2d 100644 --- a/nemo/collections/nlp/utils/callbacks/language_modeling.py +++ b/nemo/collections/nlp/utils/callbacks/language_modeling.py @@ -1,9 +1,10 @@ # Copyright (c) 2019 NVIDIA Corporation __all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] -import nemo import numpy as np +import nemo + GLOBAL_KEYS = ["eval_loss", "sys"] diff --git a/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py b/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py index 39d321530950..267e42d86fe6 100644 --- a/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py +++ b/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py @@ -3,10 +3,10 @@ import random -import nemo import numpy as np from sklearn.metrics import classification_report +import nemo from nemo.collections.nlp.data.datasets.utils import list2str, tensor2list from nemo.collections.nlp.utils.nlp_utils import plot_confusion_matrix @@ -51,27 +51,22 @@ def eval_iter_callback(tensors, global_vars): elif kv.startswith('subtokens_mask'): for v_tensor in v: for subtokens_mask_tensor in v_tensor: - all_subtokens_mask.extend( - tensor2list(subtokens_mask_tensor)) + all_subtokens_mask.extend(tensor2list(subtokens_mask_tensor)) - punct_all_preds = \ - list(np.argmax(np.asarray(punct_all_logits), 2).flatten()) + punct_all_preds = list(np.argmax(np.asarray(punct_all_logits), 2).flatten()) global_vars["punct_all_preds"].extend(punct_all_preds) global_vars["punct_all_labels"].extend(punct_all_labels) - capit_all_preds = \ - list(np.argmax(np.asarray(capit_all_logits), 2).flatten()) + capit_all_preds = list(np.argmax(np.asarray(capit_all_logits), 2).flatten()) global_vars["capit_all_preds"].extend(capit_all_preds) global_vars["capit_all_labels"].extend(capit_all_labels) global_vars["all_subtokens_mask"].extend(all_subtokens_mask) -def eval_epochs_done_callback(global_vars, - punct_label_ids, - capit_label_ids, - graph_fold=None, - normalize_cm=True): +def eval_epochs_done_callback( + global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True, +): ''' Args: @@ -80,27 +75,17 @@ def eval_epochs_done_callback(global_vars, normalize confusion matrix ''' - punct_accuracy = _eval_epochs_done_callback('punct', - global_vars, - punct_label_ids, - graph_fold, - normalize_cm) + punct_accuracy = _eval_epochs_done_callback('punct', global_vars, punct_label_ids, graph_fold, normalize_cm) - capit_accuracy = _eval_epochs_done_callback('capit', - global_vars, - capit_label_ids, - graph_fold, - normalize_cm) + capit_accuracy = _eval_epochs_done_callback('capit', global_vars, capit_label_ids, graph_fold, normalize_cm) - return {"Punctuation_task_accuracy": punct_accuracy, - "Capitalization_task_accuracy": capit_accuracy} + return { + "Punctuation_task_accuracy": punct_accuracy, + "Capitalization_task_accuracy": capit_accuracy, + } -def _eval_epochs_done_callback(task_name, - global_vars, - label_ids, - graph_fold=None, - normalize_cm=True): +def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=None, normalize_cm=True): labels = np.asarray(global_vars[task_name + '_all_labels']) preds = 
np.asarray(global_vars[task_name + '_all_preds']) subtokens_mask = np.asarray(global_vars['all_subtokens_mask']) > 0.5 @@ -116,24 +101,18 @@ def _eval_epochs_done_callback(task_name, i = 0 if preds.shape[0] > sample_size + 1: i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i:i+sample_size])) - nemo.logging.info( - "Sampled labels: [%s]" % list2str(labels[i:i+sample_size])) + nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) - label_ids = \ - {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} + label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - nemo.logging.info( - classification_report(labels, preds, target_names=label_ids)) + nemo.logging.info(classification_report(labels, preds, target_names=label_ids)) # calculate and plot confusion_matrix if graph_fold: - plot_confusion_matrix(label_ids, - labels, - preds, - graph_fold, - normalize=normalize_cm, - prefix=task_name) + plot_confusion_matrix( + label_ids, labels, preds, graph_fold, normalize=normalize_cm, prefix=task_name, + ) return accuracy diff --git a/nemo/collections/nlp/utils/callbacks/sentence_classification.py b/nemo/collections/nlp/utils/callbacks/sentence_classification.py index feb73faa004c..4810bab9dde1 100644 --- a/nemo/collections/nlp/utils/callbacks/sentence_classification.py +++ b/nemo/collections/nlp/utils/callbacks/sentence_classification.py @@ -5,18 +5,16 @@ import random import time -from matplotlib import pyplot as plt # nopep8 import numpy as np # nopep8 -from sklearn.metrics import confusion_matrix, classification_report # nopep8 +from matplotlib import pyplot as plt # nopep8 +from sklearn.metrics import classification_report, confusion_matrix # nopep8 import nemo __all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] -def eval_iter_callback(tensors, - global_vars, - eval_data_layer): +def eval_iter_callback(tensors, global_vars, eval_data_layer): if "all_preds" not in global_vars.keys(): global_vars["all_preds"] = [] if "all_labels" not in global_vars.keys(): @@ -53,8 +51,8 @@ def eval_epochs_done_callback(global_vars, graph_fold): i = 0 if preds.shape[0] > 21: i = random.randint(0, preds.shape[0] - 21) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i:i+20])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i:i+20])) + nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + 20])) + nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + 20])) cm = confusion_matrix(labels, preds) fig = plt.figure() ax = fig.add_subplot(111) diff --git a/nemo/collections/nlp/utils/callbacks/squad.py b/nemo/collections/nlp/utils/callbacks/squad.py index 1cbfe7263dc0..5f87132bc7e4 100644 --- a/nemo/collections/nlp/utils/callbacks/squad.py +++ b/nemo/collections/nlp/utils/callbacks/squad.py @@ -43,10 +43,15 @@ def eval_iter_callback(tensors, global_vars): global_vars['eval_unique_ids'].extend(unique_ids) -def eval_epochs_done_callback(global_vars, eval_data_layer, do_lower_case, - n_best_size, max_answer_length, - version_2_with_negative, - null_score_diff_threshold): +def eval_epochs_done_callback( + global_vars, + eval_data_layer, + do_lower_case, + n_best_size, + max_answer_length, + version_2_with_negative, + null_score_diff_threshold, +): exact_match, f1, 
_ = eval_data_layer.dataset.evaluate( unique_ids=global_vars["eval_unique_ids"], start_logits=global_vars["eval_start_logits"], @@ -55,7 +60,8 @@ def eval_epochs_done_callback(global_vars, eval_data_layer, do_lower_case, max_answer_length=max_answer_length, version_2_with_negative=version_2_with_negative, null_score_diff_threshold=null_score_diff_threshold, - do_lower_case=do_lower_case) + do_lower_case=do_lower_case, + ) print(f"Exact_match = {exact_match}, f1 = {f1}") diff --git a/nemo/collections/nlp/utils/callbacks/token_classification.py b/nemo/collections/nlp/utils/callbacks/token_classification.py index e100627021a7..20d3036118f1 100644 --- a/nemo/collections/nlp/utils/callbacks/token_classification.py +++ b/nemo/collections/nlp/utils/callbacks/token_classification.py @@ -35,8 +35,7 @@ def eval_iter_callback(tensors, global_vars): elif kv.startswith('subtokens_mask'): for v_tensor in v: for subtokens_mask_tensor in v_tensor: - all_subtokens_mask.extend( - tensor2list(subtokens_mask_tensor)) + all_subtokens_mask.extend(tensor2list(subtokens_mask_tensor)) all_preds = list(np.argmax(np.asarray(all_logits), 2).flatten()) global_vars["all_preds"].extend(all_preds) @@ -44,11 +43,7 @@ def eval_iter_callback(tensors, global_vars): global_vars["all_subtokens_mask"].extend(all_subtokens_mask) -def eval_epochs_done_callback(global_vars, - label_ids, - graph_fold=None, - none_label_id=0, - normalize_cm=True): +def eval_epochs_done_callback(global_vars, label_ids, graph_fold=None, none_label_id=0, normalize_cm=True): labels = np.asarray(global_vars['all_labels']) preds = np.asarray(global_vars['all_preds']) subtokens_mask = np.asarray(global_vars['all_subtokens_mask']) > 0.5 @@ -64,24 +59,17 @@ def eval_epochs_done_callback(global_vars, i = 0 if preds.shape[0] > sample_size + 1: i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i:i+sample_size])) - nemo.logging.info( - "Sampled labels: [%s]" % list2str(labels[i:i+sample_size])) + nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) - label_ids = \ - {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} + label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - nemo.logging.info( - classification_report(labels, preds, target_names=label_ids)) + nemo.logging.info(classification_report(labels, preds, target_names=label_ids)) # calculate and plot confusion_matrix if graph_fold: - plot_confusion_matrix(label_ids, - labels, - preds, - graph_fold, - normalize=normalize_cm) + plot_confusion_matrix(label_ids, labels, preds, graph_fold, normalize=normalize_cm) return dict({'Accuracy': accuracy}) diff --git a/nemo/collections/nlp/utils/callbacks/translation.py b/nemo/collections/nlp/utils/callbacks/translation.py index e5fcb19bc0f8..2aa3cc99df59 100644 --- a/nemo/collections/nlp/utils/callbacks/translation.py +++ b/nemo/collections/nlp/utils/callbacks/translation.py @@ -7,7 +7,6 @@ from ..metrics.sacrebleu import corpus_bleu - GLOBAL_KEYS = ["eval_loss", "ref", "sys", "sent_ids", "nonpad_tokens"] @@ -80,10 +79,7 @@ def eval_epochs_done_callback(global_vars, validation_dataset=None): for key in GLOBAL_KEYS: global_vars[key] = [] - metrics = dict( - {"eval_loss": eval_loss, - "token_bleu": token_bleu, - "sacre_bleu": sacre_bleu}) + metrics = 
dict({"eval_loss": eval_loss, "token_bleu": token_bleu, "sacre_bleu": sacre_bleu,}) return metrics diff --git a/nemo/collections/nlp/utils/metrics/bleu.py b/nemo/collections/nlp/utils/metrics/bleu.py index 2fc4a191b834..04e67d1788d6 100644 --- a/nemo/collections/nlp/utils/metrics/bleu.py +++ b/nemo/collections/nlp/utils/metrics/bleu.py @@ -49,15 +49,12 @@ def _get_ngrams(segment, max_order): ngram_counts = collections.Counter() for order in range(1, max_order + 1): for i in range(0, len(segment) - order + 1): - ngram = tuple(segment[i:i + order]) + ngram = tuple(segment[i : i + order]) ngram_counts[ngram] += 1 return ngram_counts -def compute_bleu(reference_corpus, - translation_corpus, - max_order=4, - smooth=False): +def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False): """Computes BLEU for translated segments against one or more references. Args: @@ -96,17 +93,15 @@ def compute_bleu(reference_corpus, precisions = [0] * max_order for i in range(0, max_order): if smooth: - precisions[i] = ((matches_by_order[i] + 1.) / - (possible_matches_by_order[i] + 1.)) + precisions[i] = (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0) else: if possible_matches_by_order[i] > 0: - precisions[i] = (float(matches_by_order[i]) / - possible_matches_by_order[i]) + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] else: precisions[i] = 0.0 if min(precisions) > 0: - p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) + p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions) geo_mean = math.exp(p_log_sum) else: geo_mean = 0 @@ -114,13 +109,19 @@ def compute_bleu(reference_corpus, ratio = float(translation_length) / reference_length if ratio > 1.0: - bp = 1. + bp = 1.0 else: - bp = math.exp(1 - 1. / (ratio + 1e-6)) + bp = math.exp(1 - 1.0 / (ratio + 1e-6)) bleu = geo_mean * bp precisions = [p * 100 for p in precisions] - return (bleu * 100, precisions, bp, ratio, translation_length, - reference_length) + return ( + bleu * 100, + precisions, + bp, + ratio, + translation_length, + reference_length, + ) diff --git a/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py b/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py index ef82f100f3b7..f6bfdfad9473 100644 --- a/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py +++ b/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py @@ -3,8 +3,8 @@ master/PyTorch/Translation/Transformer/fairseq/tokenizer.py """ -import sys import re +import sys import unicodedata from collections import defaultdict diff --git a/nemo/collections/nlp/utils/metrics/sacrebleu.py b/nemo/collections/nlp/utils/metrics/sacrebleu.py index 2eb379f1d233..79abebe2aeb6 100755 --- a/nemo/collections/nlp/utils/metrics/sacrebleu.py +++ b/nemo/collections/nlp/utils/metrics/sacrebleu.py @@ -28,16 +28,16 @@ import math import os import re -import sys import ssl +import sys import unicodedata import urllib.request - from collections import Counter, namedtuple from itertools import zip_longest -from typing import List, Iterable, Tuple, Union +from typing import Iterable, List, Tuple, Union from nemo import logging + from .fairseq_tokenizer import tokenize_en VERSION = '1.3.5' @@ -52,9 +52,7 @@ signal(SIGPIPE, SIG_DFL) except ImportError: - logging.warning( - 'Could not import signal.SIGPIPE (this is expected on Windows machines)' - ) + logging.warning('Could not import signal.SIGPIPE (this is expected on Windows machines)') # Where to store downloaded test sets. 
# Define the environment variable $SACREBLEU, or use the default of ~/.sacrebleu @@ -63,8 +61,7 @@ # in which case the os.path.join() throws a TypeError. Using expanduser() is # a safe way to get the user's home folder. USERHOME = os.path.expanduser("~") -SACREBLEU_DIR = os.environ.get('SACREBLEU', - os.path.join(USERHOME, '.sacrebleu')) +SACREBLEU_DIR = os.environ.get('SACREBLEU', os.path.join(USERHOME, '.sacrebleu')) # n-gram order. Don't change this. NGRAM_ORDER = 4 @@ -97,13 +94,9 @@ 'ja-en': ['2:MTNT2019/ja-en.final.tsv', '3:MTNT2019/ja-en.final.tsv'], }, 'mtnt1.1/test': { - 'data': [ - 'https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz' - ], - 'description': - 'Test data for the Machine Translation of Noisy Text task: http://www.cs.cmu.edu/~pmichel1/mtnt/', - 'citation': - '@InProceedings{michel2018a:mtnt,\n author = "Michel, Paul and Neubig, Graham",\n title = "MTNT: A Testbed for Machine Translation of Noisy Text",\n booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",\n year = "2018",\n publisher = "Association for Computational Linguistics",\n pages = "543--553",\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', + 'data': ['https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz'], + 'description': 'Test data for the Machine Translation of Noisy Text task: http://www.cs.cmu.edu/~pmichel1/mtnt/', + 'citation': '@InProceedings{michel2018a:mtnt,\n author = "Michel, Paul and Neubig, Graham",\n title = "MTNT: A Testbed for Machine Translation of Noisy Text",\n booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",\n year = "2018",\n publisher = "Association for Computational Linguistics",\n pages = "543--553",\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], 'en-fr': ['1:MTNT/test/test.en-fr.tsv', '2:MTNT/test/test.en-fr.tsv'], 'fr-en': ['1:MTNT/test/test.fr-en.tsv', '2:MTNT/test/test.fr-en.tsv'], @@ -111,214 +104,83 @@ 'ja-en': ['1:MTNT/test/test.ja-en.tsv', '2:MTNT/test/test.ja-en.tsv'], }, 'mtnt1.1/valid': { - 'data': [ - 'https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz' - ], - 'description': - 'Validation data for the Machine Translation of Noisy Text task: http://www.cs.cmu.edu/~pmichel1/mtnt/', - 'citation': - '@InProceedings{michel2018a:mtnt,\n author = "Michel, Paul and Neubig, Graham",\n title = "MTNT: A Testbed for Machine Translation of Noisy Text",\n booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",\n year = "2018",\n publisher = "Association for Computational Linguistics",\n pages = "543--553",\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', + 'data': ['https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz'], + 'description': 'Validation data for the Machine Translation of Noisy Text task: http://www.cs.cmu.edu/~pmichel1/mtnt/', + 'citation': '@InProceedings{michel2018a:mtnt,\n author = "Michel, Paul and Neubig, Graham",\n title = "MTNT: A Testbed for Machine Translation of Noisy Text",\n booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",\n year = "2018",\n publisher = "Association for Computational Linguistics",\n pages = "543--553",\n location = "Brussels, Belgium",\n url = 
"http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], - 'en-fr': - ['1:MTNT/valid/valid.en-fr.tsv', '2:MTNT/valid/valid.en-fr.tsv'], - 'fr-en': - ['1:MTNT/valid/valid.fr-en.tsv', '2:MTNT/valid/valid.fr-en.tsv'], - 'en-ja': - ['1:MTNT/valid/valid.en-ja.tsv', '2:MTNT/valid/valid.en-ja.tsv'], - 'ja-en': - ['1:MTNT/valid/valid.ja-en.tsv', '2:MTNT/valid/valid.ja-en.tsv'], + 'en-fr': ['1:MTNT/valid/valid.en-fr.tsv', '2:MTNT/valid/valid.en-fr.tsv',], + 'fr-en': ['1:MTNT/valid/valid.fr-en.tsv', '2:MTNT/valid/valid.fr-en.tsv',], + 'en-ja': ['1:MTNT/valid/valid.en-ja.tsv', '2:MTNT/valid/valid.en-ja.tsv',], + 'ja-en': ['1:MTNT/valid/valid.ja-en.tsv', '2:MTNT/valid/valid.ja-en.tsv',], }, 'mtnt1.1/train': { - 'data': [ - 'https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz' - ], - 'description': - 'Training data for the Machine Translation of Noisy Text task: http://www.cs.cmu.edu/~pmichel1/mtnt/', - 'citation': - '@InProceedings{michel2018a:mtnt,\n author = "Michel, Paul and Neubig, Graham",\n title = "MTNT: A Testbed for Machine Translation of Noisy Text",\n booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",\n year = "2018",\n publisher = "Association for Computational Linguistics",\n pages = "543--553",\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', + 'data': ['https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz'], + 'description': 'Training data for the Machine Translation of Noisy Text task: http://www.cs.cmu.edu/~pmichel1/mtnt/', + 'citation': '@InProceedings{michel2018a:mtnt,\n author = "Michel, Paul and Neubig, Graham",\n title = "MTNT: A Testbed for Machine Translation of Noisy Text",\n booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",\n year = "2018",\n publisher = "Association for Computational Linguistics",\n pages = "543--553",\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], - 'en-fr': - ['1:MTNT/train/train.en-fr.tsv', '2:MTNT/train/train.en-fr.tsv'], - 'fr-en': - ['1:MTNT/train/train.fr-en.tsv', '2:MTNT/train/train.fr-en.tsv'], - 'en-ja': - ['1:MTNT/train/train.en-ja.tsv', '2:MTNT/train/train.en-ja.tsv'], - 'ja-en': - ['1:MTNT/train/train.ja-en.tsv', '2:MTNT/train/train.ja-en.tsv'], + 'en-fr': ['1:MTNT/train/train.en-fr.tsv', '2:MTNT/train/train.en-fr.tsv',], + 'fr-en': ['1:MTNT/train/train.fr-en.tsv', '2:MTNT/train/train.fr-en.tsv',], + 'en-ja': ['1:MTNT/train/train.en-ja.tsv', '2:MTNT/train/train.en-ja.tsv',], + 'ja-en': ['1:MTNT/train/train.ja-en.tsv', '2:MTNT/train/train.ja-en.tsv',], }, 'wmt19': { 'data': ['http://data.statmt.org/wmt19/translation-task/test.tgz'], 'md5': ['84de7162d158e28403103b01aeefc39a'], - 'cs-de': [ - 'sgm/newstest2019-csde-src.cs.sgm', - 'sgm/newstest2019-csde-ref.de.sgm' - ], - 'de-cs': [ - 'sgm/newstest2019-decs-src.de.sgm', - 'sgm/newstest2019-decs-ref.cs.sgm' - ], - 'de-en': [ - 'sgm/newstest2019-deen-src.de.sgm', - 'sgm/newstest2019-deen-ref.en.sgm' - ], - 'de-fr': [ - 'sgm/newstest2019-defr-src.de.sgm', - 'sgm/newstest2019-defr-ref.fr.sgm' - ], - 'en-cs': [ - 'sgm/newstest2019-encs-src.en.sgm', - 'sgm/newstest2019-encs-ref.cs.sgm' - ], - 'en-de': [ - 'sgm/newstest2019-ende-src.en.sgm', - 'sgm/newstest2019-ende-ref.de.sgm' - ], - 'en-fi': [ - 'sgm/newstest2019-enfi-src.en.sgm', - 'sgm/newstest2019-enfi-ref.fi.sgm' - ], - 'en-gu': [ - 
'sgm/newstest2019-engu-src.en.sgm', - 'sgm/newstest2019-engu-ref.gu.sgm' - ], - 'en-kk': [ - 'sgm/newstest2019-enkk-src.en.sgm', - 'sgm/newstest2019-enkk-ref.kk.sgm' - ], - 'en-lt': [ - 'sgm/newstest2019-enlt-src.en.sgm', - 'sgm/newstest2019-enlt-ref.lt.sgm' - ], - 'en-ru': [ - 'sgm/newstest2019-enru-src.en.sgm', - 'sgm/newstest2019-enru-ref.ru.sgm' - ], - 'en-zh': [ - 'sgm/newstest2019-enzh-src.en.sgm', - 'sgm/newstest2019-enzh-ref.zh.sgm' - ], - 'fi-en': [ - 'sgm/newstest2019-fien-src.fi.sgm', - 'sgm/newstest2019-fien-ref.en.sgm' - ], - 'fr-de': [ - 'sgm/newstest2019-frde-src.fr.sgm', - 'sgm/newstest2019-frde-ref.de.sgm' - ], - 'gu-en': [ - 'sgm/newstest2019-guen-src.gu.sgm', - 'sgm/newstest2019-guen-ref.en.sgm' - ], - 'kk-en': [ - 'sgm/newstest2019-kken-src.kk.sgm', - 'sgm/newstest2019-kken-ref.en.sgm' - ], - 'lt-en': [ - 'sgm/newstest2019-lten-src.lt.sgm', - 'sgm/newstest2019-lten-ref.en.sgm' - ], - 'ru-en': [ - 'sgm/newstest2019-ruen-src.ru.sgm', - 'sgm/newstest2019-ruen-ref.en.sgm' - ], - 'zh-en': [ - 'sgm/newstest2019-zhen-src.zh.sgm', - 'sgm/newstest2019-zhen-ref.en.sgm' - ], + 'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm',], + 'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm',], + 'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm',], + 'de-fr': ['sgm/newstest2019-defr-src.de.sgm', 'sgm/newstest2019-defr-ref.fr.sgm',], + 'en-cs': ['sgm/newstest2019-encs-src.en.sgm', 'sgm/newstest2019-encs-ref.cs.sgm',], + 'en-de': ['sgm/newstest2019-ende-src.en.sgm', 'sgm/newstest2019-ende-ref.de.sgm',], + 'en-fi': ['sgm/newstest2019-enfi-src.en.sgm', 'sgm/newstest2019-enfi-ref.fi.sgm',], + 'en-gu': ['sgm/newstest2019-engu-src.en.sgm', 'sgm/newstest2019-engu-ref.gu.sgm',], + 'en-kk': ['sgm/newstest2019-enkk-src.en.sgm', 'sgm/newstest2019-enkk-ref.kk.sgm',], + 'en-lt': ['sgm/newstest2019-enlt-src.en.sgm', 'sgm/newstest2019-enlt-ref.lt.sgm',], + 'en-ru': ['sgm/newstest2019-enru-src.en.sgm', 'sgm/newstest2019-enru-ref.ru.sgm',], + 'en-zh': ['sgm/newstest2019-enzh-src.en.sgm', 'sgm/newstest2019-enzh-ref.zh.sgm',], + 'fi-en': ['sgm/newstest2019-fien-src.fi.sgm', 'sgm/newstest2019-fien-ref.en.sgm',], + 'fr-de': ['sgm/newstest2019-frde-src.fr.sgm', 'sgm/newstest2019-frde-ref.de.sgm',], + 'gu-en': ['sgm/newstest2019-guen-src.gu.sgm', 'sgm/newstest2019-guen-ref.en.sgm',], + 'kk-en': ['sgm/newstest2019-kken-src.kk.sgm', 'sgm/newstest2019-kken-ref.en.sgm',], + 'lt-en': ['sgm/newstest2019-lten-src.lt.sgm', 'sgm/newstest2019-lten-ref.en.sgm',], + 'ru-en': ['sgm/newstest2019-ruen-src.ru.sgm', 'sgm/newstest2019-ruen-ref.en.sgm',], + 'zh-en': ['sgm/newstest2019-zhen-src.zh.sgm', 'sgm/newstest2019-zhen-ref.en.sgm',], }, 'wmt19/dev': { 'data': ['http://data.statmt.org/wmt19/translation-task/dev.tgz'], - 'description': - 'Development data for tasks new to 2019.', + 'description': 'Development data for tasks new to 2019.', 'md5': ['f2ec7af5947c19e0cacb3882eb208002'], - 'lt-en': - ['dev/newsdev2019-lten-src.lt.sgm', - 'dev/newsdev2019-lten-ref.en.sgm'], - 'en-lt': - ['dev/newsdev2019-enlt-src.en.sgm', - 'dev/newsdev2019-enlt-ref.lt.sgm'], - 'gu-en': - ['dev/newsdev2019-guen-src.gu.sgm', - 'dev/newsdev2019-guen-ref.en.sgm'], - 'en-gu': - ['dev/newsdev2019-engu-src.en.sgm', - 'dev/newsdev2019-engu-ref.gu.sgm'], - 'kk-en': - ['dev/newsdev2019-kken-src.kk.sgm', - 'dev/newsdev2019-kken-ref.en.sgm'], - 'en-kk': - ['dev/newsdev2019-enkk-src.en.sgm', - 'dev/newsdev2019-enkk-ref.kk.sgm'], + 'lt-en': 
['dev/newsdev2019-lten-src.lt.sgm', 'dev/newsdev2019-lten-ref.en.sgm',], + 'en-lt': ['dev/newsdev2019-enlt-src.en.sgm', 'dev/newsdev2019-enlt-ref.lt.sgm',], + 'gu-en': ['dev/newsdev2019-guen-src.gu.sgm', 'dev/newsdev2019-guen-ref.en.sgm',], + 'en-gu': ['dev/newsdev2019-engu-src.en.sgm', 'dev/newsdev2019-engu-ref.gu.sgm',], + 'kk-en': ['dev/newsdev2019-kken-src.kk.sgm', 'dev/newsdev2019-kken-ref.en.sgm',], + 'en-kk': ['dev/newsdev2019-enkk-src.en.sgm', 'dev/newsdev2019-enkk-ref.kk.sgm',], }, 'wmt18': { 'data': ['http://data.statmt.org/wmt18/translation-task/test.tgz'], 'md5': ['f996c245ecffea23d0006fa4c34e9064'], - 'description': - 'Official evaluation data.', - 'citation': - '@inproceedings{bojar-etal-2018-findings,\n title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)",\n author = "Bojar, Ond{\v{r}}ej and\n Federmann, Christian and\n Fishel, Mark and\n Graham, Yvette and\n Haddow, Barry and\n Koehn, Philipp and\n Monz, Christof",\n booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",\n month = oct,\n year = "2018",\n address = "Belgium, Brussels",\n publisher = "Association for Computational Linguistics",\n url = "https://www.aclweb.org/anthology/W18-6401",\n pages = "272--303",\n}', - 'cs-en': [ - 'test/newstest2018-csen-src.cs.sgm', - 'test/newstest2018-csen-ref.en.sgm' - ], - 'de-en': [ - 'test/newstest2018-deen-src.de.sgm', - 'test/newstest2018-deen-ref.en.sgm' - ], - 'en-cs': [ - 'test/newstest2018-encs-src.en.sgm', - 'test/newstest2018-encs-ref.cs.sgm' - ], - 'en-de': [ - 'test/newstest2018-ende-src.en.sgm', - 'test/newstest2018-ende-ref.de.sgm' - ], - 'en-et': [ - 'test/newstest2018-enet-src.en.sgm', - 'test/newstest2018-enet-ref.et.sgm' - ], - 'en-fi': [ - 'test/newstest2018-enfi-src.en.sgm', - 'test/newstest2018-enfi-ref.fi.sgm' - ], - 'en-ru': [ - 'test/newstest2018-enru-src.en.sgm', - 'test/newstest2018-enru-ref.ru.sgm' - ], - 'et-en': [ - 'test/newstest2018-eten-src.et.sgm', - 'test/newstest2018-eten-ref.en.sgm' - ], - 'fi-en': [ - 'test/newstest2018-fien-src.fi.sgm', - 'test/newstest2018-fien-ref.en.sgm' - ], - 'ru-en': [ - 'test/newstest2018-ruen-src.ru.sgm', - 'test/newstest2018-ruen-ref.en.sgm' - ], - 'en-tr': [ - 'test/newstest2018-entr-src.en.sgm', - 'test/newstest2018-entr-ref.tr.sgm' - ], - 'tr-en': [ - 'test/newstest2018-tren-src.tr.sgm', - 'test/newstest2018-tren-ref.en.sgm' - ], - 'en-zh': [ - 'test/newstest2018-enzh-src.en.sgm', - 'test/newstest2018-enzh-ref.zh.sgm' - ], - 'zh-en': [ - 'test/newstest2018-zhen-src.zh.sgm', - 'test/newstest2018-zhen-ref.en.sgm' - ], + 'description': 'Official evaluation data.', + 'citation': '@inproceedings{bojar-etal-2018-findings,\n title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)",\n author = "Bojar, Ond{\v{r}}ej and\n Federmann, Christian and\n Fishel, Mark and\n Graham, Yvette and\n Haddow, Barry and\n Koehn, Philipp and\n Monz, Christof",\n booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",\n month = oct,\n year = "2018",\n address = "Belgium, Brussels",\n publisher = "Association for Computational Linguistics",\n url = "https://www.aclweb.org/anthology/W18-6401",\n pages = "272--303",\n}', + 'cs-en': ['test/newstest2018-csen-src.cs.sgm', 'test/newstest2018-csen-ref.en.sgm',], + 'de-en': ['test/newstest2018-deen-src.de.sgm', 'test/newstest2018-deen-ref.en.sgm',], + 'en-cs': ['test/newstest2018-encs-src.en.sgm', 'test/newstest2018-encs-ref.cs.sgm',], + 'en-de': ['test/newstest2018-ende-src.en.sgm', 
'test/newstest2018-ende-ref.de.sgm',], + 'en-et': ['test/newstest2018-enet-src.en.sgm', 'test/newstest2018-enet-ref.et.sgm',], + 'en-fi': ['test/newstest2018-enfi-src.en.sgm', 'test/newstest2018-enfi-ref.fi.sgm',], + 'en-ru': ['test/newstest2018-enru-src.en.sgm', 'test/newstest2018-enru-ref.ru.sgm',], + 'et-en': ['test/newstest2018-eten-src.et.sgm', 'test/newstest2018-eten-ref.en.sgm',], + 'fi-en': ['test/newstest2018-fien-src.fi.sgm', 'test/newstest2018-fien-ref.en.sgm',], + 'ru-en': ['test/newstest2018-ruen-src.ru.sgm', 'test/newstest2018-ruen-ref.en.sgm',], + 'en-tr': ['test/newstest2018-entr-src.en.sgm', 'test/newstest2018-entr-ref.tr.sgm',], + 'tr-en': ['test/newstest2018-tren-src.tr.sgm', 'test/newstest2018-tren-ref.en.sgm',], + 'en-zh': ['test/newstest2018-enzh-src.en.sgm', 'test/newstest2018-enzh-ref.zh.sgm',], + 'zh-en': ['test/newstest2018-zhen-src.zh.sgm', 'test/newstest2018-zhen-ref.en.sgm',], }, 'wmt18/test-ts': { 'data': ['http://data.statmt.org/wmt18/translation-task/test-ts.tgz'], 'md5': ['5c621a34d512cc2dd74162ae7d00b320'], - 'description': - 'Official evaluation sources with extra test sets interleaved.', + 'description': 'Official evaluation sources with extra test sets interleaved.', 'cs-en': ['test/newstest2018-csen-src-ts.cs.sgm'], 'de-en': ['test/newstest2018-deen-src-ts.de.sgm'], 'en-cs': ['test/newstest2018-encs-src-ts.en.sgm'], @@ -337,440 +199,202 @@ 'wmt18/dev': { 'data': ['http://data.statmt.org/wmt18/translation-task/dev.tgz'], 'md5': ['486f391da54a7a3247f02ebd25996f24'], - 'description': - 'Development data (Estonian<>English).', - 'et-en': - ['dev/newsdev2018-eten-src.et.sgm', - 'dev/newsdev2018-eten-ref.en.sgm'], - 'en-et': - ['dev/newsdev2018-enet-src.en.sgm', - 'dev/newsdev2018-enet-ref.et.sgm'], + 'description': 'Development data (Estonian<>English).', + 'et-en': ['dev/newsdev2018-eten-src.et.sgm', 'dev/newsdev2018-eten-ref.en.sgm',], + 'en-et': ['dev/newsdev2018-enet-src.en.sgm', 'dev/newsdev2018-enet-ref.et.sgm',], }, 'wmt17': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], 'md5': ['86a1724c276004aa25455ae2a04cef26'], - 'description': - 'Official evaluation data.', - 'citation': - '@InProceedings{bojar-EtAl:2017:WMT1,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huang, Shujian and Huck, Matthias and Koehn, Philipp and Liu, Qun and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Rubino, Raphael and Specia, Lucia and Turchi, Marco},\n title = {Findings of the 2017 Conference on Machine Translation (WMT17)},\n booktitle = {Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers},\n month = {September},\n year = {2017},\n address = {Copenhagen, Denmark},\n publisher = {Association for Computational Linguistics},\n pages = {169--214},\n url = {http://www.aclweb.org/anthology/W17-4717}\n}', - 'cs-en': [ - 'test/newstest2017-csen-src.cs.sgm', - 'test/newstest2017-csen-ref.en.sgm' - ], - 'de-en': [ - 'test/newstest2017-deen-src.de.sgm', - 'test/newstest2017-deen-ref.en.sgm' - ], - 'en-cs': [ - 'test/newstest2017-encs-src.en.sgm', - 'test/newstest2017-encs-ref.cs.sgm' - ], - 'en-de': [ - 'test/newstest2017-ende-src.en.sgm', - 'test/newstest2017-ende-ref.de.sgm' - ], - 'en-fi': [ - 'test/newstest2017-enfi-src.en.sgm', - 'test/newstest2017-enfi-ref.fi.sgm' - ], - 'en-lv': [ - 'test/newstest2017-enlv-src.en.sgm', - 'test/newstest2017-enlv-ref.lv.sgm' - ], - 'en-ru': [ - 'test/newstest2017-enru-src.en.sgm', - 
'test/newstest2017-enru-ref.ru.sgm' - ], - 'en-tr': [ - 'test/newstest2017-entr-src.en.sgm', - 'test/newstest2017-entr-ref.tr.sgm' - ], - 'en-zh': [ - 'test/newstest2017-enzh-src.en.sgm', - 'test/newstest2017-enzh-ref.zh.sgm' - ], - 'fi-en': [ - 'test/newstest2017-fien-src.fi.sgm', - 'test/newstest2017-fien-ref.en.sgm' - ], - 'lv-en': [ - 'test/newstest2017-lven-src.lv.sgm', - 'test/newstest2017-lven-ref.en.sgm' - ], - 'ru-en': [ - 'test/newstest2017-ruen-src.ru.sgm', - 'test/newstest2017-ruen-ref.en.sgm' - ], - 'tr-en': [ - 'test/newstest2017-tren-src.tr.sgm', - 'test/newstest2017-tren-ref.en.sgm' - ], - 'zh-en': [ - 'test/newstest2017-zhen-src.zh.sgm', - 'test/newstest2017-zhen-ref.en.sgm' - ], + 'description': 'Official evaluation data.', + 'citation': '@InProceedings{bojar-EtAl:2017:WMT1,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huang, Shujian and Huck, Matthias and Koehn, Philipp and Liu, Qun and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Rubino, Raphael and Specia, Lucia and Turchi, Marco},\n title = {Findings of the 2017 Conference on Machine Translation (WMT17)},\n booktitle = {Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers},\n month = {September},\n year = {2017},\n address = {Copenhagen, Denmark},\n publisher = {Association for Computational Linguistics},\n pages = {169--214},\n url = {http://www.aclweb.org/anthology/W17-4717}\n}', + 'cs-en': ['test/newstest2017-csen-src.cs.sgm', 'test/newstest2017-csen-ref.en.sgm',], + 'de-en': ['test/newstest2017-deen-src.de.sgm', 'test/newstest2017-deen-ref.en.sgm',], + 'en-cs': ['test/newstest2017-encs-src.en.sgm', 'test/newstest2017-encs-ref.cs.sgm',], + 'en-de': ['test/newstest2017-ende-src.en.sgm', 'test/newstest2017-ende-ref.de.sgm',], + 'en-fi': ['test/newstest2017-enfi-src.en.sgm', 'test/newstest2017-enfi-ref.fi.sgm',], + 'en-lv': ['test/newstest2017-enlv-src.en.sgm', 'test/newstest2017-enlv-ref.lv.sgm',], + 'en-ru': ['test/newstest2017-enru-src.en.sgm', 'test/newstest2017-enru-ref.ru.sgm',], + 'en-tr': ['test/newstest2017-entr-src.en.sgm', 'test/newstest2017-entr-ref.tr.sgm',], + 'en-zh': ['test/newstest2017-enzh-src.en.sgm', 'test/newstest2017-enzh-ref.zh.sgm',], + 'fi-en': ['test/newstest2017-fien-src.fi.sgm', 'test/newstest2017-fien-ref.en.sgm',], + 'lv-en': ['test/newstest2017-lven-src.lv.sgm', 'test/newstest2017-lven-ref.en.sgm',], + 'ru-en': ['test/newstest2017-ruen-src.ru.sgm', 'test/newstest2017-ruen-ref.en.sgm',], + 'tr-en': ['test/newstest2017-tren-src.tr.sgm', 'test/newstest2017-tren-ref.en.sgm',], + 'zh-en': ['test/newstest2017-zhen-src.zh.sgm', 'test/newstest2017-zhen-ref.en.sgm',], }, 'wmt17/B': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], 'md5': ['86a1724c276004aa25455ae2a04cef26'], - 'description': - 'Additional reference for EN-FI and FI-EN.', - 'en-fi': [ - 'test/newstestB2017-enfi-src.en.sgm', - 'test/newstestB2017-enfi-ref.fi.sgm' - ], + 'description': 'Additional reference for EN-FI and FI-EN.', + 'en-fi': ['test/newstestB2017-enfi-src.en.sgm', 'test/newstestB2017-enfi-ref.fi.sgm',], }, 'wmt17/tworefs': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], 'md5': ['86a1724c276004aa25455ae2a04cef26'], - 'description': - 'Systems with two references.', + 'description': 'Systems with two references.', 'en-fi': [ 'test/newstest2017-enfi-src.en.sgm', 'test/newstest2017-enfi-ref.fi.sgm', - 'test/newstestB2017-enfi-ref.fi.sgm' + 
'test/newstestB2017-enfi-ref.fi.sgm', ], }, 'wmt17/improved': { - 'data': - ['http://data.statmt.org/wmt17/translation-task/test-update-1.tgz'], + 'data': ['http://data.statmt.org/wmt17/translation-task/test-update-1.tgz'], 'md5': ['91dbfd5af99bc6891a637a68e04dfd41'], 'description': 'Improved zh-en and en-zh translations.', - 'en-zh': - ['newstest2017-enzh-src.en.sgm', 'newstest2017-enzh-ref.zh.sgm'], - 'zh-en': - ['newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm'], + 'en-zh': ['newstest2017-enzh-src.en.sgm', 'newstest2017-enzh-ref.zh.sgm',], + 'zh-en': ['newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm',], }, 'wmt17/dev': { 'data': ['http://data.statmt.org/wmt17/translation-task/dev.tgz'], 'md5': ['9b1aa63c1cf49dccdd20b962fe313989'], - 'description': - 'Development sets released for new languages in 2017.', - 'en-lv': - ['dev/newsdev2017-enlv-src.en.sgm', - 'dev/newsdev2017-enlv-ref.lv.sgm'], - 'en-zh': - ['dev/newsdev2017-enzh-src.en.sgm', - 'dev/newsdev2017-enzh-ref.zh.sgm'], - 'lv-en': - ['dev/newsdev2017-lven-src.lv.sgm', - 'dev/newsdev2017-lven-ref.en.sgm'], - 'zh-en': - ['dev/newsdev2017-zhen-src.zh.sgm', - 'dev/newsdev2017-zhen-ref.en.sgm'], + 'description': 'Development sets released for new languages in 2017.', + 'en-lv': ['dev/newsdev2017-enlv-src.en.sgm', 'dev/newsdev2017-enlv-ref.lv.sgm',], + 'en-zh': ['dev/newsdev2017-enzh-src.en.sgm', 'dev/newsdev2017-enzh-ref.zh.sgm',], + 'lv-en': ['dev/newsdev2017-lven-src.lv.sgm', 'dev/newsdev2017-lven-ref.en.sgm',], + 'zh-en': ['dev/newsdev2017-zhen-src.zh.sgm', 'dev/newsdev2017-zhen-ref.en.sgm',], }, 'wmt17/ms': { 'data': [ 'https://github.com/MicrosoftTranslator/Translator-HumanParityData/archive/master.zip', - 'http://data.statmt.org/wmt17/translation-task/test-update-1.tgz' - ], - 'md5': [ - '18fdaa7a3c84cf6ef688da1f6a5fa96f', - '91dbfd5af99bc6891a637a68e04dfd41' + 'http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', ], - 'description': - 'Additional Chinese-English references from Microsoft Research.', - 'citation': - '@inproceedings{achieving-human-parity-on-automatic-chinese-to-english-news-translation,\n author = {Hassan Awadalla, Hany and Aue, Anthony and Chen, Chang and Chowdhary, Vishal and Clark, Jonathan and Federmann, Christian and Huang, Xuedong and Junczys-Dowmunt, Marcin and Lewis, Will and Li, Mu and Liu, Shujie and Liu, Tie-Yan and Luo, Renqian and Menezes, Arul and Qin, Tao and Seide, Frank and Tan, Xu and Tian, Fei and Wu, Lijun and Wu, Shuangzhi and Xia, Yingce and Zhang, Dongdong and Zhang, Zhirui and Zhou, Ming},\n title = {Achieving Human Parity on Automatic Chinese to English News Translation},\n booktitle = {},\n year = {2018},\n month = {March},\n abstract = {Machine translation has made rapid advances in recent years. Millions of people are using it today in online translation systems and mobile applications in order to communicate across language barriers. The question naturally arises whether such systems can approach or achieve parity with human translations. In this paper, we first address the problem of how to define and accurately measure human parity in translation. We then describe Microsoft’s machine translation system and measure the quality of its translations on the widely used WMT 2017 news translation task from Chinese to English. We find that our latest neural machine translation system has reached a new state-of-the-art, and that the translation quality is at human parity when compared to professional human translations. 
We also find that it significantly exceeds the quality of crowd-sourced non-professional translations.},\n publisher = {},\n url = {https://www.microsoft.com/en-us/research/publication/achieving-human-parity-on-automatic-chinese-to-english-news-translation/},\n address = {},\n pages = {},\n journal = {},\n volume = {},\n chapter = {},\n isbn = {},\n}', + 'md5': ['18fdaa7a3c84cf6ef688da1f6a5fa96f', '91dbfd5af99bc6891a637a68e04dfd41',], + 'description': 'Additional Chinese-English references from Microsoft Research.', + 'citation': '@inproceedings{achieving-human-parity-on-automatic-chinese-to-english-news-translation,\n author = {Hassan Awadalla, Hany and Aue, Anthony and Chen, Chang and Chowdhary, Vishal and Clark, Jonathan and Federmann, Christian and Huang, Xuedong and Junczys-Dowmunt, Marcin and Lewis, Will and Li, Mu and Liu, Shujie and Liu, Tie-Yan and Luo, Renqian and Menezes, Arul and Qin, Tao and Seide, Frank and Tan, Xu and Tian, Fei and Wu, Lijun and Wu, Shuangzhi and Xia, Yingce and Zhang, Dongdong and Zhang, Zhirui and Zhou, Ming},\n title = {Achieving Human Parity on Automatic Chinese to English News Translation},\n booktitle = {},\n year = {2018},\n month = {March},\n abstract = {Machine translation has made rapid advances in recent years. Millions of people are using it today in online translation systems and mobile applications in order to communicate across language barriers. The question naturally arises whether such systems can approach or achieve parity with human translations. In this paper, we first address the problem of how to define and accurately measure human parity in translation. We then describe Microsoft’s machine translation system and measure the quality of its translations on the widely used WMT 2017 news translation task from Chinese to English. We find that our latest neural machine translation system has reached a new state-of-the-art, and that the translation quality is at human parity when compared to professional human translations. 
We also find that it significantly exceeds the quality of crowd-sourced non-professional translations.},\n publisher = {},\n url = {https://www.microsoft.com/en-us/research/publication/achieving-human-parity-on-automatic-chinese-to-english-news-translation/},\n address = {},\n pages = {},\n journal = {},\n volume = {},\n chapter = {},\n isbn = {},\n}', 'zh-en': [ - 'newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm', + 'newstest2017-zhen-src.zh.sgm', + 'newstest2017-zhen-ref.en.sgm', 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData-Reference-HT.txt', - 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData-Reference-PE.txt' + 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData-Reference-PE.txt', ], }, 'wmt16': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], 'md5': ['3d809cd0c2c86adb2c67034d15c4e446'], - 'description': - 'Official evaluation data.', - 'citation': - '@InProceedings{bojar-EtAl:2016:WMT1,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},\n title = {Findings of the 2016 Conference on Machine Translation},\n booktitle = {Proceedings of the First Conference on Machine Translation},\n month = {August},\n year = {2016},\n address = {Berlin, Germany},\n publisher = {Association for Computational Linguistics},\n pages = {131--198},\n url = {http://www.aclweb.org/anthology/W/W16/W16-2301}\n}', - 'cs-en': [ - 'test/newstest2016-csen-src.cs.sgm', - 'test/newstest2016-csen-ref.en.sgm' - ], - 'de-en': [ - 'test/newstest2016-deen-src.de.sgm', - 'test/newstest2016-deen-ref.en.sgm' - ], - 'en-cs': [ - 'test/newstest2016-encs-src.en.sgm', - 'test/newstest2016-encs-ref.cs.sgm' - ], - 'en-de': [ - 'test/newstest2016-ende-src.en.sgm', - 'test/newstest2016-ende-ref.de.sgm' - ], - 'en-fi': [ - 'test/newstest2016-enfi-src.en.sgm', - 'test/newstest2016-enfi-ref.fi.sgm' - ], - 'en-ro': [ - 'test/newstest2016-enro-src.en.sgm', - 'test/newstest2016-enro-ref.ro.sgm' - ], - 'en-ru': [ - 'test/newstest2016-enru-src.en.sgm', - 'test/newstest2016-enru-ref.ru.sgm' - ], - 'en-tr': [ - 'test/newstest2016-entr-src.en.sgm', - 'test/newstest2016-entr-ref.tr.sgm' - ], - 'fi-en': [ - 'test/newstest2016-fien-src.fi.sgm', - 'test/newstest2016-fien-ref.en.sgm' - ], - 'ro-en': [ - 'test/newstest2016-roen-src.ro.sgm', - 'test/newstest2016-roen-ref.en.sgm' - ], - 'ru-en': [ - 'test/newstest2016-ruen-src.ru.sgm', - 'test/newstest2016-ruen-ref.en.sgm' - ], - 'tr-en': [ - 'test/newstest2016-tren-src.tr.sgm', - 'test/newstest2016-tren-ref.en.sgm' - ], + 'description': 'Official evaluation data.', + 'citation': '@InProceedings{bojar-EtAl:2016:WMT1,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},\n 
title = {Findings of the 2016 Conference on Machine Translation},\n booktitle = {Proceedings of the First Conference on Machine Translation},\n month = {August},\n year = {2016},\n address = {Berlin, Germany},\n publisher = {Association for Computational Linguistics},\n pages = {131--198},\n url = {http://www.aclweb.org/anthology/W/W16/W16-2301}\n}', + 'cs-en': ['test/newstest2016-csen-src.cs.sgm', 'test/newstest2016-csen-ref.en.sgm',], + 'de-en': ['test/newstest2016-deen-src.de.sgm', 'test/newstest2016-deen-ref.en.sgm',], + 'en-cs': ['test/newstest2016-encs-src.en.sgm', 'test/newstest2016-encs-ref.cs.sgm',], + 'en-de': ['test/newstest2016-ende-src.en.sgm', 'test/newstest2016-ende-ref.de.sgm',], + 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstest2016-enfi-ref.fi.sgm',], + 'en-ro': ['test/newstest2016-enro-src.en.sgm', 'test/newstest2016-enro-ref.ro.sgm',], + 'en-ru': ['test/newstest2016-enru-src.en.sgm', 'test/newstest2016-enru-ref.ru.sgm',], + 'en-tr': ['test/newstest2016-entr-src.en.sgm', 'test/newstest2016-entr-ref.tr.sgm',], + 'fi-en': ['test/newstest2016-fien-src.fi.sgm', 'test/newstest2016-fien-ref.en.sgm',], + 'ro-en': ['test/newstest2016-roen-src.ro.sgm', 'test/newstest2016-roen-ref.en.sgm',], + 'ru-en': ['test/newstest2016-ruen-src.ru.sgm', 'test/newstest2016-ruen-ref.en.sgm',], + 'tr-en': ['test/newstest2016-tren-src.tr.sgm', 'test/newstest2016-tren-ref.en.sgm',], }, 'wmt16/B': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], 'md5': ['3d809cd0c2c86adb2c67034d15c4e446'], - 'description': - 'Additional reference for EN-FI.', - 'en-fi': [ - 'test/newstest2016-enfi-src.en.sgm', - 'test/newstestB2016-enfi-ref.fi.sgm' - ], + 'description': 'Additional reference for EN-FI.', + 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstestB2016-enfi-ref.fi.sgm',], }, 'wmt16/tworefs': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], 'md5': ['3d809cd0c2c86adb2c67034d15c4e446'], - 'description': - 'EN-FI with two references.', + 'description': 'EN-FI with two references.', 'en-fi': [ 'test/newstest2016-enfi-src.en.sgm', 'test/newstest2016-enfi-ref.fi.sgm', - 'test/newstestB2016-enfi-ref.fi.sgm' + 'test/newstestB2016-enfi-ref.fi.sgm', ], }, 'wmt16/dev': { 'data': ['http://data.statmt.org/wmt16/translation-task/dev.tgz'], 'md5': ['4a3dc2760bb077f4308cce96b06e6af6'], - 'description': - 'Development sets released for new languages in 2016.', - 'en-ro': - ['dev/newsdev2016-enro-src.en.sgm', - 'dev/newsdev2016-enro-ref.ro.sgm'], - 'en-tr': - ['dev/newsdev2016-entr-src.en.sgm', - 'dev/newsdev2016-entr-ref.tr.sgm'], - 'ro-en': - ['dev/newsdev2016-roen-src.ro.sgm', - 'dev/newsdev2016-roen-ref.en.sgm'], - 'tr-en': - ['dev/newsdev2016-tren-src.tr.sgm', - 'dev/newsdev2016-tren-ref.en.sgm'] + 'description': 'Development sets released for new languages in 2016.', + 'en-ro': ['dev/newsdev2016-enro-src.en.sgm', 'dev/newsdev2016-enro-ref.ro.sgm',], + 'en-tr': ['dev/newsdev2016-entr-src.en.sgm', 'dev/newsdev2016-entr-ref.tr.sgm',], + 'ro-en': ['dev/newsdev2016-roen-src.ro.sgm', 'dev/newsdev2016-roen-ref.en.sgm',], + 'tr-en': ['dev/newsdev2016-tren-src.tr.sgm', 'dev/newsdev2016-tren-ref.en.sgm',], }, 'wmt15': { 'data': ['http://statmt.org/wmt15/test.tgz'], 'md5': ['67e3beca15e69fe3d36de149da0a96df'], - 'description': - 'Official evaluation data.', - 'citation': - '@InProceedings{bojar-EtAl:2015:WMT,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Haddow, Barry and Huck, Matthias and Hokamp, Chris and Koehn, Philipp and 
Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Scarton, Carolina and Specia, Lucia and Turchi, Marco},\n title = {Findings of the 2015 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation},\n month = {September},\n year = {2015},\n address = {Lisbon, Portugal},\n publisher = {Association for Computational Linguistics},\n pages = {1--46},\n url = {http://aclweb.org/anthology/W15-3001}\n}', - 'en-fr': [ - 'test/newsdiscusstest2015-enfr-src.en.sgm', - 'test/newsdiscusstest2015-enfr-ref.fr.sgm' - ], - 'fr-en': [ - 'test/newsdiscusstest2015-fren-src.fr.sgm', - 'test/newsdiscusstest2015-fren-ref.en.sgm' - ], - 'cs-en': [ - 'test/newstest2015-csen-src.cs.sgm', - 'test/newstest2015-csen-ref.en.sgm' - ], - 'de-en': [ - 'test/newstest2015-deen-src.de.sgm', - 'test/newstest2015-deen-ref.en.sgm' - ], - 'en-cs': [ - 'test/newstest2015-encs-src.en.sgm', - 'test/newstest2015-encs-ref.cs.sgm' - ], - 'en-de': [ - 'test/newstest2015-ende-src.en.sgm', - 'test/newstest2015-ende-ref.de.sgm' - ], - 'en-fi': [ - 'test/newstest2015-enfi-src.en.sgm', - 'test/newstest2015-enfi-ref.fi.sgm' - ], - 'en-ru': [ - 'test/newstest2015-enru-src.en.sgm', - 'test/newstest2015-enru-ref.ru.sgm' - ], - 'fi-en': [ - 'test/newstest2015-fien-src.fi.sgm', - 'test/newstest2015-fien-ref.en.sgm' - ], - 'ru-en': [ - 'test/newstest2015-ruen-src.ru.sgm', - 'test/newstest2015-ruen-ref.en.sgm' - ] + 'description': 'Official evaluation data.', + 'citation': '@InProceedings{bojar-EtAl:2015:WMT,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Haddow, Barry and Huck, Matthias and Hokamp, Chris and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Scarton, Carolina and Specia, Lucia and Turchi, Marco},\n title = {Findings of the 2015 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation},\n month = {September},\n year = {2015},\n address = {Lisbon, Portugal},\n publisher = {Association for Computational Linguistics},\n pages = {1--46},\n url = {http://aclweb.org/anthology/W15-3001}\n}', + 'en-fr': ['test/newsdiscusstest2015-enfr-src.en.sgm', 'test/newsdiscusstest2015-enfr-ref.fr.sgm',], + 'fr-en': ['test/newsdiscusstest2015-fren-src.fr.sgm', 'test/newsdiscusstest2015-fren-ref.en.sgm',], + 'cs-en': ['test/newstest2015-csen-src.cs.sgm', 'test/newstest2015-csen-ref.en.sgm',], + 'de-en': ['test/newstest2015-deen-src.de.sgm', 'test/newstest2015-deen-ref.en.sgm',], + 'en-cs': ['test/newstest2015-encs-src.en.sgm', 'test/newstest2015-encs-ref.cs.sgm',], + 'en-de': ['test/newstest2015-ende-src.en.sgm', 'test/newstest2015-ende-ref.de.sgm',], + 'en-fi': ['test/newstest2015-enfi-src.en.sgm', 'test/newstest2015-enfi-ref.fi.sgm',], + 'en-ru': ['test/newstest2015-enru-src.en.sgm', 'test/newstest2015-enru-ref.ru.sgm',], + 'fi-en': ['test/newstest2015-fien-src.fi.sgm', 'test/newstest2015-fien-ref.en.sgm',], + 'ru-en': ['test/newstest2015-ruen-src.ru.sgm', 'test/newstest2015-ruen-ref.en.sgm',], }, 'wmt14': { 'data': ['http://statmt.org/wmt14/test-filtered.tgz'], 'md5': ['84c597844c1542e29c2aff23aaee4310'], - 'description': - 'Official evaluation data.', - 'citation': - '@InProceedings{bojar-EtAl:2014:W14-33,\n author = {Bojar, Ondrej and Buck, Christian and Federmann, Christian and Haddow, Barry and Koehn, Philipp and Leveling, Johannes and Monz, Christof and Pecina, Pavel and Post, Matt and Saint-Amand, 
Herve and Soricut, Radu and Specia, Lucia and Tamchyna, Ale\\v{s}},\n title = {Findings of the 2014 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},\n month = {June},\n year = {2014},\n address = {Baltimore, Maryland, USA},\n publisher = {Association for Computational Linguistics},\n pages = {12--58},\n url = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}', - 'cs-en': [ - 'test/newstest2014-csen-src.cs.sgm', - 'test/newstest2014-csen-ref.en.sgm' - ], - 'en-cs': [ - 'test/newstest2014-csen-src.en.sgm', - 'test/newstest2014-csen-ref.cs.sgm' - ], - 'de-en': [ - 'test/newstest2014-deen-src.de.sgm', - 'test/newstest2014-deen-ref.en.sgm' - ], - 'en-de': [ - 'test/newstest2014-deen-src.en.sgm', - 'test/newstest2014-deen-ref.de.sgm' - ], - 'en-fr': [ - 'test/newstest2014-fren-src.en.sgm', - 'test/newstest2014-fren-ref.fr.sgm' - ], - 'fr-en': [ - 'test/newstest2014-fren-src.fr.sgm', - 'test/newstest2014-fren-ref.en.sgm' - ], - 'en-hi': [ - 'test/newstest2014-hien-src.en.sgm', - 'test/newstest2014-hien-ref.hi.sgm' - ], - 'hi-en': [ - 'test/newstest2014-hien-src.hi.sgm', - 'test/newstest2014-hien-ref.en.sgm' - ], - 'en-ru': [ - 'test/newstest2014-ruen-src.en.sgm', - 'test/newstest2014-ruen-ref.ru.sgm' - ], - 'ru-en': [ - 'test/newstest2014-ruen-src.ru.sgm', - 'test/newstest2014-ruen-ref.en.sgm' - ] + 'description': 'Official evaluation data.', + 'citation': '@InProceedings{bojar-EtAl:2014:W14-33,\n author = {Bojar, Ondrej and Buck, Christian and Federmann, Christian and Haddow, Barry and Koehn, Philipp and Leveling, Johannes and Monz, Christof and Pecina, Pavel and Post, Matt and Saint-Amand, Herve and Soricut, Radu and Specia, Lucia and Tamchyna, Ale\\v{s}},\n title = {Findings of the 2014 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},\n month = {June},\n year = {2014},\n address = {Baltimore, Maryland, USA},\n publisher = {Association for Computational Linguistics},\n pages = {12--58},\n url = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}', + 'cs-en': ['test/newstest2014-csen-src.cs.sgm', 'test/newstest2014-csen-ref.en.sgm',], + 'en-cs': ['test/newstest2014-csen-src.en.sgm', 'test/newstest2014-csen-ref.cs.sgm',], + 'de-en': ['test/newstest2014-deen-src.de.sgm', 'test/newstest2014-deen-ref.en.sgm',], + 'en-de': ['test/newstest2014-deen-src.en.sgm', 'test/newstest2014-deen-ref.de.sgm',], + 'en-fr': ['test/newstest2014-fren-src.en.sgm', 'test/newstest2014-fren-ref.fr.sgm',], + 'fr-en': ['test/newstest2014-fren-src.fr.sgm', 'test/newstest2014-fren-ref.en.sgm',], + 'en-hi': ['test/newstest2014-hien-src.en.sgm', 'test/newstest2014-hien-ref.hi.sgm',], + 'hi-en': ['test/newstest2014-hien-src.hi.sgm', 'test/newstest2014-hien-ref.en.sgm',], + 'en-ru': ['test/newstest2014-ruen-src.en.sgm', 'test/newstest2014-ruen-ref.ru.sgm',], + 'ru-en': ['test/newstest2014-ruen-src.ru.sgm', 'test/newstest2014-ruen-ref.en.sgm',], }, 'wmt14/full': { 'data': ['http://statmt.org/wmt14/test-full.tgz'], 'md5': ['a8cd784e006feb32ac6f3d9ec7eb389a'], - 'description': - 'Evaluation data released after official evaluation for further research.', - 'cs-en': [ - 'test-full/newstest2014-csen-src.cs.sgm', - 'test-full/newstest2014-csen-ref.en.sgm' - ], - 'en-cs': [ - 'test-full/newstest2014-csen-src.en.sgm', - 'test-full/newstest2014-csen-ref.cs.sgm' - ], - 'de-en': [ - 'test-full/newstest2014-deen-src.de.sgm', - 'test-full/newstest2014-deen-ref.en.sgm' - ], - 
'en-de': [ - 'test-full/newstest2014-deen-src.en.sgm', - 'test-full/newstest2014-deen-ref.de.sgm' - ], - 'en-fr': [ - 'test-full/newstest2014-fren-src.en.sgm', - 'test-full/newstest2014-fren-ref.fr.sgm' - ], - 'fr-en': [ - 'test-full/newstest2014-fren-src.fr.sgm', - 'test-full/newstest2014-fren-ref.en.sgm' - ], - 'en-hi': [ - 'test-full/newstest2014-hien-src.en.sgm', - 'test-full/newstest2014-hien-ref.hi.sgm' - ], - 'hi-en': [ - 'test-full/newstest2014-hien-src.hi.sgm', - 'test-full/newstest2014-hien-ref.en.sgm' - ], - 'en-ru': [ - 'test-full/newstest2014-ruen-src.en.sgm', - 'test-full/newstest2014-ruen-ref.ru.sgm' - ], - 'ru-en': [ - 'test-full/newstest2014-ruen-src.ru.sgm', - 'test-full/newstest2014-ruen-ref.en.sgm' - ] + 'description': 'Evaluation data released after official evaluation for further research.', + 'cs-en': ['test-full/newstest2014-csen-src.cs.sgm', 'test-full/newstest2014-csen-ref.en.sgm',], + 'en-cs': ['test-full/newstest2014-csen-src.en.sgm', 'test-full/newstest2014-csen-ref.cs.sgm',], + 'de-en': ['test-full/newstest2014-deen-src.de.sgm', 'test-full/newstest2014-deen-ref.en.sgm',], + 'en-de': ['test-full/newstest2014-deen-src.en.sgm', 'test-full/newstest2014-deen-ref.de.sgm',], + 'en-fr': ['test-full/newstest2014-fren-src.en.sgm', 'test-full/newstest2014-fren-ref.fr.sgm',], + 'fr-en': ['test-full/newstest2014-fren-src.fr.sgm', 'test-full/newstest2014-fren-ref.en.sgm',], + 'en-hi': ['test-full/newstest2014-hien-src.en.sgm', 'test-full/newstest2014-hien-ref.hi.sgm',], + 'hi-en': ['test-full/newstest2014-hien-src.hi.sgm', 'test-full/newstest2014-hien-ref.en.sgm',], + 'en-ru': ['test-full/newstest2014-ruen-src.en.sgm', 'test-full/newstest2014-ruen-ref.ru.sgm',], + 'ru-en': ['test-full/newstest2014-ruen-src.ru.sgm', 'test-full/newstest2014-ruen-ref.en.sgm',], }, 'wmt13': { 'data': ['http://statmt.org/wmt13/test.tgz'], 'md5': ['48eca5d02f637af44e85186847141f67'], 'description': 'Official evaluation data.', - 'citation': - '@InProceedings{bojar-EtAl:2013:WMT,\n author = {Bojar, Ond\\v{r}ej and Buck, Christian and Callison-Burch, Chris and Federmann, Christian and Haddow, Barry and Koehn, Philipp and Monz, Christof and Post, Matt and Soricut, Radu and Specia, Lucia},\n title = {Findings of the 2013 {Workshop on Statistical Machine Translation}},\n booktitle = {Proceedings of the Eighth Workshop on Statistical Machine Translation},\n month = {August},\n year = {2013},\n address = {Sofia, Bulgaria},\n publisher = {Association for Computational Linguistics},\n pages = {1--44},\n url = {http://www.aclweb.org/anthology/W13-2201}\n}', - 'cs-en': - ['test/newstest2013-src.cs.sgm', 'test/newstest2013-src.en.sgm'], - 'en-cs': - ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.cs.sgm'], - 'de-en': - ['test/newstest2013-src.de.sgm', 'test/newstest2013-src.en.sgm'], - 'en-de': - ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.de.sgm'], - 'es-en': - ['test/newstest2013-src.es.sgm', 'test/newstest2013-src.en.sgm'], - 'en-es': - ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.es.sgm'], - 'fr-en': - ['test/newstest2013-src.fr.sgm', 'test/newstest2013-src.en.sgm'], - 'en-fr': - ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.fr.sgm'], - 'ru-en': - ['test/newstest2013-src.ru.sgm', 'test/newstest2013-src.en.sgm'], - 'en-ru': - ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.ru.sgm'] + 'citation': '@InProceedings{bojar-EtAl:2013:WMT,\n author = {Bojar, Ond\\v{r}ej and Buck, Christian and Callison-Burch, Chris and Federmann, Christian and Haddow, Barry and 
Koehn, Philipp and Monz, Christof and Post, Matt and Soricut, Radu and Specia, Lucia},\n title = {Findings of the 2013 {Workshop on Statistical Machine Translation}},\n booktitle = {Proceedings of the Eighth Workshop on Statistical Machine Translation},\n month = {August},\n year = {2013},\n address = {Sofia, Bulgaria},\n publisher = {Association for Computational Linguistics},\n pages = {1--44},\n url = {http://www.aclweb.org/anthology/W13-2201}\n}', + 'cs-en': ['test/newstest2013-src.cs.sgm', 'test/newstest2013-src.en.sgm',], + 'en-cs': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.cs.sgm',], + 'de-en': ['test/newstest2013-src.de.sgm', 'test/newstest2013-src.en.sgm',], + 'en-de': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.de.sgm',], + 'es-en': ['test/newstest2013-src.es.sgm', 'test/newstest2013-src.en.sgm',], + 'en-es': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.es.sgm',], + 'fr-en': ['test/newstest2013-src.fr.sgm', 'test/newstest2013-src.en.sgm',], + 'en-fr': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.fr.sgm',], + 'ru-en': ['test/newstest2013-src.ru.sgm', 'test/newstest2013-src.en.sgm',], + 'en-ru': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.ru.sgm',], }, 'wmt12': { 'data': ['http://statmt.org/wmt12/test.tgz'], 'md5': ['608232d34ebc4ba2ff70fead45674e47'], 'description': 'Official evaluation data.', - 'citation': - '@InProceedings{callisonburch-EtAl:2012:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Post, Matt and Soricut, Radu and Specia, Lucia},\n title = {Findings of the 2012 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation},\n month = {June},\n year = {2012},\n address = {Montr{\'e}al, Canada},\n publisher = {Association for Computational Linguistics},\n pages = {10--51},\n url = {http://www.aclweb.org/anthology/W12-3102}\n}', - 'cs-en': - ['test/newstest2012-src.cs.sgm', 'test/newstest2012-src.en.sgm'], - 'en-cs': - ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.cs.sgm'], - 'de-en': - ['test/newstest2012-src.de.sgm', 'test/newstest2012-src.en.sgm'], - 'en-de': - ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.de.sgm'], - 'es-en': - ['test/newstest2012-src.es.sgm', 'test/newstest2012-src.en.sgm'], - 'en-es': - ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.es.sgm'], - 'fr-en': - ['test/newstest2012-src.fr.sgm', 'test/newstest2012-src.en.sgm'], - 'en-fr': - ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.fr.sgm'] + 'citation': '@InProceedings{callisonburch-EtAl:2012:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Post, Matt and Soricut, Radu and Specia, Lucia},\n title = {Findings of the 2012 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation},\n month = {June},\n year = {2012},\n address = {Montr{\'e}al, Canada},\n publisher = {Association for Computational Linguistics},\n pages = {10--51},\n url = {http://www.aclweb.org/anthology/W12-3102}\n}', + 'cs-en': ['test/newstest2012-src.cs.sgm', 'test/newstest2012-src.en.sgm',], + 'en-cs': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.cs.sgm',], + 'de-en': ['test/newstest2012-src.de.sgm', 'test/newstest2012-src.en.sgm',], + 'en-de': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.de.sgm',], + 'es-en': ['test/newstest2012-src.es.sgm', 'test/newstest2012-src.en.sgm',], + 'en-es': 
['test/newstest2012-src.en.sgm', 'test/newstest2012-src.es.sgm',], + 'fr-en': ['test/newstest2012-src.fr.sgm', 'test/newstest2012-src.en.sgm',], + 'en-fr': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.fr.sgm',], }, 'wmt11': { 'data': ['http://statmt.org/wmt11/test.tgz'], 'md5': ['b0c9680adf32d394aefc2b24e3a5937e'], 'description': 'Official evaluation data.', - 'citation': - '@InProceedings{callisonburch-EtAl:2011:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Zaidan, Omar},\n title = {Findings of the 2011 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n month = {July},\n year = {2011},\n address = {Edinburgh, Scotland},\n publisher = {Association for Computational Linguistics},\n pages = {22--64},\n url = {http://www.aclweb.org/anthology/W11-2103}\n}', + 'citation': '@InProceedings{callisonburch-EtAl:2011:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Zaidan, Omar},\n title = {Findings of the 2011 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n month = {July},\n year = {2011},\n address = {Edinburgh, Scotland},\n publisher = {Association for Computational Linguistics},\n pages = {22--64},\n url = {http://www.aclweb.org/anthology/W11-2103}\n}', 'cs-en': ['newstest2011-src.cs.sgm', 'newstest2011-src.en.sgm'], 'en-cs': ['newstest2011-src.en.sgm', 'newstest2011-src.cs.sgm'], 'de-en': ['newstest2011-src.de.sgm', 'newstest2011-src.en.sgm'], @@ -778,98 +402,62 @@ 'fr-en': ['newstest2011-src.fr.sgm', 'newstest2011-src.en.sgm'], 'en-fr': ['newstest2011-src.en.sgm', 'newstest2011-src.fr.sgm'], 'es-en': ['newstest2011-src.es.sgm', 'newstest2011-src.en.sgm'], - 'en-es': ['newstest2011-src.en.sgm', 'newstest2011-src.es.sgm'] + 'en-es': ['newstest2011-src.en.sgm', 'newstest2011-src.es.sgm'], }, 'wmt10': { 'data': ['http://statmt.org/wmt10/test.tgz'], 'md5': ['491cb885a355da5a23ea66e7b3024d5c'], 'description': 'Official evaluation data.', - 'citation': - '@InProceedings{callisonburch-EtAl:2010:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Peterson, Kay and Przybocki, Mark and Zaidan, Omar},\n title = {Findings of the 2010 Joint Workshop on Statistical Machine Translation and Metrics for Machine Translation},\n booktitle = {Proceedings of the Joint Fifth Workshop on Statistical Machine Translation and MetricsMATR},\n month = {July},\n year = {2010},\n address = {Uppsala, Sweden},\n publisher = {Association for Computational Linguistics},\n pages = {17--53},\n note = {Revised August 2010},\n url = {http://www.aclweb.org/anthology/W10-1703}\n}', - 'cs-en': - ['test/newstest2010-src.cz.sgm', 'test/newstest2010-src.en.sgm'], - 'en-cs': - ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.cz.sgm'], - 'de-en': - ['test/newstest2010-src.de.sgm', 'test/newstest2010-src.en.sgm'], - 'en-de': - ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.de.sgm'], - 'es-en': - ['test/newstest2010-src.es.sgm', 'test/newstest2010-src.en.sgm'], - 'en-es': - ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.es.sgm'], - 'fr-en': - ['test/newstest2010-src.fr.sgm', 'test/newstest2010-src.en.sgm'], - 'en-fr': - ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.fr.sgm'] + 'citation': '@InProceedings{callisonburch-EtAl:2010:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Peterson, Kay and Przybocki, 
Mark and Zaidan, Omar},\n title = {Findings of the 2010 Joint Workshop on Statistical Machine Translation and Metrics for Machine Translation},\n booktitle = {Proceedings of the Joint Fifth Workshop on Statistical Machine Translation and MetricsMATR},\n month = {July},\n year = {2010},\n address = {Uppsala, Sweden},\n publisher = {Association for Computational Linguistics},\n pages = {17--53},\n note = {Revised August 2010},\n url = {http://www.aclweb.org/anthology/W10-1703}\n}', + 'cs-en': ['test/newstest2010-src.cz.sgm', 'test/newstest2010-src.en.sgm',], + 'en-cs': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.cz.sgm',], + 'de-en': ['test/newstest2010-src.de.sgm', 'test/newstest2010-src.en.sgm',], + 'en-de': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.de.sgm',], + 'es-en': ['test/newstest2010-src.es.sgm', 'test/newstest2010-src.en.sgm',], + 'en-es': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.es.sgm',], + 'fr-en': ['test/newstest2010-src.fr.sgm', 'test/newstest2010-src.en.sgm',], + 'en-fr': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.fr.sgm',], }, 'wmt09': { 'data': ['http://statmt.org/wmt09/test.tgz'], 'md5': ['da227abfbd7b666ec175b742a0d27b37'], 'description': 'Official evaluation data.', - 'citation': - '@InProceedings{callisonburch-EtAl:2009:WMT-09,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Schroeder, Josh},\n title = {Findings of the 2009 {W}orkshop on {S}tatistical {M}achine {T}ranslation},\n booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation},\n month = {March},\n year = {2009},\n address = {Athens, Greece},\n publisher = {Association for Computational Linguistics},\n pages = {1--28},\n url = {http://www.aclweb.org/anthology/W/W09/W09-0401}\n}', - 'cs-en': - ['test/newstest2009-src.cz.sgm', 'test/newstest2009-src.en.sgm'], - 'en-cs': - ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.cz.sgm'], - 'de-en': - ['test/newstest2009-src.de.sgm', 'test/newstest2009-src.en.sgm'], - 'en-de': - ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.de.sgm'], - 'es-en': - ['test/newstest2009-src.es.sgm', 'test/newstest2009-src.en.sgm'], - 'en-es': - ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.es.sgm'], - 'fr-en': - ['test/newstest2009-src.fr.sgm', 'test/newstest2009-src.en.sgm'], - 'en-fr': - ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.fr.sgm'], - 'hu-en': - ['test/newstest2009-src.hu.sgm', 'test/newstest2009-src.en.sgm'], - 'en-hu': - ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.hu.sgm'], - 'it-en': - ['test/newstest2009-src.it.sgm', 'test/newstest2009-src.en.sgm'], - 'en-it': [ - 'test/newstest2009-src.en.sgm', 'test/newstest2009-src.it.sgm' - ] + 'citation': '@InProceedings{callisonburch-EtAl:2009:WMT-09,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Schroeder, Josh},\n title = {Findings of the 2009 {W}orkshop on {S}tatistical {M}achine {T}ranslation},\n booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation},\n month = {March},\n year = {2009},\n address = {Athens, Greece},\n publisher = {Association for Computational Linguistics},\n pages = {1--28},\n url = {http://www.aclweb.org/anthology/W/W09/W09-0401}\n}', + 'cs-en': ['test/newstest2009-src.cz.sgm', 'test/newstest2009-src.en.sgm',], + 'en-cs': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.cz.sgm',], + 'de-en': ['test/newstest2009-src.de.sgm', 'test/newstest2009-src.en.sgm',], + 'en-de': 
['test/newstest2009-src.en.sgm', 'test/newstest2009-src.de.sgm',], + 'es-en': ['test/newstest2009-src.es.sgm', 'test/newstest2009-src.en.sgm',], + 'en-es': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.es.sgm',], + 'fr-en': ['test/newstest2009-src.fr.sgm', 'test/newstest2009-src.en.sgm',], + 'en-fr': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.fr.sgm',], + 'hu-en': ['test/newstest2009-src.hu.sgm', 'test/newstest2009-src.en.sgm',], + 'en-hu': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.hu.sgm',], + 'it-en': ['test/newstest2009-src.it.sgm', 'test/newstest2009-src.en.sgm',], + 'en-it': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.it.sgm',], }, 'wmt08': { 'data': ['http://statmt.org/wmt08/test.tgz'], 'md5': ['0582e4e894a3342044059c894e1aea3d'], 'description': 'Official evaluation data.', - 'citation': - '@InProceedings{callisonburch-EtAl:2008:WMT,\n author = {Callison-Burch, Chris and Fordyce, Cameron and Koehn, Philipp and Monz, Christof and Schroeder, Josh},\n title = {Further Meta-Evaluation of Machine Translation},\n booktitle = {Proceedings of the Third Workshop on Statistical Machine Translation},\n month = {June},\n year = {2008},\n address = {Columbus, Ohio},\n publisher = {Association for Computational Linguistics},\n pages = {70--106},\n url = {http://www.aclweb.org/anthology/W/W08/W08-0309}\n}', - 'cs-en': - ['test/newstest2008-src.cz.sgm', 'test/newstest2008-src.en.sgm'], - 'en-cs': - ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.cz.sgm'], - 'de-en': - ['test/newstest2008-src.de.sgm', 'test/newstest2008-src.en.sgm'], - 'en-de': - ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.de.sgm'], - 'es-en': - ['test/newstest2008-src.es.sgm', 'test/newstest2008-src.en.sgm'], - 'en-es': - ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.es.sgm'], - 'fr-en': - ['test/newstest2008-src.fr.sgm', 'test/newstest2008-src.en.sgm'], - 'en-fr': - ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.fr.sgm'], - 'hu-en': - ['test/newstest2008-src.hu.sgm', 'test/newstest2008-src.en.sgm'], - 'en-hu': - ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.hu.sgm'] + 'citation': '@InProceedings{callisonburch-EtAl:2008:WMT,\n author = {Callison-Burch, Chris and Fordyce, Cameron and Koehn, Philipp and Monz, Christof and Schroeder, Josh},\n title = {Further Meta-Evaluation of Machine Translation},\n booktitle = {Proceedings of the Third Workshop on Statistical Machine Translation},\n month = {June},\n year = {2008},\n address = {Columbus, Ohio},\n publisher = {Association for Computational Linguistics},\n pages = {70--106},\n url = {http://www.aclweb.org/anthology/W/W08/W08-0309}\n}', + 'cs-en': ['test/newstest2008-src.cz.sgm', 'test/newstest2008-src.en.sgm',], + 'en-cs': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.cz.sgm',], + 'de-en': ['test/newstest2008-src.de.sgm', 'test/newstest2008-src.en.sgm',], + 'en-de': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.de.sgm',], + 'es-en': ['test/newstest2008-src.es.sgm', 'test/newstest2008-src.en.sgm',], + 'en-es': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.es.sgm',], + 'fr-en': ['test/newstest2008-src.fr.sgm', 'test/newstest2008-src.en.sgm',], + 'en-fr': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.fr.sgm',], + 'hu-en': ['test/newstest2008-src.hu.sgm', 'test/newstest2008-src.en.sgm',], + 'en-hu': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.hu.sgm',], }, 'wmt08/nc': { 'data': ['http://statmt.org/wmt08/test.tgz'], 'md5': 
['0582e4e894a3342044059c894e1aea3d'], 'description': 'Official evaluation data (news commentary).', - 'cs-en': - ['test/nc-test2008-src.cz.sgm', 'test/nc-test2008-src.en.sgm'], - 'en-cs': - ['test/nc-test2008-src.en.sgm', 'test/nc-test2008-src.cz.sgm'] + 'cs-en': ['test/nc-test2008-src.cz.sgm', 'test/nc-test2008-src.en.sgm',], + 'en-cs': ['test/nc-test2008-src.en.sgm', 'test/nc-test2008-src.cz.sgm',], }, 'wmt08/europarl': { 'data': ['http://statmt.org/wmt08/test.tgz'], @@ -880,7 +468,7 @@ 'es-en': ['test/test2008-src.es.sgm', 'test/test2008-src.en.sgm'], 'en-es': ['test/test2008-src.en.sgm', 'test/test2008-src.es.sgm'], 'fr-en': ['test/test2008-src.fr.sgm', 'test/test2008-src.en.sgm'], - 'en-fr': ['test/test2008-src.en.sgm', 'test/test2008-src.fr.sgm'] + 'en-fr': ['test/test2008-src.en.sgm', 'test/test2008-src.fr.sgm'], }, 'iwslt17': { 'data': [ @@ -895,7 +483,7 @@ 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/en/ko/en-ko.tgz', 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/ko/en/ko-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/zh/en/zh-en.tgz', ], 'md5': [ "1849bcc3b006dc0642a8843b11aa7192", @@ -909,36 +497,16 @@ "59f6a81c707378176e9ad8bb8d811f5f", "7e580af973bb389ec1d1378a1850742f", "975a858783a0ebec8c57d83ddd5bd381", - "cc51d9b7fe1ff2af858c6a0dd80b8815" - ], - 'description': - 'Official evaluation data for IWSLT.', - 'citation': - '@InProceedings{iwslt2017,\n author = {Cettolo, Mauro and Federico, Marcello and Bentivogli, Luisa and Niehues, Jan and Stüker, Sebastian and Sudoh, Katsuitho and Yoshino, Koichiro and Federmann, Christian},\n title = {Overview of the IWSLT 2017 Evaluation Campaign},\n booktitle = {14th International Workshop on Spoken Language Translation},\n month = {December},\n year = {2017},\n address = {Tokyo, Japan},\n pages = {2--14},\n url = {http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf}\n}', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2017.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2017.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2017.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2017.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml' - ], + "cc51d9b7fe1ff2af858c6a0dd80b8815", + ], + 'description': 'Official evaluation data for IWSLT.', + 'citation': '@InProceedings{iwslt2017,\n author = {Cettolo, Mauro and Federico, Marcello and Bentivogli, Luisa and Niehues, Jan and Stüker, Sebastian and Sudoh, Katsuitho and Yoshino, Koichiro and Federmann, Christian},\n title = {Overview of the IWSLT 2017 Evaluation Campaign},\n booktitle = {14th International Workshop on Spoken Language Translation},\n month = {December},\n year = {2017},\n address = {Tokyo, Japan},\n pages = {2--14},\n url = {http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf}\n}', + 'en-fr': ['en-fr/IWSLT17.TED.tst2017.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2017.en-de.en.xml', 
'de-en/IWSLT17.TED.tst2017.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2017.de-en.de.xml', 'en-de/IWSLT17.TED.tst2017.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2017.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml',], }, 'iwslt17/tst2016': { 'data': [ @@ -947,7 +515,7 @@ 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/en/de/en-de.tgz', 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/de/en/de-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-ted-test/texts/zh/en/zh-en.tgz', ], "md5": [ "1849bcc3b006dc0642a8843b11aa7192", @@ -955,34 +523,15 @@ "b68e7097b179491f6c466ef41ad72b9b", "e3f5b2a075a2da1a395c8b60bf1e9be1", "975a858783a0ebec8c57d83ddd5bd381", - "cc51d9b7fe1ff2af858c6a0dd80b8815" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2016.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2016.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2016.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2016.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml' - ], + "cc51d9b7fe1ff2af858c6a0dd80b8815", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2016.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2016.en-de.en.xml', 'de-en/IWSLT17.TED.tst2016.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2016.de-en.de.xml', 'en-de/IWSLT17.TED.tst2016.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2016.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml',], }, 'iwslt17/tst2015': { 'data': [ @@ -991,7 +540,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -999,34 +548,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2015.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2015.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2015.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2015.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml' - ], - 'zh-en': [ - 
'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2015.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2015.en-de.en.xml', 'de-en/IWSLT17.TED.tst2015.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2015.de-en.de.xml', 'en-de/IWSLT17.TED.tst2015.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2015.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml',], }, 'iwslt17/tst2014': { 'data': [ @@ -1035,7 +565,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -1043,34 +573,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2014.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2014.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2014.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2014.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2014.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2014.en-de.en.xml', 'de-en/IWSLT17.TED.tst2014.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2014.de-en.de.xml', 'en-de/IWSLT17.TED.tst2014.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2014.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml',], }, 'iwslt17/tst2013': { 'data': [ @@ -1079,7 +590,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -1087,34 +598,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml' - ], - 'fr-en': [ 
- 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2013.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2013.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2013.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2013.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2013.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2013.en-de.en.xml', 'de-en/IWSLT17.TED.tst2013.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2013.de-en.de.xml', 'en-de/IWSLT17.TED.tst2013.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2013.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml',], }, 'iwslt17/tst2012': { 'data': [ @@ -1123,7 +615,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -1131,34 +623,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2012.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2012.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2012.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2012.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2012.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2012.en-de.en.xml', 'de-en/IWSLT17.TED.tst2012.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2012.de-en.de.xml', 'en-de/IWSLT17.TED.tst2012.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2012.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml',], }, 'iwslt17/tst2011': { 'data': [ @@ -1167,7 +640,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 
'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -1175,34 +648,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2011.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2011.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2011.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2011.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2011.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2011.en-de.en.xml', 'de-en/IWSLT17.TED.tst2011.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2011.de-en.de.xml', 'en-de/IWSLT17.TED.tst2011.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2011.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml',], }, 'iwslt17/tst2010': { 'data': [ @@ -1211,7 +665,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -1219,34 +673,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml', - 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.tst2010.en-de.en.xml', - 'de-en/IWSLT17.TED.tst2010.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.tst2010.de-en.de.xml', - 'en-de/IWSLT17.TED.tst2010.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml', - 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.tst2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.tst2010.en-de.en.xml', 'de-en/IWSLT17.TED.tst2010.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.tst2010.de-en.de.xml', 'en-de/IWSLT17.TED.tst2010.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.tst2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml',], + 
'zh-en': ['zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml',], }, 'iwslt17/dev2010': { 'data': [ @@ -1255,7 +690,7 @@ 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/fr/en-fr.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz', 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/en/zh/en-zh.tgz', - 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz' + 'https://wit3.fbk.eu/archive/2017-01-trnted/texts/zh/en/zh-en.tgz', ], "md5": [ "d8a32cfc002a4f12b17429cfa78050e6", @@ -1263,34 +698,15 @@ "3cf07ebe305312b12f7f1a4d5f8f8377", "19927da9de0f40348cad9c0fc61642ac", "575b788dad6c5b9c5cee636f9ac1094a", - "1c0ae40171d52593df8a6963d3828116" - ], - 'description': - 'Development data for IWSLT 2017.', - 'en-fr': [ - 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml', - 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml' - ], - 'fr-en': [ - 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml', - 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml' - ], - 'en-de': [ - 'en-de/IWSLT17.TED.dev2010.en-de.en.xml', - 'de-en/IWSLT17.TED.dev2010.de-en.de.xml' - ], - 'de-en': [ - 'de-en/IWSLT17.TED.dev2010.de-en.de.xml', - 'en-de/IWSLT17.TED.dev2010.en-de.en.xml' - ], - 'en-zh': [ - 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml', - 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml' - ], - 'zh-en': [ - 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml', - 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml' - ], + "1c0ae40171d52593df8a6963d3828116", + ], + 'description': 'Development data for IWSLT 2017.', + 'en-fr': ['en-fr/IWSLT17.TED.dev2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml',], + 'fr-en': ['fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml',], + 'en-de': ['en-de/IWSLT17.TED.dev2010.en-de.en.xml', 'de-en/IWSLT17.TED.dev2010.de-en.de.xml',], + 'de-en': ['de-en/IWSLT17.TED.dev2010.de-en.de.xml', 'en-de/IWSLT17.TED.dev2010.en-de.en.xml',], + 'en-zh': ['en-zh/IWSLT17.TED.dev2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml',], + 'zh-en': ['zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml',], }, } @@ -1321,12 +737,9 @@ def tokenize_13a(line): # language-dependent part (assuming Western languages): norm = " {} ".format(norm) norm = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', norm) - norm = re.sub(r'([^0-9])([\.,])', '\\1 \\2 ', - norm) # tokenize period and comma unless preceded by a digit - norm = re.sub(r'([\.,])([^0-9])', ' \\1 \\2', - norm) # tokenize period and comma unless followed by a digit - norm = re.sub(r'([0-9])(-)', '\\1 \\2 ', - norm) # tokenize dash when preceded by a digit + norm = re.sub(r'([^0-9])([\.,])', '\\1 \\2 ', norm) # tokenize period and comma unless preceded by a digit + norm = re.sub(r'([\.,])([^0-9])', ' \\1 \\2', norm) # tokenize period and comma unless followed by a digit + norm = re.sub(r'([0-9])(-)', '\\1 \\2 ', norm) # tokenize dash when preceded by a digit norm = re.sub(r'\s+', ' ', norm) # one space only between words norm = re.sub(r'^\s+', '', norm) # no leading space norm = re.sub(r'\s+$', '', norm) # no trailing space @@ -1340,9 +753,7 @@ class UnicodeRegex: without depending on https://pypi.python.org/pypi/regex/.""" def _property_chars(prefix): - return ''.join( - chr(x) for x in range(sys.maxunicode) - if unicodedata.category(chr(x)).startswith(prefix)) + return ''.join(chr(x) for x in range(sys.maxunicode) if unicodedata.category(chr(x)).startswith(prefix)) punctuation = _property_chars('P') nondigit_punct_re = re.compile(r'([^\d])([' + punctuation + r'])') @@ -1566,7 +977,7 @@ def 
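The tokenize_13a hunk above only reflows mteval-v13a's rules, it does not change them: periods and commas are split off unless a digit sits on the protected side, and a dash is split only when it follows a digit, so decimals such as 2.5 survive untouched. A minimal sketch of just those three digit-sensitive rules (illustrative only; the full tokenizer also normalizes entities and general punctuation):

    import re

    def tokenize_13a_sketch(line: str) -> str:
        # Only the digit-sensitive rules from the hunk above.
        norm = " {} ".format(line)
        norm = re.sub(r'([^0-9])([\.,])', '\\1 \\2 ', norm)  # split '.'/',' unless a digit precedes
        norm = re.sub(r'([\.,])([^0-9])', ' \\1 \\2', norm)  # split '.'/',' unless a digit follows
        norm = re.sub(r'([0-9])(-)', '\\1 \\2 ', norm)       # split a dash that follows a digit
        return re.sub(r'\s+', ' ', norm).strip()             # collapse whitespace

    print(tokenize_13a_sketch("The 7-day average was 2.5, not 3."))
    # -> 'The 7 - day average was 2.5 , not 3 .'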
bleu_signature(args, numrefs): 'case': 'c', 'tok': 'tok', 'numrefs': '#', - 'version': 'v' + 'version': 'v', } signature = { @@ -1574,7 +985,7 @@ def bleu_signature(args, numrefs): 'version': VERSION, 'smooth': args.smooth, 'numrefs': numrefs, - 'case': 'lc' if args.lc else 'mixed' + 'case': 'lc' if args.lc else 'mixed', } if args.test_set is not None: @@ -1583,10 +994,7 @@ def bleu_signature(args, numrefs): if args.langpair is not None: signature['lang'] = args.langpair - sigstr = '+'.join([ - '{}.{}'.format(abbr[x] if args.short else x, signature[x]) - for x in sorted(signature.keys()) - ]) + sigstr = '+'.join(['{}.{}'.format(abbr[x] if args.short else x, signature[x]) for x in sorted(signature.keys())]) return sigstr @@ -1606,7 +1014,7 @@ def chrf_signature(args, numrefs): 'space': 's', 'case': 'c', 'numrefs': '#', - 'version': 'v' + 'version': 'v', } signature = { @@ -1615,7 +1023,7 @@ def chrf_signature(args, numrefs): 'space': args.chrf_whitespace, 'numchars': args.chrf_order, 'numrefs': numrefs, - 'case': 'lc' if args.lc else 'mixed' + 'case': 'lc' if args.lc else 'mixed', } if args.test_set is not None: @@ -1624,10 +1032,7 @@ def chrf_signature(args, numrefs): if args.langpair is not None: signature['lang'] = args.langpair - sigstr = '+'.join([ - '{}.{}'.format(abbr[x] if args.short else x, signature[x]) - for x in sorted(signature.keys()) - ]) + sigstr = '+'.join(['{}.{}'.format(abbr[x] if args.short else x, signature[x]) for x in sorted(signature.keys())]) return sigstr @@ -1644,7 +1049,7 @@ def extract_ngrams(line, min_order=1, max_order=NGRAM_ORDER) -> Counter: tokens = line.split() for n in range(min_order, max_order + 1): for i in range(0, len(tokens) - n + 1): - ngram = ' '.join(tokens[i:i + n]) + ngram = ' '.join(tokens[i : i + n]) ngrams[ngram] += 1 return ngrams @@ -1654,7 +1059,7 @@ def extract_char_ngrams(s: str, n: int) -> Counter: """ Yields counts of character n-grams from string s of order n. 
""" - return Counter([s[i:i + n] for i in range(len(s) - n + 1)]) + return Counter([s[i : i + n] for i in range(len(s) - n + 1)]) def ref_stats(output, refs): @@ -1703,16 +1108,16 @@ def process_to_text(rawfile, txtfile, field: int = None): with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: if line.startswith('(.*).*?', '\\1', line)), - file=fout) + print( + _clean(re.sub(r'(.*).*?', '\\1', line)), file=fout, + ) elif rawfile.endswith('.xml'): # IWSLT with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: if line.startswith('(.*).*?', '\\1', line)), - file=fout) + print( + _clean(re.sub(r'(.*).*?', '\\1', line)), file=fout, + ) elif rawfile.endswith('.txt'): # wmt17/ms with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: @@ -1753,17 +1158,14 @@ def download_test_set(test_set, langpair=None): logging.info('Creating %s', outdir) os.makedirs(outdir) - expected_checksums = DATASETS[test_set].get('md5', [None] * - len(DATASETS[test_set])) - for dataset, expected_md5 in zip(DATASETS[test_set]['data'], - expected_checksums): + expected_checksums = DATASETS[test_set].get('md5', [None] * len(DATASETS[test_set])) + for dataset, expected_md5 in zip(DATASETS[test_set]['data'], expected_checksums): tarball = os.path.join(outdir, os.path.basename(dataset)) rawdir = os.path.join(outdir, 'raw') if not os.path.exists(tarball) or os.path.getsize(tarball) == 0: logging.info("Downloading %s to %s", dataset, tarball) try: - with urllib.request.urlopen(dataset) as f, open(tarball, - 'wb') as out: + with urllib.request.urlopen(dataset) as f, open(tarball, 'wb') as out: out.write(f.read()) except ssl.SSLError: logging.warning( @@ -1771,7 +1173,8 @@ def download_test_set(test_set, langpair=None): 'If you\'re on a Mac, ' 'you may need to run the "Install Certificates.command" ' 'file located in the ' - '"Python 3" folder, often found under /Applications') + '"Python 3" folder, often found under /Applications' + ) sys.exit(1) # Check md5sum @@ -1783,11 +1186,9 @@ def download_test_set(test_set, langpair=None): if md5.hexdigest() != expected_md5: logging.error( 'Fatal: MD5 sum of downloaded file was incorrect (got ' - '{}, expected {}).'.format( - md5.hexdigest(), expected_md5)) - logging.error( - 'Please manually delete "{}" and rerun the command.'. 
- format(tarball)) + '{}, expected {}).'.format(md5.hexdigest(), expected_md5) + ) + logging.error('Please manually delete "{}" and rerun the command.'.format(tarball)) logging.error( 'If the problem persists, the tarball may have ' 'changed, in which case, please contact the SacreBLEU ' @@ -1801,10 +1202,12 @@ def download_test_set(test_set, langpair=None): logging.info('Extracting %s', tarball) if tarball.endswith('.tar.gz') or tarball.endswith('.tgz'): import tarfile + tar = tarfile.open(tarball) tar.extractall(path=rawdir) elif tarball.endswith('.zip'): import zipfile + zipfile = zipfile.ZipFile(tarball, 'r') zipfile.extractall(path=rawdir) zipfile.close() @@ -1844,28 +1247,30 @@ def download_test_set(test_set, langpair=None): return found -class BLEU( - namedtuple('BaseBLEU', - 'score, counts, totals, precisions, bp, sys_len, ref_len')): +class BLEU(namedtuple('BaseBLEU', 'score, counts, totals, precisions, bp, sys_len, ref_len')): def format(self, width=2): precisions = "/".join(["{:.1f}".format(p) for p in self.precisions]) - return f'BLEU = {self.score:.{width}f} {precisions}' \ - f'(BP = {self.bp:.3f}' \ - f' ratio = {(self.sys_len / self.ref_len):.3f}' \ - f' hyp_len = {self.sys_len:d}' \ - f' ref_len = {self.ref_len:d})' + return ( + f'BLEU = {self.score:.{width}f} {precisions}' + f'(BP = {self.bp:.3f}' + f' ratio = {(self.sys_len / self.ref_len):.3f}' + f' hyp_len = {self.sys_len:d}' + f' ref_len = {self.ref_len:d})' + ) def __str__(self): return self.format() -def compute_bleu(correct: List[int], - total: List[int], - sys_len: int, - ref_len: int, - smooth_method='none', - smooth_value=SMOOTH_VALUE_DEFAULT, - use_effective_order=False) -> BLEU: +def compute_bleu( + correct: List[int], + total: List[int], + sys_len: int, + ref_len: int, + smooth_method='none', + smooth_value=SMOOTH_VALUE_DEFAULT, + use_effective_order=False, +) -> BLEU: """Computes BLEU score from its sufficient statistics. Adds smoothing. Smoothing methods (citing "A Systematic Comparison of Smoothing @@ -1888,7 +1293,7 @@ def compute_bleu(correct: List[int], precisions = [0 for x in range(NGRAM_ORDER)] - smooth_mteval = 1. + smooth_mteval = 1.0 effective_order = NGRAM_ORDER for n in range(NGRAM_ORDER): if smooth_method == 'add-k' and n > 1: @@ -1903,11 +1308,11 @@ def compute_bleu(correct: List[int], if correct[n] == 0: if smooth_method == 'exp': smooth_mteval *= 2 - precisions[n] = 100. / (smooth_mteval * total[n]) + precisions[n] = 100.0 / (smooth_mteval * total[n]) elif smooth_method == 'floor': - precisions[n] = 100. * smooth_value / total[n] + precisions[n] = 100.0 * smooth_value / total[n] else: - precisions[n] = 100. * correct[n] / total[n] + precisions[n] = 100.0 * correct[n] / total[n] # If the system guesses no i-grams, 1 <= i <= NGRAM_ORDER, the BLEU score # is 0 (technically undefined). 
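download_test_set, continued above, verifies each downloaded tarball against the 'md5' list in its DATASETS entry and aborts on a mismatch. The same check sketched as a helper; hashing in fixed-size chunks is an assumption here, since the exact read pattern sits outside the hunk:

    import hashlib

    def md5_matches(path: str, expected: str) -> bool:
        # Stream the tarball so large downloads are never held in memory at once.
        md5 = hashlib.md5()
        with open(path, 'rb') as infile:
            for chunk in iter(lambda: infile.read(1 << 20), b''):
                md5.update(chunk)
        return md5.hexdigest() == expected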
This is a problem for sentence-level BLEU @@ -1918,21 +1323,20 @@ def compute_bleu(correct: List[int], brevity_penalty = 1.0 if sys_len < ref_len: - brevity_penalty = math.exp(1 - - ref_len / sys_len) if sys_len > 0 else 0.0 + brevity_penalty = math.exp(1 - ref_len / sys_len) if sys_len > 0 else 0.0 - bleu = brevity_penalty * math.exp( - sum(map(my_log, precisions[:effective_order])) / effective_order) + bleu = brevity_penalty * math.exp(sum(map(my_log, precisions[:effective_order])) / effective_order) - return BLEU._make( - [bleu, correct, total, precisions, brevity_penalty, sys_len, ref_len]) + return BLEU._make([bleu, correct, total, precisions, brevity_penalty, sys_len, ref_len]) -def sentence_bleu(hypothesis: str, - reference: str, - smooth_method: str = 'floor', - smooth_value: float = SMOOTH_VALUE_DEFAULT, - use_effective_order: bool = True): +def sentence_bleu( + hypothesis: str, + reference: str, + smooth_method: str = 'floor', + smooth_value: float = SMOOTH_VALUE_DEFAULT, + use_effective_order: bool = True, +): """ Computes BLEU on a single sentence pair. @@ -1944,22 +1348,26 @@ def sentence_bleu(hypothesis: str, :param use_effective_order: Account for references that are shorter than the largest n-gram. :return: Returns a single BLEU score as a float. """ - bleu = corpus_bleu(hypothesis, - reference, - smooth_method=smooth_method, - smooth_value=smooth_value, - use_effective_order=use_effective_order) + bleu = corpus_bleu( + hypothesis, + reference, + smooth_method=smooth_method, + smooth_value=smooth_value, + use_effective_order=use_effective_order, + ) return bleu.score -def corpus_bleu(sys_stream: Union[str, Iterable[str]], - ref_streams: Union[str, List[Iterable[str]]], - smooth_method='exp', - smooth_value=SMOOTH_VALUE_DEFAULT, - force=False, - lowercase=False, - tokenize=DEFAULT_TOKENIZER, - use_effective_order=False) -> BLEU: +def corpus_bleu( + sys_stream: Union[str, Iterable[str]], + ref_streams: Union[str, List[Iterable[str]]], + smooth_method='exp', + smooth_value=SMOOTH_VALUE_DEFAULT, + force=False, + lowercase=False, + tokenize=DEFAULT_TOKENIZER, + use_effective_order=False, +) -> BLEU: """Produces BLEU scores along with its sufficient statistics from a source against one or more references. @@ -1990,22 +1398,18 @@ def corpus_bleu(sys_stream: Union[str, Iterable[str]], fhs = [sys_stream] + ref_streams for lines in zip_longest(*fhs): if None in lines: - raise EOFError( - "Source and reference streams have different lengths!") + raise EOFError("Source and reference streams have different lengths!") if lowercase: lines = [x.lower() for x in lines] - if not (force - or tokenize == 'none') and lines[0].rstrip().endswith(' .'): + if not (force or tokenize == 'none') and lines[0].rstrip().endswith(' .'): tokenized_count += 1 if tokenized_count == 100: + logging.warning('That\'s 100 lines that end in a tokenized period (\'.\')') logging.warning( - 'That\'s 100 lines that end in a tokenized period (\'.\')') - logging.warning( - 'It looks like you forgot to detokenize your test data, ' - 'which may hurt your score. ' + 'It looks like you forgot to detokenize your test data, ' 'which may hurt your score. 
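compute_bleu's arithmetic is untouched by the reflow: each n-gram precision is p_n = 100 * correct[n] / total[n], zero counts are smoothed per the chosen method, and the score is BP * exp(mean of log p_n), with BP = exp(1 - ref_len / sys_len) when the hypothesis is shorter than the reference. A worked, unsmoothed sketch on made-up sufficient statistics:

    import math

    def bleu_from_stats(correct, total, sys_len, ref_len):
        # BLEU = BP * exp(mean(log p_n)); no smoothing, so every correct[n] > 0.
        precisions = [100.0 * c / t for c, t in zip(correct, total)]
        bp = math.exp(1 - ref_len / sys_len) if sys_len < ref_len else 1.0
        return bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))

    # Toy statistics, not from any real corpus:
    print(round(bleu_from_stats([9, 7, 5, 3], [10, 9, 8, 7], sys_len=10, ref_len=11), 2))  # ~59.5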
' ) logging.warning( 'If you insist your data is detokenized, or don\'t care, ' @@ -2025,17 +1429,18 @@ def corpus_bleu(sys_stream: Union[str, Iterable[str]], correct[n - 1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0)) total[n - 1] += sys_ngrams[ngram] - return compute_bleu(correct, - total, - sys_len, - ref_len, - smooth_method=smooth_method, - smooth_value=smooth_value, - use_effective_order=use_effective_order) + return compute_bleu( + correct, + total, + sys_len, + ref_len, + smooth_method=smooth_method, + smooth_value=smooth_value, + use_effective_order=use_effective_order, + ) -def raw_corpus_bleu(sys_stream, ref_streams, - smooth_value=SMOOTH_VALUE_DEFAULT) -> BLEU: +def raw_corpus_bleu(sys_stream, ref_streams, smooth_value=SMOOTH_VALUE_DEFAULT) -> BLEU: """Convenience function that wraps corpus_bleu(). This is convenient if you're using sacrebleu as a library, say for scoring on dev. It uses no tokenization and 'floor' smoothing, with the floor default to 0 (no @@ -2045,13 +1450,15 @@ def raw_corpus_bleu(sys_stream, ref_streams, ref_streams: a list of one or more reference streams (each a sequence of segments) """ - return corpus_bleu(sys_stream, - ref_streams, - smooth_method='floor', - smooth_value=smooth_value, - force=True, - tokenize='none', - use_effective_order=True) + return corpus_bleu( + sys_stream, + ref_streams, + smooth_method='floor', + smooth_value=smooth_value, + force=True, + tokenize='none', + use_effective_order=True, + ) def delete_whitespace(text: str) -> str: @@ -2061,14 +1468,11 @@ def delete_whitespace(text: str) -> str: return re.sub(r'\s+', '', text).strip() -def get_sentence_statistics(hypothesis: str, - reference: str, - order: int = CHRF_ORDER, - remove_whitespace: bool = True) -> List[float]: - hypothesis = delete_whitespace( - hypothesis) if remove_whitespace else hypothesis - reference = delete_whitespace( - reference) if remove_whitespace else reference +def get_sentence_statistics( + hypothesis: str, reference: str, order: int = CHRF_ORDER, remove_whitespace: bool = True, +) -> List[float]: + hypothesis = delete_whitespace(hypothesis) if remove_whitespace else hypothesis + reference = delete_whitespace(reference) if remove_whitespace else reference statistics = [0] * (order * 3) for i in range(order): n = i + 1 @@ -2081,24 +1485,18 @@ def get_sentence_statistics(hypothesis: str, return statistics -def get_corpus_statistics(hypotheses: Iterable[str], - references: Iterable[str], - order: int = CHRF_ORDER, - remove_whitespace: bool = True) -> List[float]: +def get_corpus_statistics( + hypotheses: Iterable[str], references: Iterable[str], order: int = CHRF_ORDER, remove_whitespace: bool = True, +) -> List[float]: corpus_statistics = [0] * (order * 3) for hypothesis, reference in zip(hypotheses, references): - statistics = get_sentence_statistics( - hypothesis, - reference, - order=order, - remove_whitespace=remove_whitespace) + statistics = get_sentence_statistics(hypothesis, reference, order=order, remove_whitespace=remove_whitespace,) for i in range(len(statistics)): corpus_statistics[i] += statistics[i] return corpus_statistics -def _avg_precision_and_recall(statistics: List[float], - order: int) -> Tuple[float, float]: +def _avg_precision_and_recall(statistics: List[float], order: int) -> Tuple[float, float]: avg_precision = 0.0 avg_recall = 0.0 effective_order = 0 @@ -2121,16 +1519,17 @@ def _chrf(avg_precision, avg_recall, beta: int = CHRF_BETA) -> float: if avg_precision + avg_recall == 0: return 0.0 beta_square = beta ** 2 - score = (1 
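For library use the reformatted entry points behave exactly as before; corpus_bleu returns the BLEU namedtuple, and raw_corpus_bleu is the no-tokenization, floor-smoothed convenience wrapper. A hypothetical call (the import path assumes NeMo's vendored copy of this module):

    # Illustrative only; adjust the import to wherever this module lives.
    from nemo.collections.nlp.utils.metrics.sacrebleu import corpus_bleu, raw_corpus_bleu

    hyps = ['the cat sat on the mat']
    refs = [['the cat sat on the mat']]       # one reference stream
    print(corpus_bleu(hyps, refs).score)      # 100.0 on an exact match
    print(raw_corpus_bleu(hyps, refs).score)  # likewise, but with tokenize='none'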
+ beta_square) * (avg_precision * avg_recall) / ( - (beta_square * avg_precision) + avg_recall) + score = (1 + beta_square) * (avg_precision * avg_recall) / ((beta_square * avg_precision) + avg_recall) return score -def corpus_chrf(hypotheses: Iterable[str], - references: Iterable[str], - order: int = CHRF_ORDER, - beta: float = CHRF_BETA, - remove_whitespace: bool = True) -> float: +def corpus_chrf( + hypotheses: Iterable[str], + references: Iterable[str], + order: int = CHRF_ORDER, + beta: float = CHRF_BETA, + remove_whitespace: bool = True, +) -> float: """ Computes Chrf on a corpus. @@ -2141,20 +1540,15 @@ def corpus_chrf(hypotheses: Iterable[str], same importance. :return: Chrf score. """ corpus_statistics = get_corpus_statistics( - hypotheses, - references, - order=order, - remove_whitespace=remove_whitespace) - avg_precision, avg_recall = _avg_precision_and_recall( - corpus_statistics, order) + hypotheses, references, order=order, remove_whitespace=remove_whitespace, + ) + avg_precision, avg_recall = _avg_precision_and_recall(corpus_statistics, order) return _chrf(avg_precision, avg_recall, beta=beta) -def sentence_chrf(hypothesis: str, - reference: str, - order: int = CHRF_ORDER, - beta: float = CHRF_BETA, - remove_whitespace: bool = True) -> float: +def sentence_chrf( + hypothesis: str, reference: str, order: int = CHRF_ORDER, beta: float = CHRF_BETA, remove_whitespace: bool = True, +) -> float: """ Computes ChrF on a single sentence pair. @@ -2164,10 +1558,7 @@ def sentence_chrf(hypothesis: str, Defines importance of recall w.r.t precision. If beta=1, same importance. :return: Chrf score. """ - statistics = get_sentence_statistics(hypothesis, - reference, - order=order, - remove_whitespace=remove_whitespace) + statistics = get_sentence_statistics(hypothesis, reference, order=order, remove_whitespace=remove_whitespace) avg_precision, avg_recall = _avg_precision_and_recall(statistics, order) return _chrf(avg_precision, avg_recall, beta=beta) @@ -2175,146 +1566,116 @@ def sentence_chrf(hypothesis: str, def main(): arg_parser = argparse.ArgumentParser( description='sacreBLEU: Hassle-free computation of shareable BLEU ' - 'scores. Quick usage: score your detokenized output ' - 'against WMT\'14 EN-DE: ' - ' cat output.detok.de | ./sacreBLEU -t wmt14 -l en-de') - arg_parser.add_argument('--test-set', - '-t', - type=str, - default=None, - choices=DATASETS.keys(), - help='the test set to use') + 'scores. Quick usage: score your detokenized output ' + 'against WMT\'14 EN-DE: ' + ' cat output.detok.de | ./sacreBLEU -t wmt14 -l en-de' + ) arg_parser.add_argument( - '-lc', - action='store_true', - default=False, - help='use case-insensitive BLEU (default: actual case)') + '--test-set', '-t', type=str, default=None, choices=DATASETS.keys(), help='the test set to use', + ) + arg_parser.add_argument( + '-lc', action='store_true', default=False, help='use case-insensitive BLEU (default: actual case)', + ) arg_parser.add_argument( '--smooth', '-s', choices=['exp', 'floor', 'add-n', 'none'], default='exp', help='smoothing method: exponential decay (default), floor (increment ' - 'zero counts), add-k (increment num/denom by k for n>1), or none ' + 'zero counts), add-k (increment num/denom by k for n>1), or none ', ) arg_parser.add_argument( '--smooth-value', '-sv', type=float, default=SMOOTH_VALUE_DEFAULT, - help='The value to pass to the smoothing technique, when relevant. ' - 'Default: %(default)s. ' + help='The value to pass to the smoothing technique, when relevant. ' 'Default: %(default)s. 
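_chrf, reflowed above, is a plain F-beta combination of the averaged character n-gram precision and recall; beta**2 decides how much more recall counts than precision. A standalone check with an illustrative beta of 2:

    def fbeta(precision: float, recall: float, beta: float = 2.0) -> float:
        # Same expression as _chrf above; beta**2 weights recall over precision.
        if precision + recall == 0:
            return 0.0
        beta_square = beta ** 2
        return (1 + beta_square) * precision * recall / ((beta_square * precision) + recall)

    print(round(fbeta(0.7, 0.8), 3))  # 0.778 -- pulled toward recall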
', + ) + arg_parser.add_argument( + '--tokenize', '-tok', choices=TOKENIZERS.keys(), default=None, help='tokenization method to use', ) - arg_parser.add_argument('--tokenize', - '-tok', - choices=TOKENIZERS.keys(), - default=None, - help='tokenization method to use') arg_parser.add_argument( '--language-pair', '-l', dest='langpair', default=None, - help='source-target language pair (2-char ISO639-1 codes)') - arg_parser.add_argument('--download', - type=str, - default=None, - help='download a test set and quit') + help='source-target language pair (2-char ISO639-1 codes)', + ) + arg_parser.add_argument( + '--download', type=str, default=None, help='download a test set and quit', + ) arg_parser.add_argument( '--echo', choices=['src', 'ref', 'both'], type=str, default=None, - help='output the source (src), reference (ref), or both (both, ' - 'pasted) to STDOUT and quit ' + help='output the source (src), reference (ref), or both (both, ' 'pasted) to STDOUT and quit ', + ) + arg_parser.add_argument( + '--input', '-i', type=str, default='-', help='Read input from a file instead of STDIN', ) - arg_parser.add_argument('--input', - '-i', - type=str, - default='-', - help='Read input from a file instead of STDIN') arg_parser.add_argument( 'refs', nargs='*', default=[], - help='optional list of references (for backwards-compatibility with ' - 'older scripts) ' + help='optional list of references (for backwards-compatibility with ' 'older scripts) ', + ) + arg_parser.add_argument( + '--metrics', + '-m', + choices=['bleu', 'chrf'], + nargs='+', + default=['bleu'], + help='metrics to compute (default: bleu)', + ) + arg_parser.add_argument( + '--chrf-order', type=int, default=CHRF_ORDER, help='chrf character order (default: %(default)s)', + ) + arg_parser.add_argument( + '--chrf-beta', type=int, default=CHRF_BETA, help='chrf BETA parameter (default: %(default)s)', ) - arg_parser.add_argument('--metrics', - '-m', - choices=['bleu', 'chrf'], - nargs='+', - default=['bleu'], - help='metrics to compute (default: bleu)') - arg_parser.add_argument('--chrf-order', - type=int, - default=CHRF_ORDER, - help='chrf character order (default: %(default)s)') - arg_parser.add_argument('--chrf-beta', - type=int, - default=CHRF_BETA, - help='chrf BETA parameter (default: %(default)s)') arg_parser.add_argument( '--chrf-whitespace', action='store_true', default=False, - help='include whitespace in chrF calculation (default: %(default)s)') + help='include whitespace in chrF calculation (default: %(default)s)', + ) arg_parser.add_argument( - '--short', - default=False, - action='store_true', - help='produce a shorter (less human readable) signature') - arg_parser.add_argument('--score-only', - '-b', - default=False, - action='store_true', - help='output only the BLEU score') + '--short', default=False, action='store_true', help='produce a shorter (less human readable) signature', + ) arg_parser.add_argument( - '--force', - default=False, - action='store_true', - help='insist that your tokenized input is actually detokenized') - arg_parser.add_argument('--quiet', - '-q', - default=False, - action='store_true', - help='suppress informative output') + '--score-only', '-b', default=False, action='store_true', help='output only the BLEU score', + ) + arg_parser.add_argument( + '--force', default=False, action='store_true', help='insist that your tokenized input is actually detokenized', + ) + arg_parser.add_argument( + '--quiet', '-q', default=False, action='store_true', help='suppress informative output', + ) 
arg_parser.add_argument( '--encoding', '-e', type=str, default='utf-8', - help='open text files with specified encoding (default: %(default)s)') - arg_parser.add_argument('--citation', - '--cite', - default=False, - action='store_true', - help='dump the bibtex citation and quit.') - arg_parser.add_argument('--width', - '-w', - type=int, - default=1, - help='floating point width (default: %(default)s)') - arg_parser.add_argument('-V', - '--version', - action='version', - version='%(prog)s {}'.format(VERSION)) + help='open text files with specified encoding (default: %(default)s)', + ) + arg_parser.add_argument( + '--citation', '--cite', default=False, action='store_true', help='dump the bibtex citation and quit.', + ) + arg_parser.add_argument( + '--width', '-w', type=int, default=1, help='floating point width (default: %(default)s)', + ) + arg_parser.add_argument( + '-V', '--version', action='version', version='%(prog)s {}'.format(VERSION), + ) args = arg_parser.parse_args() # Explicitly set the encoding - sys.stdin = open(sys.stdin.fileno(), - mode='r', - encoding='utf-8', - buffering=True, - newline="\n") - sys.stdout = open(sys.stdout.fileno(), - mode='w', - encoding='utf-8', - buffering=True) + sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True, newline="\n",) + sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True) if not args.quiet: - logging.basicConfig(level=logging.INFO, - format='sacreBLEU: %(message)s') + logging.basicConfig(level=logging.INFO, format='sacreBLEU: %(message)s') if args.download: download_test_set(args.download, args.langpair) @@ -2334,43 +1695,36 @@ def main(): if args.test_set is not None and args.test_set not in DATASETS: logging.error('The available test sets are: ') for testset in sorted(DATASETS.keys(), reverse=True): - logging.error(' %s: %s', testset, - DATASETS[testset].get('description', '')) + logging.error(' %s: %s', testset, DATASETS[testset].get('description', '')) sys.exit(1) - if args.test_set and (args.langpair is None - or args.langpair not in DATASETS[args.test_set]): + if args.test_set and (args.langpair is None or args.langpair not in DATASETS[args.test_set]): if args.langpair is None: logging.error('I need a language pair (-l).') elif args.langpair not in DATASETS[args.test_set]: logging.error('No such language pair "%s"', args.langpair) logging.error( - 'Available language pairs for test set "%s": %s', args.test_set, - ', '.join( - filter(lambda x: '-' in x, DATASETS[args.test_set].keys()))) + 'Available language pairs for test set "%s": %s', + args.test_set, + ', '.join(filter(lambda x: '-' in x, DATASETS[args.test_set].keys())), + ) sys.exit(1) if args.echo: if args.langpair is None or args.test_set is None: - logging.warning( - "--echo requires a test set (--t) and a language pair (-l)") + logging.warning("--echo requires a test set (--t) and a language pair (-l)") sys.exit(1) print_test_set(args.test_set, args.langpair, args.echo) sys.exit(0) if args.test_set is None and len(args.refs) == 0: - logging.error( - 'I need either a predefined test set (-t) or a list of references') + logging.error('I need either a predefined test set (-t) or a list of references') logging.error('The available test sets are: ') for testset in sorted(DATASETS.keys(), reverse=True): - logging.error(' %s: %s', testset, - DATASETS[testset].get('description', '')) + logging.error(' %s: %s', testset, DATASETS[testset].get('description', '')) sys.exit(1) elif args.test_set is not None and len(args.refs) > 0: - 
logging.error( - 'I need exactly one of (a) a predefined test set (-t) or (b) a ' - 'list of references ' - ) + logging.error('I need exactly one of (a) a predefined test set (-t) or (b) a ' 'list of references ') sys.exit(1) if args.test_set is not None and args.tokenize == 'none': @@ -2378,7 +1732,8 @@ def main(): "You are turning off sacrebleu's internal tokenization (" "'--tokenize none'), presumably to supply\n " "your own reference tokenization. Published numbers will not be " - "comparable with other papers.\n") + "comparable with other papers.\n" + ) # Internal tokenizer settings. Set to 'zh' for Chinese DEFAULT_TOKENIZER ( if args.tokenize is None: @@ -2388,24 +1743,27 @@ def main(): else: args.tokenize = DEFAULT_TOKENIZER - if args.langpair is not None and args.langpair.split('-')[ - 1] == 'zh' and 'bleu' in args.metrics and args.tokenize != 'zh': - logging.warning( - 'You should also pass "--tok zh" when scoring Chinese...') + if ( + args.langpair is not None + and args.langpair.split('-')[1] == 'zh' + and 'bleu' in args.metrics + and args.tokenize != 'zh' + ): + logging.warning('You should also pass "--tok zh" when scoring Chinese...') if args.test_set: _, *refs = download_test_set(args.test_set, args.langpair) if len(refs) == 0: - print('No references found for test set {}/{}.'.format( - args.test_set, args.langpair)) + print('No references found for test set {}/{}.'.format(args.test_set, args.langpair)) sys.exit(1) else: refs = args.refs - inputfh = io.TextIOWrapper( - sys.stdin.buffer, - encoding=args.encoding) if args.input == '-' else smart_open( - args.input, encoding=args.encoding) + inputfh = ( + io.TextIOWrapper(sys.stdin.buffer, encoding=args.encoding) + if args.input == '-' + else smart_open(args.input, encoding=args.encoding) + ) system = inputfh.readlines() # Read references @@ -2413,22 +1771,25 @@ def main(): try: if 'bleu' in args.metrics: - bleu = corpus_bleu(system, - refs, - smooth_method=args.smooth, - smooth_value=args.smooth_value, - force=args.force, - lowercase=args.lc, - tokenize=args.tokenize) + bleu = corpus_bleu( + system, + refs, + smooth_method=args.smooth, + smooth_value=args.smooth_value, + force=args.force, + lowercase=args.lc, + tokenize=args.tokenize, + ) if 'chrf' in args.metrics: - chrf = corpus_chrf(system, - refs[0], - beta=args.chrf_beta, - order=args.chrf_order, - remove_whitespace=not args.chrf_whitespace) + chrf = corpus_chrf( + system, + refs[0], + beta=args.chrf_beta, + order=args.chrf_order, + remove_whitespace=not args.chrf_whitespace, + ) except EOFError: - logging.error( - 'The input and reference stream(s) were of different lengths.\n') + logging.error('The input and reference stream(s) were of different lengths.\n') if args.test_set is not None: logging.error( 'This could be a problem with your system output or with ' @@ -2441,7 +1802,8 @@ def main(): 'They will be downloaded automatically again the next time ' 'you run sacreBLEU.', SACREBLEU_DIR, - args.test_set) + args.test_set, + ) sys.exit(1) width = args.width @@ -2451,16 +1813,14 @@ def main(): print('{0:.{1}f}'.format(bleu.score, width)) else: version_str = bleu_signature(args, len(refs)) - print( - bleu.format(width).replace('BLEU', 'BLEU+' + version_str)) + print(bleu.format(width).replace('BLEU', 'BLEU+' + version_str)) elif metric == 'chrf': if args.score_only: print('{0:.{1}f}'.format(chrf, width)) else: version_str = chrf_signature(args, len(refs)) - print('chrF{0:d}+{1} = {2:.{3}f}'.format( - args.chrf_beta, version_str, chrf, width)) + print('chrF{0:d}+{1} = 
{2:.{3}f}'.format(args.chrf_beta, version_str, chrf, width)) if __name__ == '__main__': diff --git a/nemo/collections/nlp/utils/metrics/squad_metrics.py b/nemo/collections/nlp/utils/metrics/squad_metrics.py index 081c73ed6613..13eb29de1931 100644 --- a/nemo/collections/nlp/utils/metrics/squad_metrics.py +++ b/nemo/collections/nlp/utils/metrics/squad_metrics.py @@ -25,8 +25,7 @@ def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), - key=lambda x: x[1], reverse=True) + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) best_indexes = [] for i in range(len(index_and_score)): @@ -68,8 +67,7 @@ def get_tokens(s): def f1_score(prediction, ground_truth): prediction_tokens = get_tokens(prediction) ground_truth_tokens = get_tokens(ground_truth) - common = collections.Counter(prediction_tokens) & \ - collections.Counter(ground_truth_tokens) + common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens) num_same = sum(common.values()) if len(ground_truth_tokens) == 0 or len(prediction_tokens) == 0: # If either is no-answer, then F1 is 1 if they agree, 0 otherwise @@ -165,8 +163,9 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - print("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) + print( + "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, + ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -197,7 +196,7 @@ def _strip_spaces(text): print("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text @@ -226,8 +225,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None): total = len(qid_list) return collections.OrderedDict( [ - ("exact", 100.0 * - sum(exact_scores[k] for k in qid_list) / total), + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total,), ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), ("total", total), ] @@ -239,12 +237,9 @@ def merge_eval(main_eval, new_eval, prefix): main_eval["%s_%s" % (prefix, k)] = new_eval[k] -def find_all_best_thresh(main_eval, preds, exact_raw, - f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh = \ - find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh = \ - find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) main_eval["best_exact"] = best_exact main_eval["best_exact_thresh"] = exact_thresh diff --git a/nemo/collections/nlp/utils/nlp_utils.py b/nemo/collections/nlp/utils/nlp_utils.py index 056685970475..1b1ef57bb27a 100644 --- a/nemo/collections/nlp/utils/nlp_utils.py +++ b/nemo/collections/nlp/utils/nlp_utils.py @@ -1,11 +1,12 @@ import os import time -from matplotlib import pyplot as plt -import nemo import numpy as np +from matplotlib import pyplot as plt from sklearn.metrics import confusion_matrix +import nemo + def _is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: @@ -14,18 +15,13 @@ def _is_whitespace(c): def mask_padded_tokens(tokens, pad_id): 
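In the squad_metrics.py hunk above, f1_score's Counter intersection now fits on one line; the metric itself is token-overlap F1 between prediction and ground truth. A compact sketch (the get_tokens normalization is omitted for brevity):

    import collections

    def token_f1(prediction: str, ground_truth: str) -> float:
        # Multiset intersection of tokens, as in the f1_score hunk above.
        pred_tokens = prediction.split()
        gt_tokens = ground_truth.split()
        common = collections.Counter(pred_tokens) & collections.Counter(gt_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(gt_tokens)
        return 2 * precision * recall / (precision + recall)

    print(token_f1('in may 1921', 'may 1921'))  # 0.8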
- mask = (tokens != pad_id) + mask = tokens != pad_id return mask -def read_intent_slot_outputs(queries, - intent_file, - slot_file, - intent_logits, - slot_logits, - slot_masks, - intents=None, - slots=None): +def read_intent_slot_outputs( + queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None, +): intent_dict = get_vocab(intent_file) slot_dict = get_vocab(slot_file) pred_intents = np.argmax(intent_logits, 1) @@ -36,8 +32,7 @@ def read_intent_slot_outputs(queries, pred = pred_intents[i] nemo.logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') if intents is not None: - nemo.logging.info( - f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') + nemo.logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') pred_slot = pred_slots[i][slot_masks[i]] tokens = query.strip().split() @@ -83,12 +78,7 @@ def write_vocab_in_order(vocab, outfile): f.write(f'{vocab[key]}\n') -def plot_confusion_matrix(label_ids, - labels, - preds, - graph_fold, - normalize=False, - prefix=''): +def plot_confusion_matrix(label_ids, labels, preds, graph_fold, normalize=False, prefix=''): ''' Plot confusion matrix. Args: @@ -102,8 +92,7 @@ def plot_confusion_matrix(label_ids, ''' # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) - label_ids = \ - {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} + label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} ids_to_labels = {label_ids[k]: k for k in label_ids} classes = [ids_to_labels[id] for id in sorted(label_ids.values())] @@ -131,5 +120,4 @@ def plot_confusion_matrix(label_ids, fig.colorbar(cax) title = (prefix + ' ' + title).strip() - plt.savefig(os.path.join(graph_fold, - title + '_' + time.strftime('%Y%m%d-%H%M%S'))) + plt.savefig(os.path.join(graph_fold, title + '_' + time.strftime('%Y%m%d-%H%M%S'))) diff --git a/nemo/collections/simple_gan/__init__.py b/nemo/collections/simple_gan/__init__.py index 88b7b133000e..af1a5bb535e6 100644 --- a/nemo/collections/simple_gan/__init__.py +++ b/nemo/collections/simple_gan/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 # =============================================================================
-from .gan import *
-
 from nemo.core import Backend

+from .gan import *
+
 backend = Backend.PyTorch
diff --git a/nemo/collections/simple_gan/gan.py b/nemo/collections/simple_gan/gan.py
index 12be01804f33..47cf4f49121b 100644
--- a/nemo/collections/simple_gan/gan.py
+++ b/nemo/collections/simple_gan/gan.py
@@ -2,12 +2,10 @@
 """A collection of Neural Modules to be used for training a WGAN-GP on MNIST"""
 import torch
 from torch.utils.data import Dataset
-from torchvision import transforms, datasets
+from torchvision import datasets, transforms

-from nemo.backends.pytorch.nm import TrainableNM, NonTrainableNM, LossNM,\
-    DataLayerNM
-from nemo.core import NeuralType, BatchTag, ChannelTag, HeightTag, WidthTag,\
-    AxisType, DeviceType
+from nemo.backends.pytorch.nm import DataLayerNM, LossNM, NonTrainableNM, TrainableNM
+from nemo.core import AxisType, BatchTag, ChannelTag, DeviceType, HeightTag, NeuralType, WidthTag


 class SimpleDiscriminator(TrainableNM):
@@ -29,10 +27,14 @@ def input_ports(self):
             3: AxisType(WidthTag, 28)
         """
         return {
-            "image": NeuralType({0: AxisType(BatchTag),
-                                 1: AxisType(ChannelTag),
-                                 2: AxisType(HeightTag, 28),
-                                 3: AxisType(WidthTag, 28)})
+            "image": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, 28),
+                    3: AxisType(WidthTag, 28),
+                }
+            )
         }

     @property
@@ -44,10 +46,7 @@ def output_ports(self):
             1: AxisType(ChannelTag, 1)
         """
-        return {
-            "decision": NeuralType({0: AxisType(BatchTag),
-                                    1: AxisType(ChannelTag, 1)})
-        }
+        return {"decision": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag, 1)})}

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -61,12 +60,12 @@ def __init__(self, **kwargs):
             torch.nn.Conv2d(128, 256, 3, stride=2, padding=1),
             torch.nn.ReLU(),
         )
-        self.fc_layer = torch.nn.Linear(256*4*4, 1)
+        self.fc_layer = torch.nn.Linear(256 * 4 * 4, 1)
         self.to(self._device)

     def forward(self, image):
         decision = self.layers(image)
-        decision = decision.view(-1, 256*4*4)
+        decision = decision.view(-1, 256 * 4 * 4)
         decision = self.fc_layer(decision)
         return decision

@@ -90,11 +89,14 @@ def input_ports(self):
             3: AxisType(WidthTag, 4)
         """
         return {
-            "latents": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag, 64),
-                2: AxisType(HeightTag, 4),
-                3: AxisType(WidthTag, 4)})
+            "latents": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag, 64),
+                    2: AxisType(HeightTag, 4),
+                    3: AxisType(WidthTag, 4),
+                }
+            )
         }

     @property
@@ -111,11 +113,14 @@ def output_ports(self):
             3: AxisType(WidthTag, 28)
         """
         return {
-            "image": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag),
-                2: AxisType(HeightTag, 28),
-                3: AxisType(WidthTag, 28)})
+            "image": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, 28),
+                    3: AxisType(WidthTag, 28),
+                }
+            )
         }

     def __init__(self, **kwargs):
@@ -157,9 +162,7 @@ def input_ports(self):
             1: AxisType(ChannelTag, 1)
         """
         return {
-            "decision": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag, 1)}),
+            "decision": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag, 1)}),
         }

     @property
@@ -169,9 +172,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}

     def __init__(self, neg=False, **kwargs):
         super().__init__(**kwargs)
@@ -212,14 +213,15 @@ def input_ports(self):
             1: AxisType(ChannelTag, 1)
         """
         return {
-            "interpolated_image": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag),
-                2: AxisType(HeightTag, 28),
-                3: AxisType(WidthTag, 28)}),
-            "interpolated_decision": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag, 1)}),
+            "interpolated_image": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, 28),
+                    3: AxisType(WidthTag, 28),
+                }
+            ),
+            "interpolated_decision": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag, 1)}),
         }

     @property
@@ -229,28 +231,29 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}

     def __init__(self, lambda_, **kwargs):
         super().__init__(**kwargs)
         self.lambda_ = lambda_

     def _loss(self, interpolated_image, interpolated_decision):
-        grad_outputs = torch.ones(
-            interpolated_decision.size(), dtype=interpolated_image.dtype)
+        grad_outputs = torch.ones(interpolated_decision.size(), dtype=interpolated_image.dtype)
         if self.placement != DeviceType.CPU:
             grad_outputs = grad_outputs.cuda()

         gradients = torch.autograd.grad(
-            outputs=interpolated_decision, inputs=interpolated_image,
+            outputs=interpolated_decision,
+            inputs=interpolated_image,
             grad_outputs=grad_outputs,
-            create_graph=True, retain_graph=True, only_inputs=True)[0]
+            create_graph=True,
+            retain_graph=True,
+            only_inputs=True,
+        )[0]
         gradients = gradients.view(gradients.size(0), -1)
         gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
-        return self.lambda_*gradient_penalty
+        return self.lambda_ * gradient_penalty

     def _loss_function(self, **kwargs):
         return self._loss(**kwargs)
@@ -283,16 +286,22 @@ def input_ports(self):
             3: AxisType(WidthTag, 28)
         """
         return {
-            "image1": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag),
-                2: AxisType(HeightTag, 28),
-                3: AxisType(WidthTag, 28)}),
-            "image2": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag),
-                2: AxisType(HeightTag, 28),
-                3: AxisType(WidthTag, 28)})
+            "image1": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, 28),
+                    3: AxisType(WidthTag, 28),
+                }
+            ),
+            "image2": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, 28),
+                    3: AxisType(WidthTag, 28),
+                }
+            ),
         }

     @property
@@ -309,11 +318,14 @@ def output_ports(self):
             3: AxisType(WidthTag, 28)
         """
         return {
-            "interpolated_image": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag),
-                2: AxisType(HeightTag, 28),
-                3: AxisType(WidthTag, 28)})
+            "interpolated_image": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, 28),
+                    3: AxisType(WidthTag, 28),
+                }
+            )
         }

     def __init__(self, **kwargs):
@@ -350,18 +362,17 @@ def output_ports(self):
             3: AxisType(WidthTag, 4)
         """
         return {
-            "latent": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag, 64),
-                2: AxisType(HeightTag, 4),
-                3: AxisType(WidthTag, 4)})
+            "latent": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag, 64),
+                    2: AxisType(HeightTag, 4),
+                    3: AxisType(WidthTag, 4),
+                }
+            )
         }

-    def __init__(
-            self, *,
-            batch_size,
-            **kwargs
-    ):
+    def __init__(self, *, batch_size, **kwargs):
         DataLayerNM.__init__(self, **kwargs)

         self._batch_size = batch_size
@@ -374,7 +385,8 @@ def __getitem__(self, i):
                 return torch.randn(64, 4, 4)

             def __len__(self):
-                return self._batch_size*2
+                return self._batch_size * 2
+
         self._dataset = DummyDataset(batch_size)

     def __len__(self):
@@ -426,28 +438,26 @@ def output_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "latent": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag, 64),
-                2: AxisType(HeightTag, 4),
-                3: AxisType(WidthTag, 4)}),
-            "image": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(ChannelTag),
-                2: AxisType(HeightTag, self._input_size[1]),
-                3: AxisType(WidthTag, self._input_size[0])}),
-            "label": NeuralType({
-                0: AxisType(BatchTag)})
+            "latent": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag, 64),
+                    2: AxisType(HeightTag, 4),
+                    3: AxisType(WidthTag, 4),
+                }
+            ),
+            "image": NeuralType(
+                {
+                    0: AxisType(BatchTag),
+                    1: AxisType(ChannelTag),
+                    2: AxisType(HeightTag, self._input_size[1]),
+                    3: AxisType(WidthTag, self._input_size[0]),
+                }
+            ),
+            "label": NeuralType({0: AxisType(BatchTag)}),
         }

-    def __init__(
-            self, *,
-            batch_size,
-            root,
-            train=True,
-            shuffle=True,
-            **kwargs
-    ):
+    def __init__(self, *, batch_size, root, train=True, shuffle=True, **kwargs):
         self._input_size = (28, 28)

         DataLayerNM.__init__(self, **kwargs)
@@ -457,9 +467,7 @@ def __init__(
         self._root = root
         self._transforms = transforms.Compose([transforms.ToTensor()])

-        self._dataset = datasets.MNIST(root=self._root, train=self._train,
-                                       download=True,
-                                       transform=self._transforms)
+        self._dataset = datasets.MNIST(root=self._root, train=self._train, download=True, transform=self._transforms,)

         class DatasetWrapper(Dataset):
             def __init__(self, dataset):
@@ -473,6 +481,7 @@ def __getitem__(self, index):

             def __len__(self):
                 return self._dataset.__len__()
+
         self._dataset = DatasetWrapper(self._dataset)

     def __len__(self):
diff --git a/nemo/collections/tts/__init__.py b/nemo/collections/tts/__init__.py
index 9f2fe2d3d596..594588680a20 100644
--- a/nemo/collections/tts/__init__.py
+++ b/nemo/collections/tts/__init__.py
@@ -13,23 +13,15 @@
 # limitations under the License.
 # =============================================================================

-from nemo.core import Backend
-
+from nemo.collections.tts.data_layers import AudioDataLayer
+from nemo.collections.tts.parts.helpers import *
+from nemo.collections.tts.parts.helpers import __all__ as helpers__all__
 from nemo.collections.tts.tacotron2_modules import *
 from nemo.collections.tts.tacotron2_modules import __all__ as tacotron2__all__
-
 from nemo.collections.tts.waveglow_modules import *
 from nemo.collections.tts.waveglow_modules import __all__ as waveglow__all__
-
-from nemo.collections.tts.data_layers import AudioDataLayer
-
-from nemo.collections.tts.parts.helpers import *
-from nemo.collections.tts.parts.helpers import __all__ as helpers__all__
-
+from nemo.core import Backend

 backend = Backend.PyTorch

-__all__ = ["AudioDataLayer"] + \
-    helpers__all__ + \
-    tacotron2__all__ + \
-    waveglow__all__
+__all__ = ["AudioDataLayer"] + helpers__all__ + tacotron2__all__ + waveglow__all__
diff --git a/nemo/collections/tts/data_layers.py b/nemo/collections/tts/data_layers.py
index 820aadf17308..1826d580bfc0 100644
--- a/nemo/collections/tts/data_layers.py
+++ b/nemo/collections/tts/data_layers.py
@@ -1,9 +1,11 @@
 # Copyright (c) 2019 NVIDIA Corporation
 import torch
+
 import nemo
 from nemo.backends.pytorch.nm import DataLayerNM
 from nemo.core import DeviceType
 from nemo.core.neural_types import *
+
 from .parts.datasets import AudioOnlyDataset


@@ -57,9 +59,7 @@ def output_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "audio_signal": NeuralType(
-                {0: AxisType(BatchTag), 1: AxisType(TimeTag)}
-            ),
+            "audio_signal": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
             "a_sig_length": NeuralType({0: AxisType(BatchTag)}),
         }

@@ -90,9 +90,7 @@ def __init__(
         sampler = None
         if self._placement == DeviceType.AllGpu:
             nemo.logging.info('Parallelizing DATALAYER')
-            sampler = torch.utils.data.distributed.DistributedSampler(
-                self._dataset
-            )
+            sampler = torch.utils.data.distributed.DistributedSampler(self._dataset)

         self._dataloader = torch.utils.data.DataLoader(
             dataset=self._dataset,
diff --git a/nemo/collections/tts/parts/__init__.py b/nemo/collections/tts/parts/__init__.py
index c425d2db8b4e..1350837aada6 100644
--- a/nemo/collections/tts/parts/__init__.py
+++ b/nemo/collections/tts/parts/__init__.py
@@ -1,25 +1,29 @@
 from .datasets import AudioOnlyDataset
+from .helpers import (
+    tacotron2_eval_log_to_tb_func,
+    tacotron2_log_to_tb_func,
+    tacotron2_process_eval_batch,
+    tacotron2_process_final_eval,
+    waveglow_eval_log_to_tb_func,
+    waveglow_log_to_tb_func,
+    waveglow_process_eval_batch,
+)
 from .layers import get_mask_from_lengths
-from .tacotron2 import Encoder, Decoder, Postnet
+from .tacotron2 import Decoder, Encoder, Postnet
 from .waveglow import WaveGlow
-from .helpers import (waveglow_log_to_tb_func,
-                      waveglow_process_eval_batch,
-                      waveglow_eval_log_to_tb_func,
-                      tacotron2_log_to_tb_func,
-                      tacotron2_process_eval_batch,
-                      tacotron2_process_final_eval,
-                      tacotron2_eval_log_to_tb_func)

-__all__ = ['AudioOnlyDataset',
-           'get_mask_from_lengths',
-           'Encoder',
-           'Decoder',
-           'Postnet',
-           'WaveGlow',
-           'waveglow_log_to_tb_func',
-           'waveglow_process_eval_batch',
-           'waveglow_eval_log_to_tb_func',
-           'tacotron2_log_to_tb_func',
-           'tacotron2_process_eval_batch',
-           'tacotron2_process_final_eval',
-           'tacotron2_eval_log_to_tb_func']
+__all__ = [
+    'AudioOnlyDataset',
+    'get_mask_from_lengths',
+    'Encoder',
+    'Decoder',
+    'Postnet',
+    'WaveGlow',
+    'waveglow_log_to_tb_func',
+    'waveglow_process_eval_batch',
+    'waveglow_eval_log_to_tb_func',
+    'tacotron2_log_to_tb_func',
+    'tacotron2_process_eval_batch',
+    'tacotron2_process_final_eval',
+    'tacotron2_eval_log_to_tb_func',
+]
diff --git a/nemo/collections/tts/parts/datasets.py b/nemo/collections/tts/parts/datasets.py
index d2db0d6eeb37..63128d5f88d5 100644
--- a/nemo/collections/tts/parts/datasets.py
+++ b/nemo/collections/tts/parts/datasets.py
@@ -2,19 +2,13 @@
 import torch
 from torch.utils.data import Dataset

-from nemo.collections.asr.parts import collections
-from nemo.collections.asr.parts import parsers
+from nemo.collections.asr.parts import collections, parsers
 from nemo.collections.asr.parts.segment import AudioSegment


 class AudioOnlyDataset(Dataset):
     def __init__(
-        self,
-        manifest_filepath,
-        n_segments=0,
-        max_duration=None,
-        min_duration=None,
-        trim=False,
+        self, manifest_filepath, n_segments=0, max_duration=None, min_duration=None, trim=False,
     ):
         """See AudioDataLayer"""
         self.collection = collections.ASRAudioText(
@@ -43,9 +37,7 @@ def find_max_len(seq, index):
             else:
                 max_audio_len = find_max_len(batch, 0)

-            audio_signal = torch.zeros(
-                batch_size, max_audio_len, dtype=torch.float
-            )
+            audio_signal = torch.zeros(batch_size, max_audio_len, dtype=torch.float)
             audio_lengths = []
             for i, s in enumerate(batch):
                 audio_signal[i].narrow(0, 0, s[0].size(0)).copy_(s[0])
@@ -56,9 +48,7 @@ def find_max_len(seq, index):

     def __getitem__(self, index):
         example = self.collection[index]
-        features = AudioSegment.segment_from_file(
-            example.audio_file, n_segments=self.n_segments, trim=self.trim,
-        )
+        features = AudioSegment.segment_from_file(example.audio_file, n_segments=self.n_segments, trim=self.trim,)
         features = torch.tensor(features.samples, dtype=torch.float)
         f, fl = features, torch.tensor(features.shape[0]).long()
diff --git a/nemo/collections/tts/parts/helpers.py b/nemo/collections/tts/parts/helpers.py
index d3d613ddc7a7..3212b086bb6e 100644
--- a/nemo/collections/tts/parts/helpers.py
+++ b/nemo/collections/tts/parts/helpers.py
@@ -13,20 +13,22 @@
     "tacotron2_log_to_tb_func",
     "tacotron2_process_eval_batch",
     "tacotron2_process_final_eval",
-    "tacotron2_eval_log_to_tb_func"
+    "tacotron2_eval_log_to_tb_func",
 ]


-def waveglow_log_to_tb_func(swriter,
-                            tensors,
-                            step,
-                            tag="train",
-                            log_images=False,
-                            log_images_freq=1,
-                            n_fft=1024,
-                            hop_length=256,
-                            window="hann",
-                            mel_fb=None):
+def waveglow_log_to_tb_func(
+    swriter,
+    tensors,
+    step,
+    tag="train",
+    log_images=False,
+    log_images_freq=1,
+    n_fft=1024,
+    hop_length=256,
+    window="hann",
+    mel_fb=None,
+):
     loss, audio_pred, spec_target, mel_length = tensors
     if loss:
         swriter.add_scalar("loss", loss, step)
@@ -34,19 +36,25 @@ def waveglow_log_to_tb_func(swriter,
         mel_length = mel_length[0]
         spec_target = spec_target[0].data.cpu().numpy()[:, :mel_length]
         swriter.add_image(
-            f"{tag}_mel_target",
-            plot_spectrogram_to_numpy(spec_target),
-            step, dataformats="HWC")
+            f"{tag}_mel_target", plot_spectrogram_to_numpy(spec_target), step, dataformats="HWC",
+        )
         if mel_fb is not None:
-            mag, _ = librosa.core.magphase(librosa.core.stft(
-                np.nan_to_num(audio_pred[0].cpu().detach().numpy()),
-                n_fft=n_fft, hop_length=hop_length, window=window))
+            mag, _ = librosa.core.magphase(
+                librosa.core.stft(
+                    np.nan_to_num(audio_pred[0].cpu().detach().numpy()),
+                    n_fft=n_fft,
+                    hop_length=hop_length,
+                    window=window,
+                )
+            )
             mel_pred = np.matmul(mel_fb.cpu().numpy(), mag).squeeze()
             log_mel_pred = np.log(np.clip(mel_pred, a_min=1e-5, a_max=None))
             swriter.add_image(
                 f"{tag}_mel_predicted",
                 plot_spectrogram_to_numpy(log_mel_pred[:, :mel_length]),
-                step, dataformats="HWC")
+                step,
+                dataformats="HWC",
+            )


 def waveglow_process_eval_batch(tensors: dict, global_vars: dict):
@@ -62,14 +70,8 @@ def waveglow_process_eval_batch(tensors: dict, global_vars: dict):


 def waveglow_eval_log_to_tb_func(
-        swriter,
-        global_vars,
-        step,
-        tag=None,
-        n_fft=1024,
-        hop_length=256,
-        window="hann",
-        mel_fb=None):
+    swriter, global_vars, step, tag=None, n_fft=1024, hop_length=256, window="hann", mel_fb=None,
+):
     spec_target = global_vars['tensorboard']["mel_target"]
     audio_pred = global_vars['tensorboard']["audio_pred"]
     mel_length = global_vars['tensorboard']['mel_length']
@@ -82,33 +84,33 @@ def waveglow_eval_log_to_tb_func(
         n_fft=n_fft,
         hop_length=hop_length,
         window=window,
-        mel_fb=mel_fb)
+        mel_fb=mel_fb,
+    )


-def tacotron2_log_to_tb_func(swriter, tensors, step, tag="train",
-                             log_images=False, log_images_freq=1):
+def tacotron2_log_to_tb_func(swriter, tensors, step, tag="train", log_images=False, log_images_freq=1):
     loss, spec_target, mel_postnet, gate, gate_target, alignments = tensors
     if loss:
         swriter.add_scalar("loss", loss, step)
     if log_images and step % log_images_freq == 0:
         swriter.add_image(
-            f"{tag}_alignment",
-            plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T),
-            step, dataformats="HWC")
+            f"{tag}_alignment", plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), step, dataformats="HWC",
+        )
         swriter.add_image(
-            f"{tag}_mel_target",
-            plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()),
-            step, dataformats="HWC")
+            f"{tag}_mel_target", plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), step, dataformats="HWC",
+        )
         swriter.add_image(
             f"{tag}_mel_predicted",
             plot_spectrogram_to_numpy(mel_postnet[0].data.cpu().numpy()),
-            step, dataformats="HWC")
+            step,
+            dataformats="HWC",
+        )
         swriter.add_image(
             f"{tag}_gate",
-            plot_gate_outputs_to_numpy(
-                gate_target[0].data.cpu().numpy(),
-                torch.sigmoid(gate[0]).data.cpu().numpy()),
-            step, dataformats="HWC")
+            plot_gate_outputs_to_numpy(gate_target[0].data.cpu().numpy(), torch.sigmoid(gate[0]).data.cpu().numpy(),),
+            step,
+            dataformats="HWC",
+        )


 def tacotron2_process_eval_batch(tensors: dict, global_vars: dict):
@@ -149,11 +151,8 @@ def tacotron2_eval_log_to_tb_func(swriter, global_vars, step, tag=None):
     alignments = global_vars['tensorboard']["alignments"]
     swriter.add_scalar(f"{tag}.loss", global_vars['EvalLoss'], step)
     tacotron2_log_to_tb_func(
-        swriter,
-        [None, spec_target, mel_postnet, gate, gate_target, alignments],
-        step,
-        tag=tag,
-        log_images=True)
+        swriter, [None, spec_target, mel_postnet, gate, gate_target, alignments], step, tag=tag, log_images=True,
+    )


 def save_figure_to_numpy(fig):
@@ -165,8 +164,7 @@ def save_figure_to_numpy(fig):

 def plot_alignment_to_numpy(alignment, info=None):
     fig, ax = plt.subplots(figsize=(6, 4))
-    im = ax.imshow(alignment, aspect='auto', origin='lower',
-                   interpolation='none')
+    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
     fig.colorbar(im, ax=ax)
     xlabel = 'Decoder timestep'
     if info is not None:
@@ -183,8 +181,7 @@ def plot_alignment_to_numpy(alignment, info=None):

 def plot_spectrogram_to_numpy(spectrogram):
     fig, ax = plt.subplots(figsize=(12, 3))
-    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
-                   interpolation='none')
+    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none')
     plt.colorbar(im, ax=ax)
     plt.xlabel("Frames")
     plt.ylabel("Channels")
@@ -198,10 +195,12 @@ def plot_spectrogram_to_numpy(spectrogram):

 def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
     fig, ax = plt.subplots(figsize=(12, 3))
-    ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5,
-               color='green', marker='+', s=1, label='target')
-    ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
-               color='red', marker='.', s=1, label='predicted')
+    ax.scatter(
+        range(len(gate_targets)), gate_targets, alpha=0.5, color='green', marker='+', s=1, label='target',
+    )
+    ax.scatter(
+        range(len(gate_outputs)), gate_outputs, alpha=0.5, color='red', marker='.', s=1, label='predicted',
+    )
     plt.xlabel("Frames (Green target, Red predicted)")
     plt.ylabel("Gate State")
diff --git a/nemo/collections/tts/parts/layers.py b/nemo/collections/tts/parts/layers.py
index 3fcfdaa527cb..e78ef415b06e 100644
--- a/nemo/collections/tts/parts/layers.py
+++ b/nemo/collections/tts/parts/layers.py
@@ -8,28 +8,41 @@ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

         torch.nn.init.xavier_uniform_(
-            self.linear_layer.weight,
-            gain=torch.nn.init.calculate_gain(w_init_gain))
+            self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain),
+        )

     def forward(self, x):
         return self.linear_layer(x)


 class ConvNorm(torch.nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
-                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=None,
+        dilation=1,
+        bias=True,
+        w_init_gain='linear',
+    ):
         super(ConvNorm, self).__init__()
         if padding is None:
-            assert(kernel_size % 2 == 1)
+            assert kernel_size % 2 == 1
             padding = int(dilation * (kernel_size - 1) / 2)

-        self.conv = torch.nn.Conv1d(in_channels, out_channels,
-                                    kernel_size=kernel_size, stride=stride,
-                                    padding=padding, dilation=dilation,
-                                    bias=bias)
-
-        torch.nn.init.xavier_uniform_(
-            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+        self.conv = torch.nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias,
+        )
+
+        torch.nn.init.xavier_uniform_(self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

     def forward(self, signal):
         conv_signal = self.conv(signal)
diff --git a/nemo/collections/tts/parts/manifest.py b/nemo/collections/tts/parts/manifest.py
index 773669c012a7..a9d4780dffd5 100644
--- a/nemo/collections/tts/parts/manifest.py
+++ b/nemo/collections/tts/parts/manifest.py
@@ -3,12 +3,9 @@


 class AudioManifest(object):
-    def __init__(self,
-                 manifest_paths,
-                 max_duration=None,
-                 min_duration=None,
-                 sort_by_duration=False,
-                 max_utts=0):
+    def __init__(
+        self, manifest_paths, max_duration=None, min_duration=None, sort_by_duration=False, max_utts=0,
+    ):
         ids = []
         duration = 0.0
         filtered_duration = 0.0
@@ -17,12 +14,10 @@ def __init__(self,
             with open(manifest_path, "r", encoding="utf-8") as fh:
                 for line in fh:
                     data = json.loads(line)
-                    if min_duration is not None and data['duration'] \
-                            < min_duration:
+                    if min_duration is not None and data['duration'] < min_duration:
                         filtered_duration += data['duration']
                         continue
-                    if max_duration is not None and data['duration'] \
-                            > max_duration:
+                    if max_duration is not None and data['duration'] > max_duration:
                         filtered_duration += data['duration']
                         continue

@@ -32,9 +27,7 @@ def __init__(self,
                     duration += data['duration']

                     if max_utts > 0 and len(ids) >= max_utts:
-                        print(
-                            'Stopping parsing %s as max_utts=%d' % (
-                                manifest_path, max_utts))
+                        print('Stopping parsing %s as max_utts=%d' % (manifest_path, max_utts))
                         break

         if sort_by_duration:
diff --git a/nemo/collections/tts/parts/tacotron2.py b/nemo/collections/tts/parts/tacotron2.py
index ed7e6f17f92c..ea2a6551b0d2 100644
--- a/nemo/collections/tts/parts/tacotron2.py
+++ b/nemo/collections/tts/parts/tacotron2.py
@@ -2,24 +2,27 @@
 from math import sqrt

 import torch
-from torch.autograd import Variable
 from torch import nn
+from torch.autograd import Variable
 from torch.nn import functional as F

 from .layers import ConvNorm, LinearNorm, get_mask_from_lengths


 class LocationLayer(nn.Module):
-    def __init__(self, attention_n_filters, attention_kernel_size,
-                 attention_dim):
+    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
         super(LocationLayer, self).__init__()
         padding = int((attention_kernel_size - 1) / 2)
-        self.location_conv = ConvNorm(2, attention_n_filters,
-                                      kernel_size=attention_kernel_size,
-                                      padding=padding, bias=False, stride=1,
-                                      dilation=1)
-        self.location_dense = LinearNorm(attention_n_filters, attention_dim,
-                                         bias=False, w_init_gain='tanh')
+        self.location_conv = ConvNorm(
+            2,
+            attention_n_filters,
+            kernel_size=attention_kernel_size,
+            padding=padding,
+            bias=False,
+            stride=1,
+            dilation=1,
+        )
+        self.location_dense = LinearNorm(attention_n_filters, attention_dim, bias=False, w_init_gain='tanh')

     def forward(self, attention_weights_cat):
         processed_attention = self.location_conv(attention_weights_cat)
@@ -29,22 +32,24 @@ def forward(self, attention_weights_cat):


 class Attention(nn.Module):
-    def __init__(self, attention_rnn_dim, embedding_dim,
-                 attention_dim, attention_location_n_filters,
-                 attention_location_kernel_size):
+    def __init__(
+        self,
+        attention_rnn_dim,
+        embedding_dim,
+        attention_dim,
+        attention_location_n_filters,
+        attention_location_kernel_size,
+    ):
         super(Attention, self).__init__()
-        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
-                                      bias=False, w_init_gain='tanh')
-        self.memory_layer = LinearNorm(embedding_dim, attention_dim,
-                                       bias=False, w_init_gain='tanh')
+        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, bias=False, w_init_gain='tanh')
+        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, w_init_gain='tanh')
         self.v = LinearNorm(attention_dim, 1, bias=False)
-        self.location_layer = LocationLayer(attention_location_n_filters,
-                                            attention_location_kernel_size,
-                                            attention_dim)
+        self.location_layer = LocationLayer(
+            attention_location_n_filters, attention_location_kernel_size, attention_dim,
+        )
         self.score_mask_value = -float("inf")

-    def get_alignment_energies(self, query, processed_memory,
-                               attention_weights_cat):
+    def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
         """
         PARAMS
         ------
@@ -58,16 +63,15 @@ def get_alignment_energies(self, query, processed_memory,
         """

         processed_query = self.query_layer(query.unsqueeze(1))
-        processed_attention_weights = self.location_layer(
-            attention_weights_cat)
-        energies = self.v(torch.tanh(
-            processed_query + processed_attention_weights + processed_memory))
+        processed_attention_weights = self.location_layer(attention_weights_cat)
+        energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory))

         energies = energies.squeeze(-1)
         return energies

-    def forward(self, attention_hidden_state, memory, processed_memory,
-                attention_weights_cat, mask):
+    def forward(
+        self, attention_hidden_state, memory, processed_memory, attention_weights_cat, mask,
+    ):
         """
         PARAMS
         ------
@@ -77,8 +81,7 @@ def forward(self, attention_hidden_state, memory, processed_memory,
             attention_weights_cat: previous and cummulative attention weights
             mask: binary mask for padded data
         """
-        alignment = self.get_alignment_energies(
-            attention_hidden_state, processed_memory, attention_weights_cat)
+        alignment = self.get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat)

         if mask is not None:
             alignment.data.masked_fill_(mask, self.score_mask_value)
@@ -96,22 +99,20 @@ def __init__(self, in_dim, sizes, p_dropout=0.5):
         in_sizes = [in_dim] + sizes[:-1]
         self.p_dropout = p_dropout
         self.layers = nn.ModuleList(
-            [LinearNorm(in_size, out_size, bias=False)
-             for (in_size, out_size) in zip(in_sizes, sizes)])
+            [LinearNorm(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, sizes)]
+        )

     def forward(self, x, inference=False):
         if inference:
             for linear in self.layers:
                 x = F.relu(linear(x))
                 x0 = x[0].unsqueeze(0)
-                mask = Variable(
-                    torch.bernoulli(x0.data.new(x0.data.size()).fill_(0.5)))
+                mask = Variable(torch.bernoulli(x0.data.new(x0.data.size()).fill_(0.5)))
                 mask = mask.expand(x.size(0), x.size(1))
-                x = x*mask*2
+                x = x * mask * 2
         else:
             for linear in self.layers:
-                x = F.dropout(
-                    F.relu(linear(x)), p=0., training=True)
+                x = F.dropout(F.relu(linear(x)), p=0.0, training=True)
         return x


@@ -120,46 +121,62 @@ class Postnet(nn.Module):
     - Five 1-d convolution with 512 channels and kernel size 5
     """

-    def __init__(self, n_mel_channels, postnet_embedding_dim,
-                 postnet_kernel_size, postnet_n_convolutions,
-                 p_dropout=0.5):
+    def __init__(
+        self, n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions, p_dropout=0.5,
+    ):
         super(Postnet, self).__init__()
         self.convolutions = nn.ModuleList()

         self.convolutions.append(
             nn.Sequential(
-                ConvNorm(n_mel_channels, postnet_embedding_dim,
-                         kernel_size=postnet_kernel_size, stride=1,
-                         padding=int((postnet_kernel_size - 1) / 2),
-                         dilation=1, w_init_gain='tanh'),
-                nn.BatchNorm1d(postnet_embedding_dim))
+                ConvNorm(
+                    n_mel_channels,
+                    postnet_embedding_dim,
+                    kernel_size=postnet_kernel_size,
+                    stride=1,
+                    padding=int((postnet_kernel_size - 1) / 2),
+                    dilation=1,
+                    w_init_gain='tanh',
+                ),
+                nn.BatchNorm1d(postnet_embedding_dim),
+            )
         )

         for _ in range(1, postnet_n_convolutions - 1):
             self.convolutions.append(
                 nn.Sequential(
-                    ConvNorm(postnet_embedding_dim,
-                             postnet_embedding_dim,
-                             kernel_size=postnet_kernel_size, stride=1,
-                             padding=int((postnet_kernel_size - 1) / 2),
-                             dilation=1, w_init_gain='tanh'),
-                    nn.BatchNorm1d(postnet_embedding_dim))
+                    ConvNorm(
+                        postnet_embedding_dim,
+                        postnet_embedding_dim,
+                        kernel_size=postnet_kernel_size,
+                        stride=1,
+                        padding=int((postnet_kernel_size - 1) / 2),
+                        dilation=1,
+                        w_init_gain='tanh',
+                    ),
+                    nn.BatchNorm1d(postnet_embedding_dim),
+                )
             )

         self.convolutions.append(
             nn.Sequential(
-                ConvNorm(postnet_embedding_dim, n_mel_channels,
-                         kernel_size=postnet_kernel_size, stride=1,
-                         padding=int((postnet_kernel_size - 1) / 2),
-                         dilation=1, w_init_gain='linear'),
-                nn.BatchNorm1d(n_mel_channels))
+                ConvNorm(
+                    postnet_embedding_dim,
+                    n_mel_channels,
+                    kernel_size=postnet_kernel_size,
+                    stride=1,
+                    padding=int((postnet_kernel_size - 1) / 2),
+                    dilation=1,
+                    w_init_gain='linear',
+                ),
+                nn.BatchNorm1d(n_mel_channels),
+            )
         )
         self.p_dropout = p_dropout

     def forward(self, x):
         for i in range(len(self.convolutions) - 1):
-            x = F.dropout(torch.tanh(
-                self.convolutions[i](x)), 0.5, self.training)
+            x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
         x = F.dropout(self.convolutions[-1](x), self.p_dropout, self.training)

         return x
@@ -171,25 +188,31 @@ class Encoder(nn.Module):
         - Bidirectional LSTM
     """

-    def __init__(self, encoder_n_convolutions,
-                 encoder_embedding_dim, encoder_kernel_size):
+    def __init__(
+        self, encoder_n_convolutions, encoder_embedding_dim, encoder_kernel_size,
+    ):
         super(Encoder, self).__init__()

         convolutions = []
         for _ in range(encoder_n_convolutions):
             conv_layer = nn.Sequential(
-                ConvNorm(encoder_embedding_dim,
-                         encoder_embedding_dim,
-                         kernel_size=encoder_kernel_size, stride=1,
-                         padding=int((encoder_kernel_size - 1) / 2),
-                         dilation=1, w_init_gain='relu'),
-                nn.BatchNorm1d(encoder_embedding_dim))
+                ConvNorm(
+                    encoder_embedding_dim,
+                    encoder_embedding_dim,
+                    kernel_size=encoder_kernel_size,
+                    stride=1,
+                    padding=int((encoder_kernel_size - 1) / 2),
+                    dilation=1,
+                    w_init_gain='relu',
+                ),
+                nn.BatchNorm1d(encoder_embedding_dim),
+            )
             convolutions.append(conv_layer)
         self.convolutions = nn.ModuleList(convolutions)

-        self.lstm = nn.LSTM(encoder_embedding_dim,
-                            int(encoder_embedding_dim / 2), 1,
-                            batch_first=True, bidirectional=True)
+        self.lstm = nn.LSTM(
+            encoder_embedding_dim, int(encoder_embedding_dim / 2), 1, batch_first=True, bidirectional=True,
+        )

     def forward(self, x, input_lengths):
         for conv in self.convolutions:
@@ -199,35 +222,35 @@ def forward(self, x, input_lengths):

         # pytorch tensor are not reversible, hence the conversion
         input_lengths = input_lengths.cpu().numpy()
-        x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths, batch_first=True, enforce_sorted=False)
+        x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)

         self.lstm.flatten_parameters()
         outputs, _ = self.lstm(x)

-        outputs, _ = nn.utils.rnn.pad_packed_sequence(
-            outputs, batch_first=True)
+        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

         return outputs


 class Decoder(nn.Module):
-    def __init__(self,
-                 n_mel_channels,
-                 n_frames_per_step,
-                 encoder_embedding_dim,
-                 attention_dim,
-                 attention_location_n_filters,
-                 attention_location_kernel_size,
-                 attention_rnn_dim,
-                 decoder_rnn_dim,
-                 prenet_dim,
-                 max_decoder_steps,
-                 gate_threshold,
-                 p_attention_dropout,
-                 p_decoder_dropout,
-                 early_stopping,
-                 prenet_p_dropout=0.5):
+    def __init__(
+        self,
+        n_mel_channels,
+        n_frames_per_step,
+        encoder_embedding_dim,
+        attention_dim,
+        attention_location_n_filters,
+        attention_location_kernel_size,
+        attention_rnn_dim,
+        decoder_rnn_dim,
+        prenet_dim,
+        max_decoder_steps,
+        gate_threshold,
+        p_attention_dropout,
+        p_decoder_dropout,
+        early_stopping,
+        prenet_p_dropout=0.5,
+    ):
         super(Decoder, self).__init__()
         self.n_mel_channels = n_mel_channels
         self.n_frames_per_step = n_frames_per_step
@@ -241,31 +264,25 @@ def __init__(self,
         self.p_decoder_dropout = p_decoder_dropout
         self.early_stopping = early_stopping

-        self.prenet = Prenet(
-            n_mel_channels * n_frames_per_step,
-            [prenet_dim, prenet_dim],
-            prenet_p_dropout)
+        self.prenet = Prenet(n_mel_channels * n_frames_per_step, [prenet_dim, prenet_dim], prenet_p_dropout,)

-        self.attention_rnn = nn.LSTMCell(
-            prenet_dim + encoder_embedding_dim,
-            attention_rnn_dim)
+        self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim)

         self.attention_layer = Attention(
-            attention_rnn_dim, encoder_embedding_dim,
-            attention_dim, attention_location_n_filters,
-            attention_location_kernel_size)
+            attention_rnn_dim,
+            encoder_embedding_dim,
+            attention_dim,
+            attention_location_n_filters,
+            attention_location_kernel_size,
+        )

-        self.decoder_rnn = nn.LSTMCell(
-            attention_rnn_dim + encoder_embedding_dim,
-            decoder_rnn_dim, 1)
+        self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, 1)

         self.linear_projection = LinearNorm(
-            decoder_rnn_dim + encoder_embedding_dim,
-            n_mel_channels * n_frames_per_step)
+            decoder_rnn_dim + encoder_embedding_dim, n_mel_channels * n_frames_per_step,
+        )

-        self.gate_layer = LinearNorm(
-            decoder_rnn_dim + encoder_embedding_dim, 1,
-            bias=True, w_init_gain='sigmoid')
+        self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain='sigmoid',)

     def get_go_frame(self, memory):
         """ Gets all zeros frames to use as first decoder input
@@ -277,8 +294,7 @@ def get_go_frame(self, memory):
         decoder_input: all zeros frames
         """
         B = memory.size(0)
-        decoder_input = Variable(memory.data.new(
-            B, self.n_mel_channels * self.n_frames_per_step).zero_())
+        decoder_input = Variable(memory.data.new(B, self.n_mel_channels * self.n_frames_per_step).zero_())
         return decoder_input

     def initialize_decoder_states(self, memory, mask):
@@ -293,22 +309,15 @@ def initialize_decoder_states(self, memory, mask):
         B = memory.size(0)
         MAX_TIME = memory.size(1)

-        self.attention_hidden = Variable(memory.data.new(
-            B, self.attention_rnn_dim).zero_())
-        self.attention_cell = Variable(memory.data.new(
-            B, self.attention_rnn_dim).zero_())
+        self.attention_hidden = Variable(memory.data.new(B, self.attention_rnn_dim).zero_())
+        self.attention_cell = Variable(memory.data.new(B, self.attention_rnn_dim).zero_())

-        self.decoder_hidden = Variable(memory.data.new(
-            B, self.decoder_rnn_dim).zero_())
-        self.decoder_cell = Variable(memory.data.new(
-            B, self.decoder_rnn_dim).zero_())
+        self.decoder_hidden = Variable(memory.data.new(B, self.decoder_rnn_dim).zero_())
+        self.decoder_cell = Variable(memory.data.new(B, self.decoder_rnn_dim).zero_())

-        self.attention_weights = Variable(memory.data.new(
-            B, MAX_TIME).zero_())
-        self.attention_weights_cum = Variable(memory.data.new(
-            B, MAX_TIME).zero_())
-        self.attention_context = Variable(memory.data.new(
-            B, self.encoder_embedding_dim).zero_())
+        self.attention_weights = Variable(memory.data.new(B, MAX_TIME).zero_())
+        self.attention_weights_cum = Variable(memory.data.new(B, MAX_TIME).zero_())
+        self.attention_context = Variable(memory.data.new(B, self.encoder_embedding_dim).zero_())

         self.memory = memory
         self.processed_memory = self.attention_layer.memory_layer(memory)
@@ -326,8 +335,8 @@ def parse_decoder_inputs(self, decoder_inputs):
         # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
         decoder_inputs = decoder_inputs.transpose(1, 2)
         decoder_inputs = decoder_inputs.view(
-            decoder_inputs.size(0),
-            int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
+            decoder_inputs.size(0), int(decoder_inputs.size(1) / self.n_frames_per_step), -1,
+        )
         # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
         decoder_inputs = decoder_inputs.transpose(0, 1)
         return decoder_inputs
@@ -355,8 +364,7 @@ def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
         # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
         mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
         # decouple frames per step
-        mel_outputs = mel_outputs.view(
-            mel_outputs.size(0), -1, self.n_mel_channels)
+        mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, self.n_mel_channels)
         # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
         mel_outputs = mel_outputs.transpose(1, 2)

@@ -376,30 +384,27 @@ def decode(self, decoder_input):
         cell_input = torch.cat((decoder_input, self.attention_context), -1)

         self.attention_hidden, self.attention_cell = self.attention_rnn(
-            cell_input, (self.attention_hidden, self.attention_cell))
-        self.attention_hidden = F.dropout(
-            self.attention_hidden, self.p_attention_dropout, self.training)
+            cell_input, (self.attention_hidden, self.attention_cell)
+        )
+        self.attention_hidden = F.dropout(self.attention_hidden, self.p_attention_dropout, self.training)

         attention_weights_cat = torch.cat(
-            (self.attention_weights.unsqueeze(1),
-             self.attention_weights_cum.unsqueeze(1)), dim=1)
+            (self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1),), dim=1,
+        )
         self.attention_context, self.attention_weights = self.attention_layer(
-            self.attention_hidden, self.memory, self.processed_memory,
-            attention_weights_cat, self.mask)
+            self.attention_hidden, self.memory, self.processed_memory, attention_weights_cat, self.mask,
+        )

         self.attention_weights_cum += self.attention_weights
-        decoder_input = torch.cat(
-            (self.attention_hidden, self.attention_context), -1)
+        decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)

         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
-            decoder_input, (self.decoder_hidden, self.decoder_cell))
-        self.decoder_hidden = F.dropout(
-            self.decoder_hidden, self.p_decoder_dropout, self.training)
+            decoder_input, (self.decoder_hidden, self.decoder_cell)
+        )
+        self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)

-        decoder_hidden_attention_context = torch.cat(
-            (self.decoder_hidden, self.attention_context), dim=1)
-        decoder_output = self.linear_projection(
-            decoder_hidden_attention_context)
+        decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
+        decoder_output = self.linear_projection(decoder_hidden_attention_context)

         gate_prediction = self.gate_layer(decoder_hidden_attention_context)
         return decoder_output, gate_prediction, self.attention_weights
@@ -423,21 +428,18 @@ def forward(self, memory, decoder_inputs, memory_lengths):
         decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
         decoder_inputs = self.prenet(decoder_inputs)

-        self.initialize_decoder_states(
-            memory, mask=~get_mask_from_lengths(memory_lengths))
+        self.initialize_decoder_states(memory, mask=~get_mask_from_lengths(memory_lengths))

         mel_outputs, gate_outputs, alignments = [], [], []
         while len(mel_outputs) < decoder_inputs.size(0) - 1:
             decoder_input = decoder_inputs[len(mel_outputs)]
-            mel_output, gate_output, attention_weights = self.decode(
-                decoder_input)
+            mel_output, gate_output, attention_weights = self.decode(decoder_input)

             mel_outputs += [mel_output.squeeze(1)]
             gate_outputs += [gate_output.squeeze()]
             alignments += [attention_weights]

-        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
-            mel_outputs, gate_outputs, alignments)
+        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)

         return mel_outputs, gate_outputs, alignments

@@ -472,10 +474,9 @@ def infer(self, memory, memory_lengths):
             decoder_input = self.prenet(decoder_input, inference=True)
             mel_output, gate_output, alignment = self.decode(decoder_input)

-            dec = torch.le(torch.sigmoid(gate_output.data),
-                           self.gate_threshold).to(torch.int32).squeeze(1)
+            dec = torch.le(torch.sigmoid(gate_output.data), self.gate_threshold).to(torch.int32).squeeze(1)

-            not_finished = not_finished*dec
+            not_finished = not_finished * dec
             mel_lengths += not_finished

             if self.early_stopping and torch.sum(not_finished) == 0:
@@ -491,7 +492,6 @@ def infer(self, memory, memory_lengths):

             decoder_input = mel_output

-        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
-            mel_outputs, gate_outputs, alignments)
+        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)

         return mel_outputs, gate_outputs, alignments, mel_lengths
diff --git a/nemo/collections/tts/parts/waveglow.py b/nemo/collections/tts/parts/waveglow.py
index 79e251ee1376..8fc011dd296e 100644
--- a/nemo/collections/tts/parts/waveglow.py
+++ b/nemo/collections/tts/parts/waveglow.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2019 NVIDIA Corporation
 import torch
-from torch.autograd import Variable
 import torch.nn.functional as F
+from torch.autograd import Variable


 @torch.jit.script
@@ -23,8 +23,7 @@ class Invertible1x1Conv(torch.nn.Module):

     def __init__(self, c):
         super(Invertible1x1Conv, self).__init__()
-        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
-                                    bias=False)
+        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, bias=False)

         # Sample a random orthonormal matrix to initialize weights
         W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
@@ -46,8 +45,7 @@ def forward(self, z, reverse=False):
                 # Reverse computation
                 W_inverse = W.float().inverse()
                 W_inverse = Variable(W_inverse[..., None])
-                if (z.type() == 'torch.cuda.HalfTensor'
-                        or z.type() == 'torch.HalfTensor'):
+                if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor':
                     W_inverse = W_inverse.half()
                 self.W_inverse = W_inverse
             z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
@@ -66,11 +64,10 @@ class WN(torch.nn.Module):
     also no dilation size reset. The dilation only doubles on each layer
     """

-    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
-                 kernel_size):
+    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size):
         super(WN, self).__init__()
-        assert(kernel_size % 2 == 1)
-        assert(n_channels % 2 == 0)
+        assert kernel_size % 2 == 1
+        assert n_channels % 2 == 0
         self.n_layers = n_layers
         self.n_channels = n_channels
         self.in_layers = torch.nn.ModuleList()
@@ -91,8 +88,7 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
         for i in range(n_layers):
             dilation = 2 ** i
             padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size,
-                                       dilation=dilation, padding=padding)
+            in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size, dilation=dilation, padding=padding,)
             in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
             self.in_layers.append(in_layer)

@@ -106,8 +102,7 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
             else:
                 res_skip_channels = n_channels
             res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
-            res_skip_layer = torch.nn.utils.weight_norm(
-                res_skip_layer, name='weight')
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
             self.res_skip_layers.append(res_skip_layer)

     def forward(self, forward_input):
@@ -116,14 +111,13 @@ def forward(self, forward_input):

         for i in range(self.n_layers):
             acts = fused_add_tanh_sigmoid_multiply(
-                self.in_layers[i](audio),
-                self.cond_layers[i](spect),
-                torch.IntTensor([self.n_channels]))
+                self.in_layers[i](audio), self.cond_layers[i](spect), torch.IntTensor([self.n_channels]),
+            )

             res_skip_acts = self.res_skip_layers[i](acts)
             if i < self.n_layers - 1:
-                audio = res_skip_acts[:, :self.n_channels, :] + audio
-                skip_acts = res_skip_acts[:, self.n_channels:, :]
+                audio = res_skip_acts[:, : self.n_channels, :] + audio
+                skip_acts = res_skip_acts[:, self.n_channels :, :]
             else:
                 skip_acts = res_skip_acts

@@ -135,14 +129,13 @@ def forward(self, forward_input):


 class WaveGlow(torch.nn.Module):
-    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
-                 n_early_size, WN_config):
+    def __init__(
+        self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config,
+    ):
         super(WaveGlow, self).__init__()

-        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
-                                                 n_mel_channels,
-                                                 1024, stride=256)
-        assert(n_group % 2 == 0)
+        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, n_mel_channels, 1024, stride=256)
+        assert n_group % 2 == 0
         self.n_flows = n_flows
         self.n_group = n_group
         self.n_early_every = n_early_every
@@ -172,9 +165,9 @@ def forward(self, forward_input):

         # Upsample spectrogram to size of audio
         spect = self.upsample(spect)
-        assert(spect.size(2) >= audio.size(1))
+        assert spect.size(2) >= audio.size(1)
         if spect.size(2) > audio.size(1):
-            spect = spect[:, :, :audio.size(1)]
+            spect = spect[:, :, : audio.size(1)]

         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
         spect = spect.contiguous().view(spect.size(0), spect.size(1), -1)
@@ -187,8 +180,8 @@ def forward(self, forward_input):

         for k in range(self.n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                output_audio.append(audio[:, :self.n_early_size, :])
-                audio = audio[:, self.n_early_size:, :]
+                output_audio.append(audio[:, : self.n_early_size, :])
+                audio = audio[:, self.n_early_size :, :]

             audio, log_det_W = self.convinv[k](audio)
             log_det_W_list.append(log_det_W)
@@ -218,9 +211,9 @@ def infer(self, spect, sigma=1.0):
         spect = spect.contiguous().view(spect.size(0), spect.size(1), -1)
         spect = spect.permute(0, 2, 1)

-        audio = torch.randn(spect.size(0),
-                            self.n_remaining_channels,
-                            spect.size(2), device=spect.device).to(spect.dtype)
+        audio = torch.randn(spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device,).to(
+            spect.dtype
+        )

         audio = torch.autograd.Variable(sigma * audio)

@@ -238,13 +231,10 @@ def infer(self, spect, sigma=1.0):
             audio = self.convinv[k](audio, reverse=True)

             if k % self.n_early_every == 0 and k > 0:
-                z = torch.randn(spect.size(0), self.n_early_size, spect.size(
-                    2), device=spect.device).to(spect.dtype)
+                z = torch.randn(spect.size(0), self.n_early_size, spect.size(2), device=spect.device,).to(spect.dtype)
                 audio = torch.cat((sigma * z, audio), 1)

-        audio = audio.permute(
-            0, 2, 1).contiguous().view(
-            audio.size(0), -1).data
+        audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
         return audio

     @staticmethod
diff --git a/nemo/collections/tts/tacotron2_modules.py b/nemo/collections/tts/tacotron2_modules.py
index 12d0e2478bb4..c2feff724d02 100644
--- a/nemo/collections/tts/tacotron2_modules.py
+++ b/nemo/collections/tts/tacotron2_modules.py
@@ -5,10 +5,11 @@
 from torch import nn
 from torch.nn.functional import pad

-from nemo.backends.pytorch.nm import TrainableNM, NonTrainableNM, LossNM
+from nemo.backends.pytorch.nm import LossNM, NonTrainableNM, TrainableNM
 from nemo.core.neural_types import *
-from .parts.tacotron2 import Encoder, Decoder, Postnet
+
 from .parts.layers import get_mask_from_lengths
+from .parts.tacotron2 import Decoder, Encoder, Postnet

 __all__ = [
     "MakeGate",
@@ -17,7 +18,7 @@
     "Tacotron2Decoder",
     "Tacotron2DecoderInfer",
     "Tacotron2Encoder",
-    "TextEmbedding"
+    "TextEmbedding",
 ]


@@ -41,11 +42,7 @@ def input_ports(self):
             1: AxisType(TimeTag)
         """
-        return {
-            "char_phone": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)})
-        }
+        return {"char_phone": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})}

     @property
     def output_ports(self):
@@ -59,16 +56,14 @@ def output_ports(self):
                 2: AxisType(TimeTag)})
         """
         return {
-            "char_phone_embeddings": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(EmbeddedTextTag),
-                2: AxisType(TimeTag)})
+            "char_phone_embeddings": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(EmbeddedTextTag), 2: AxisType(TimeTag),}
+            )
         }

     def __init__(self, n_symbols, symbols_embedding_dim: int = 512, **kwargs):
         super().__init__(**kwargs)
-        self.embedding = nn.Embedding(
-            n_symbols, symbols_embedding_dim)
+        self.embedding = nn.Embedding(n_symbols, symbols_embedding_dim)
         self.to(self._device)

     def forward(self, char_phone):
@@ -105,12 +100,10 @@ def input_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "char_phone_embeddings": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(EmbeddedTextTag),
-                2: AxisType(TimeTag)}),
-            "embedding_length": NeuralType({
-                0: AxisType(BatchTag)})
+            "char_phone_embeddings": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(EmbeddedTextTag), 2: AxisType(TimeTag),}
+            ),
+            "embedding_length": NeuralType({0: AxisType(BatchTag)}),
         }

     @property
@@ -125,27 +118,24 @@ def output_ports(self):
                 2: AxisType(EncodedRepresentationTag)})
         """
         return {
-            "char_phone_encoded": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(EncodedRepresentationTag)})
+            "char_phone_encoded": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(EncodedRepresentationTag),}
+            )
         }

     def __init__(
-            self,
-            encoder_n_convolutions: int = 5,
-            encoder_embedding_dim: int = 512,
-            encoder_kernel_size: int = 3,
-            **kwargs):
+        self, encoder_n_convolutions: int = 5, encoder_embedding_dim: int = 512, encoder_kernel_size: int = 3, **kwargs
+    ):
         super().__init__(**kwargs)
-        self.encoder = Encoder(encoder_n_convolutions=encoder_n_convolutions,
-                               encoder_embedding_dim=encoder_embedding_dim,
-                               encoder_kernel_size=encoder_kernel_size)
+        self.encoder = Encoder(
+            encoder_n_convolutions=encoder_n_convolutions,
+            encoder_embedding_dim=encoder_embedding_dim,
+            encoder_kernel_size=encoder_kernel_size,
+        )
         self.to(self._device)

     def forward(self, char_phone_embeddings, embedding_length):
-        char_phone_encoded = self.encoder(
-            char_phone_embeddings, embedding_length)
+        char_phone_encoded = self.encoder(char_phone_embeddings, embedding_length)
         return char_phone_encoded


@@ -209,16 +199,13 @@ def input_ports(self):
             2: AxisType(TimeTag)
         """
         return {
-            "char_phone_encoded": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(EncodedRepresentationTag)}),
-            "encoded_length": NeuralType({
-                0: AxisType(BatchTag)}),
-            "mel_target": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)})
+            "char_phone_encoded": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(EncodedRepresentationTag),}
+            ),
+            "encoded_length": NeuralType({0: AxisType(BatchTag)}),
+            "mel_target": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
         }

     @property
@@ -245,36 +232,31 @@ def output_ports(self):
             2: AxisType(TimeTag)
         """
         return {
-            "mel_output": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)}),
-            "gate_output": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)}),
-            "alignments": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(TimeTag)})
+            "mel_output": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
+            "gate_output": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "alignments": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(TimeTag),}),
         }

     def __init__(
-            self,
-            n_mel_channels: int,
-            n_frames_per_step: int = 1,
-            encoder_embedding_dim: int = 512,
-            gate_threshold: float = 0.5,
-            prenet_dim: int = 256,
-            max_decoder_steps: int = 1000,
-            decoder_rnn_dim: int = 1024,
-            p_decoder_dropout: float = 0.1,
-            p_attention_dropout: float = 0.1,
-            attention_rnn_dim: int = 1024,
-            attention_dim: int = 128,
-            attention_location_n_filters: int = 32,
-            attention_location_kernel_size: int = 31,
-            prenet_p_dropout: float = 0.5,
-            **kwargs):
+        self,
+        n_mel_channels: int,
+        n_frames_per_step: int = 1,
+        encoder_embedding_dim: int = 512,
+        gate_threshold: float = 0.5,
+        prenet_dim: int = 256,
+        max_decoder_steps: int = 1000,
+        decoder_rnn_dim: int = 1024,
+        p_decoder_dropout: float = 0.1,
+        p_attention_dropout: float = 0.1,
+        attention_rnn_dim: int = 1024,
+        attention_dim: int = 128,
+        attention_location_n_filters: int = 32,
+        attention_location_kernel_size: int = 31,
+        prenet_p_dropout: float = 0.5,
+        **kwargs
+    ):
         super().__init__(**kwargs)
         self.decoder = Decoder(
             n_mel_channels=n_mel_channels,
@@ -291,16 +273,19 @@ def __init__(
             attention_location_n_filters=attention_location_n_filters,
             attention_location_kernel_size=attention_location_kernel_size,
             prenet_p_dropout=prenet_p_dropout,
-            early_stopping=True)
+            early_stopping=True,
+        )
         self.to(self._device)

     def forward(self, char_phone_encoded, encoded_length, mel_target):
         if self.training:
             mel_output, gate_output, alignments = self.decoder(
-                char_phone_encoded, mel_target, memory_lengths=encoded_length)
+                char_phone_encoded, mel_target, memory_lengths=encoded_length
+            )
         else:
             mel_output, gate_output, alignments, _ = self.decoder.infer(
-                char_phone_encoded, memory_lengths=encoded_length)
+                char_phone_encoded, memory_lengths=encoded_length
+            )
         return mel_output, gate_output, alignments


@@ -355,12 +340,10 @@ def input_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "char_phone_encoded": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(EncodedRepresentationTag)}),
-            "encoded_length": NeuralType({
-                0: AxisType(BatchTag)})
+            "char_phone_encoded": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(EncodedRepresentationTag),}
+            ),
+            "encoded_length": NeuralType({0: AxisType(BatchTag)}),
         }

     @property
@@ -390,19 +373,12 @@ def output_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "mel_output": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)}),
-            "gate_output": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)}),
-            "alignments": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag),
-                2: AxisType(TimeTag)}),
-            "mel_len": NeuralType({
-                0: AxisType(BatchTag)})
+            "mel_output": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
+            "gate_output": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "alignments": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(TimeTag),}),
+            "mel_len": NeuralType({0: AxisType(BatchTag)}),
         }

     def __str__(self):
@@ -410,11 +386,11 @@ def __str__(self):

     def forward(self, char_phone_encoded, encoded_length):
         if self.training:
-            raise ValueError("You are using the Tacotron 2 Infer Neural Module"
-                             " in training mode.")
+            raise ValueError("You are using the Tacotron 2 Infer Neural Module" " in training mode.")
         with torch.no_grad():
             mel_output, gate_output, alignments, mel_len = self.decoder.infer(
-                char_phone_encoded, memory_lengths=encoded_length)
+                char_phone_encoded, memory_lengths=encoded_length
+            )
         return mel_output, gate_output, alignments, mel_len


@@ -446,11 +422,9 @@ def input_ports(self):
             2: AxisType(TimeTag)
         """
         return {
-            "mel_input": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag),
-            })
+            "mel_input": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            )
         }

     @property
@@ -465,27 +439,28 @@ def output_ports(self):
             2: AxisType(TimeTag)
         """
         return {
-            "mel_output": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)
-            }),
+            "mel_output": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
         }

     def __init__(
-            self,
-            n_mel_channels: int,
-            postnet_embedding_dim: int = 512,
-            postnet_kernel_size: int = 5,
-            postnet_n_convolutions: int = 5,
-            p_dropout: float = 0.5,
-            **kwargs):
+        self,
+        n_mel_channels: int,
+        postnet_embedding_dim: int = 512,
+        postnet_kernel_size: int = 5,
+        postnet_n_convolutions: int = 5,
+        p_dropout: float = 0.5,
+        **kwargs
+    ):
         super().__init__(**kwargs)
-        self.postnet = Postnet(n_mel_channels=n_mel_channels,
-                               postnet_embedding_dim=postnet_embedding_dim,
-                               postnet_kernel_size=postnet_kernel_size,
-                               postnet_n_convolutions=postnet_n_convolutions,
-                               p_dropout=p_dropout)
+        self.postnet = Postnet(
+            n_mel_channels=n_mel_channels,
+            postnet_embedding_dim=postnet_embedding_dim,
+            postnet_kernel_size=postnet_kernel_size,
+            postnet_n_convolutions=postnet_n_convolutions,
+            p_dropout=p_dropout,
+        )
         self.to(self._device)

     def forward(self, mel_input):
@@ -549,28 +524,19 @@ def input_ports(self):
             0: AxisType(BatchTag)
         """
         return {
-            "mel_out": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)}),
-            "mel_out_postnet": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)}),
-            "gate_out": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)}),
-            "mel_target": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)}),
-            "gate_target": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)}),
-            "target_len": NeuralType({
-                0: AxisType(BatchTag)}),
-            "seq_len": NeuralType({
-                0: AxisType(BatchTag)}),
+            "mel_out": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
+            "mel_out_postnet": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
+            "gate_out": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "mel_target": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
+            "gate_target": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "target_len": NeuralType({0: AxisType(BatchTag)}),
+            "seq_len": NeuralType({0: AxisType(BatchTag)}),
         }

     @property
@@ -580,9 +546,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}

     def __init__(self, pad_value: float = -11.52, **kwargs):
         super().__init__(**kwargs)
@@ -591,8 +555,9 @@ def __init__(self, pad_value: float = -11.52, **kwargs):
     def _loss_function(self, **kwargs):
         return self._loss(*(kwargs.values()))

-    def _loss(self, mel_out, mel_out_postnet, gate_out,
-              mel_target, gate_target, target_len, seq_len):
+    def _loss(
+        self, mel_out, mel_out_postnet, gate_out, mel_target, gate_target, target_len, seq_len,
+    ):
         mel_target.requires_grad = False
         gate_target.requires_grad = False
         gate_target = gate_target.view(-1, 1)
@@ -609,8 +574,7 @@ def _loss(self, mel_out, mel_out_postnet, gate_out,
             # Need to do padding
             pad_amount = max_len - mel_out.shape[2]
             mel_out = pad(mel_out, (0, pad_amount), value=self.pad_value)
-            mel_out_postnet = pad(
-                mel_out_postnet, (0, pad_amount), value=self.pad_value)
+            mel_out_postnet = pad(mel_out_postnet, (0, pad_amount), value=self.pad_value)
             gate_out = pad(gate_out, (0, pad_amount), value=1e3)
             max_len = mel_out.shape[2]

@@ -622,8 +586,7 @@ def _loss(self, mel_out, mel_out_postnet, gate_out,
         gate_out.data.masked_fill_(mask[:, 0, :], 1e3)
         gate_out = gate_out.view(-1, 1)

-        mel_loss = nn.MSELoss()(mel_out, mel_target) + \
-            nn.MSELoss()(mel_out_postnet, mel_target)
+        mel_loss = nn.MSELoss()(mel_out, mel_target) + nn.MSELoss()(mel_out_postnet, mel_target)
         gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
         return mel_loss + gate_loss

@@ -647,12 +610,10 @@ def input_ports(self):
             2: AxisType(TimeTag)
         """
         return {
-            "target_len": NeuralType({
-                0: AxisType(BatchTag)}),
-            "mel_target": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)}),
+            "target_len": NeuralType({0: AxisType(BatchTag)}),
+            "mel_target": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
         }

     @property
@@ -664,16 +625,12 @@ def output_ports(self):
             1: AxisType(TimeTag)
         """
-        return {
-            "gate_target": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)})
-        }
+        return {"gate_target": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})}

     def forward(self, target_len, mel_target):
         max_len = mel_target.shape[2]
         gate_padded = torch.FloatTensor(target_len.shape[0], max_len)
         gate_padded.zero_()
         for i, length in enumerate(target_len):
-            gate_padded[i, length.data-1:] = 1
+            gate_padded[i, length.data - 1 :] = 1
         return gate_padded.to(device=self._device)
diff --git a/nemo/collections/tts/waveglow_modules.py b/nemo/collections/tts/waveglow_modules.py
index bf3e50e3240c..54f184cee689 100644
--- a/nemo/collections/tts/waveglow_modules.py
+++ b/nemo/collections/tts/waveglow_modules.py
@@ -3,15 +3,12 @@
 import numpy as np
 import torch

-from nemo.backends.pytorch.nm import TrainableNM, LossNM
+from nemo.backends.pytorch.nm import LossNM, TrainableNM
 from nemo.core.neural_types import *
+
 from .parts.waveglow import WaveGlow

-__all__ = [
-    "WaveGlowNM",
-    "WaveGlowInferNM",
-    "WaveGlowLoss"
-]
+__all__ = ["WaveGlowNM", "WaveGlowInferNM", "WaveGlowLoss"]


 class WaveGlowNM(TrainableNM):
@@ -59,12 +56,9 @@ def input_ports(self):
         """
         return {
             "mel_spectrogram": NeuralType(
-                {0: AxisType(BatchTag),
-                 1: AxisType(MelSpectrogramSignalTag),
-                 2: AxisType(TimeTag)}),
-            "audio": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)})
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            ),
+            "audio": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
         }

     @property
@@ -85,29 +79,28 @@ def output_ports(self):
         """
         # TODO @blisc: please take a look at those definitions
         return {
-            "audio": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)}),
+            "audio": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
             "log_s_list": NeuralType(),
             "log_det_W_list": NeuralType(),
         }

     def __init__(
-            self,
-            n_mel_channels: int = 80,
-            n_flows: int = 12,
-            n_group: int = 8,
-            n_early_every: int = 4,
-            n_early_size: int = 2,
-            n_wn_layers: int = 8,
-            n_wn_channels: int = 512,
-            wn_kernel_size: int = 3,
-            **kwargs):
+        self,
+        n_mel_channels: int = 80,
+        n_flows: int = 12,
+        n_group: int = 8,
+        n_early_every: int = 4,
+        n_early_size: int = 2,
+        n_wn_layers: int = 8,
+        n_wn_channels: int = 512,
+        wn_kernel_size: int = 3,
+        **kwargs
+    ):
         super().__init__(**kwargs)
         wavenet_config = {
             "n_layers": n_wn_layers,
             "n_channels": n_wn_channels,
-            "kernel_size": wn_kernel_size
+            "kernel_size": wn_kernel_size,
         }
         self.waveglow = WaveGlow(
             n_mel_channels=n_mel_channels,
@@ -115,7 +108,8 @@ def __init__(
             n_group=n_group,
             n_early_every=n_early_every,
             n_early_size=n_early_size,
-            WN_config=wavenet_config)
+            WN_config=wavenet_config,
+        )
         self.to(self._device)

     def forward(self, mel_spectrogram, audio):
@@ -123,11 +117,9 @@ def forward(self, mel_spectrogram, audio):
         # If training, it returns the predicted normal distribution
         # Else it returns the predicted audio
         if self.training:
-            audio, log_s_list, log_det_W_list = self.waveglow(
-                (mel_spectrogram, audio))
+            audio, log_s_list, log_det_W_list = self.waveglow((mel_spectrogram, audio))
         else:
-            audio = self.waveglow.infer(
-                mel_spectrogram)
+            audio = self.waveglow.infer(mel_spectrogram)
             log_s_list = log_det_W_list = []
         return audio, log_s_list, log_det_W_list

@@ -175,10 +167,9 @@ def input_ports(self):
             2: AxisType(TimeTag)
         """
         return {
-            "mel_spectrogram": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(MelSpectrogramSignalTag),
-                2: AxisType(TimeTag)})
+            "mel_spectrogram": NeuralType(
+                {0: AxisType(BatchTag), 1: AxisType(MelSpectrogramSignalTag), 2: AxisType(TimeTag),}
+            )
         }

     @property
@@ -190,27 +181,24 @@ def output_ports(self):
             1: AxisType(TimeTag)
         """
-        return {
-            "audio": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)})
-        }
+        return {"audio": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})}

     def __str__(self):
         return "WaveGlowNM"

     def __init__(
-            self,
-            n_mel_channels: int = 80,
-            n_flows: int = 12,
-            n_group: int = 8,
-            n_early_every: int = 4,
-            n_early_size: int = 2,
-            n_wn_layers: int = 8,
-            n_wn_channels: int = 512,
-            wn_kernel_size: int = 3,
-            sigma: float = 0.6,
-            **kwargs):
+        self,
+        n_mel_channels: int = 80,
+        n_flows: int = 12,
+        n_group: int = 8,
+        n_early_every: int = 4,
+        n_early_size: int = 2,
+        n_wn_layers: int = 8,
+        n_wn_channels: int = 512,
+        wn_kernel_size: int = 3,
+        sigma: float = 0.6,
+        **kwargs
+    ):
         self._sigma = sigma
         super().__init__(
             n_mel_channels=n_mel_channels,
@@ -221,7 +209,8 @@ def __init__(
             n_wn_layers=n_wn_layers,
             n_wn_channels=n_wn_channels,
             wn_kernel_size=wn_kernel_size,
-            **kwargs)
+            **kwargs
+        )
         self._removed_weight_norm = False

     def setup_denoiser(self):
@@ -229,16 +218,13 @@ def setup_denoiser(self):
         mel_input = torch.zeros((1, 80, 88), device=self._device)
         bias_audio = self.waveglow.infer(mel_input, sigma=0.0).float()
         bias_audio = bias_audio.squeeze().cpu().numpy()
-        bias_spec, _ = librosa.core.magphase(
-            librosa.core.stft(bias_audio, n_fft=1024))
+        bias_spec, _ = librosa.core.magphase(librosa.core.stft(bias_audio, n_fft=1024))
         self.bias_spec = np.expand_dims(bias_spec[:, 0], axis=-1)

     def denoise(self, audio, strength=0.1):
-        audio_spec, audio_angles = librosa.core.magphase(
-            librosa.core.stft(audio, n_fft=1024))
+        audio_spec, audio_angles = librosa.core.magphase(librosa.core.stft(audio, n_fft=1024))
         audio_spec_denoised = audio_spec - self.bias_spec * strength
-        audio_spec_denoised = np.clip(
-            audio_spec_denoised, a_min=0.0, a_max=None)
+        audio_spec_denoised = np.clip(audio_spec_denoised, a_min=0.0, a_max=None)
         audio_denoised = librosa.core.istft(audio_spec_denoised * audio_angles)
         return audio_denoised, audio_spec_denoised

@@ -248,8 +234,7 @@ def forward(self, mel_spectrogram):
             self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
             self._removed_weight_norm = True
         if self.training:
-            raise ValueError("You are using the WaveGlow Infer Neural Module "
-                             "in training mode.")
+            raise ValueError("You are using the WaveGlow Infer Neural Module " "in training mode.")
         with torch.no_grad():
             audio = self.waveglow.infer(mel_spectrogram, sigma=self._sigma)
         return audio
@@ -288,9 +273,7 @@ def input_ports(self):
         """
         # TODO @blisc: please take a look at those definitions
         return {
-            "z": NeuralType({
-                0: AxisType(BatchTag),
-                1: AxisType(TimeTag)}),
+            "z": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
             "log_s_list": NeuralType(),
             "log_det_W_list": NeuralType(),
         }
@@ -302,9 +285,7 @@ def output_ports(self):
         loss:
             NeuralType(None)
         """
-        return {
-            "loss": NeuralType(None)
-        }
+        return {"loss": NeuralType(None)}

     def __init__(self, sigma: float = 1.0, **kwargs):
         super().__init__(**kwargs)
@@ -322,6 +303,5 @@ def _loss(self, z, log_s_list, log_det_W_list):
log_s_total = log_s_total + torch.sum(log_s) log_det_W_total += log_det_W_list[i] - loss = (torch.sum(z * z) / (2 * self.sigma * self.sigma) - - log_s_total - log_det_W_total) + loss = torch.sum(z * z) / (2 * self.sigma * self.sigma) - log_s_total - log_det_W_total return loss / (z.size(0) * z.size(1) * z.size(2)) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 4695971c22de..606d5af1801a 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -1,11 +1,11 @@ # Copyright (c) 2019 NVIDIA Corporation -from abc import ABC, abstractmethod -from collections import namedtuple import glob import os import sys import time import warnings +from abc import ABC, abstractmethod +from collections import namedtuple import nemo @@ -50,8 +50,7 @@ def action(self, action_obj): @property def logger(self): - warnings.warn("This will be deprecated in future releases. Please use " - "nemo.logging instead") + warnings.warn("This will be deprecated in future releases. Please use " "nemo.logging instead") return nemo.logging def on_action_start(self): @@ -79,11 +78,9 @@ class ModuleSaverCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ - def __init__(self, - save_modules_list, - step_freq=1000, - folder=None, - checkpoints_to_keep=4): + def __init__( + self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, + ): super().__init__() self._save_modules_list = save_modules_list self._folder = folder @@ -94,11 +91,10 @@ def __init__(self, def on_iteration_end(self): step = self.step if ( - self._step_freq > 0 - and - step % self._step_freq == 0 - and step > 0 - and (self.global_rank is None or self.global_rank == 0) + self._step_freq > 0 + and step % self._step_freq == 0 + and step > 0 + and (self.global_rank is None or self.global_rank == 0) ): for m in self._save_modules_list: class_name = m.__class__.__name__ @@ -113,10 +109,10 @@ def on_iteration_end(self): nemo.logging.info("Saved.") self._saved_ckpts.append(f'-{self.step}.pt') if len(self._saved_ckpts) > self._ckpt2keep: - for end in self._saved_ckpts[:-self._ckpt2keep]: + for end in self._saved_ckpts[: -self._ckpt2keep]: for file in glob.glob(f'{self._folder}/*{end}'): os.remove(file) - self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep:] + self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] def on_action_end(self): step = self.step @@ -140,13 +136,9 @@ class SimpleLossLoggerCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ - def __init__(self, - tensors, - print_func=None, - get_tb_values=None, - log_to_tb_func=None, - step_freq=25, - tb_writer=None): + def __init__( + self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, + ): super().__init__() if not isinstance(tensors, list): @@ -200,10 +192,7 @@ def on_iteration_end(self): if self.global_rank is None or self.global_rank == 0: step = self.step if step % self._step_freq == 0: - tensor_values = [ - self.registered_tensors[t.unique_name] - for t in self.tensors - ] + tensor_values = [self.registered_tensors[t.unique_name] for t in self.tensors] nemo.logging.info(f"Step: {step}") if self._print_func: @@ -216,8 +205,7 @@ def on_iteration_end(self): value = value.item() self._swriter.add_scalar(name, value, step) if self._log_to_tb_func: - self._log_to_tb_func( - self._swriter, tensor_values, step) + self._log_to_tb_func(self._swriter, tensor_values, step) run_time = time.time() - self._last_iter_start 
self._swriter.add_scalar('misc/step_time', run_time, step) run_time = time.time() - self._last_iter_start @@ -230,27 +218,21 @@ class CheckpointCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ - def __init__(self, folder, load_from_folder=None, step_freq=-1, - epoch_freq=-1, checkpoints_to_keep=4, force_load=False): + def __init__( + self, folder, load_from_folder=None, step_freq=-1, epoch_freq=-1, checkpoints_to_keep=4, force_load=False, + ): super().__init__() if step_freq == -1 and epoch_freq == -1: - nemo.logging.warning( - "No checkpoints will be saved because step_freq and " - "epoch_freq are both -1." - ) + nemo.logging.warning("No checkpoints will be saved because step_freq and " "epoch_freq are both -1.") if step_freq > -1 and epoch_freq > -1: - nemo.logging.warning( - "You config the model to save by both steps and epochs. " - "Save by step_freq only" - ) + nemo.logging.warning("You configured the model to save by both steps and epochs. " "Saving by step_freq only.") epoch_freq = -1 self._step_freq = step_freq self._epoch_freq = epoch_freq self._folder = folder - self._load_from_folder = load_from_folder if load_from_folder \ else folder + self._load_from_folder = load_from_folder if load_from_folder else folder self._ckpt2keep = checkpoints_to_keep self._saved_ckpts = [] # If True, run will fail if we cannot load module weights @@ -268,7 +250,8 @@ def __save_to(self, path): if str(module) in unique_mod_names: raise NotImplementedError( "There were two instances of the same module. Please " - "overwrite __str__() of one of the modules.") + "overwrite __str__() of one of the modules." + ) unique_mod_names.add(str(module)) if self._step_freq > -1: filename = f"{module}-STEP-{self.step}.pt" @@ -286,17 +269,18 @@ def __save_to(self, path): self._saved_ckpts.append(f'-{self.epoch_num}.pt') if len(self._saved_ckpts) > self._ckpt2keep: - for end in self._saved_ckpts[:-self._ckpt2keep]: + for end in self._saved_ckpts[: -self._ckpt2keep]: for file in glob.glob(f'{path}/*{end}'): os.remove(file) - self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep:] + self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] nemo.logging.info(f'Saved checkpoint: {path}/{filename}') def __restore_from(self, path): if not os.path.isdir(path): if self._force_load: - raise ValueError("force_load was set to True for checkpoint " - "callback but a checkpoint was not found.") + raise ValueError( + "force_load was set to True for checkpoint " "callback but a checkpoint was not found." + ) nemo.logging.warning(f"Checkpoint folder {path} not found!") else: nemo.logging.info(f"Restoring checkpoint from folder {path} ...") @@ -307,26 +291,21 @@ def __restore_from(self, path): modules_to_restore.append(module) modules_to_restore_name.append(str(module)) try: - module_checkpoints = get_checkpoint_from_dir( - modules_to_restore_name, path - ) + module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) - for mod, checkpoint in zip(modules_to_restore, - module_checkpoints): + for mod, checkpoint in zip(modules_to_restore, module_checkpoints): mod.restore_from(checkpoint, self.local_rank) except (BaseException, ValueError) as e: if self._force_load: raise ValueError( - "force_load was set to True for checkpoint callback" - "but a checkpoint was not found.") + "force_load was set to True for checkpoint callback " "but a checkpoint was not found." 
+ ) nemo.logging.warning(e) - nemo.logging.warning( - f"Checkpoint folder {path} present but did not restore") + nemo.logging.warning(f"Checkpoint folder {path} present but did not restore") return try: - trainer_checkpoints = get_checkpoint_from_dir( - ["trainer"], path) + trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) for tr, checkpoint in zip([self.action], trainer_checkpoints): tr.restore_state_from(checkpoint) except (BaseException, ValueError) as e: @@ -342,11 +321,11 @@ def on_action_start(self): if str(module) in unique_mod_names: raise NotImplementedError( "There were two instances of the same module. Please " - "overwrite __str__() of one of the modules.") + "overwrite __str__() of one of the modules." + ) unique_mod_names.add(str(module)) num_parameters += module.num_weights - nemo.logging.info(f"Found {len(unique_mod_names)} modules with " - f"weights:") + nemo.logging.info(f"Found {len(unique_mod_names)} modules with " f"weights:") for name in unique_mod_names: nemo.logging.info(f"{name}") nemo.logging.info(f"Total model parameters: {num_parameters}") @@ -368,8 +347,7 @@ def on_epoch_end(self): if self._epoch_freq > 0: if self.global_rank is None or self.global_rank == 0: run_time = time.time() - self._last_epoch_start - nemo.logging.info( - f'Finished epoch {self.epoch_num} in {run_time}') + nemo.logging.info(f'Finished epoch {self.epoch_num} in {run_time}') if (self.epoch_num + 1) % self._epoch_freq == 0: self.__save_to(path=self._folder) @@ -381,26 +359,20 @@ class EvaluatorCallback(ActionCallback): """ def __init__( - self, - eval_tensors, - user_iter_callback, - user_epochs_done_callback, - tb_writer=None, - tb_writer_func=None, - eval_step=1, - eval_epoch=None, + self, + eval_tensors, + user_iter_callback, + user_epochs_done_callback, + tb_writer=None, + tb_writer_func=None, + eval_step=1, + eval_epoch=None, ): # TODO: Eval_epoch currently does nothing if eval_step is None and eval_epoch is None: - raise ValueError("Either eval_step or eval_epoch must be set. " - f"But got: {eval_step} and {eval_epoch}") - if (eval_step is not None and eval_step <= 0) or ( - eval_epoch is not None and eval_epoch <= 0 - ): - raise ValueError( - f"Eval_step and eval_epoch must be > 0." - f"But got: {eval_step} and {eval_epoch}" - ) + raise ValueError("Either eval_step or eval_epoch must be set. " f"But got: {eval_step} and {eval_epoch}") + if (eval_step is not None and eval_step <= 0) or (eval_epoch is not None and eval_epoch <= 0): + raise ValueError(f"Eval_step and eval_epoch must be > 0." 
f"But got: {eval_step} and {eval_epoch}") super().__init__() self._eval_tensors = eval_tensors self._swriter = tb_writer @@ -489,13 +461,13 @@ class _Method(ABC): """ Classes inherited from _Method are used for ValueSetterCallback below """ + @abstractmethod def __call__(self, step, total_steps): pass class _Const(_Method): - def __init__(self, value): super().__init__() @@ -506,7 +478,6 @@ def __call__(self, step, total_steps): class _Linear(_Method): - def __init__(self, a, b): super().__init__() self.a, self.b = a, b @@ -523,14 +494,12 @@ class ValueSetterCallback(ActionCallback): Policy = _Policy Method = _Method - def __init__(self, module, arg_name, - policies=None, total_steps=None, tb_writer=None): + def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): super().__init__() if policies is None: initial_value = getattr(module, arg_name) - policies = [_Policy(method=Const(initial_value), - start=0.0, end=1.0)] + policies = [_Policy(method=Const(initial_value), start=0.0, end=1.0)] new_policies = [] for p in policies: @@ -570,7 +539,6 @@ def on_iteration_start(self): class UnfreezeCallback(ActionCallback): - def __init__(self, modules, start_epoch=0): super().__init__() diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 25c37c2bd5e3..528ef68455a2 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -1,28 +1,32 @@ # Copyright (c) 2019 NVIDIA Corporation -__all__ = ['Backend', - 'ModelMode', - 'Optimization', - 'DeviceType', - 'Actions', - 'NeuralModuleFactory', - 'DeploymentFormat'] +__all__ = [ + 'Backend', + 'ModelMode', + 'Optimization', + 'DeviceType', + 'Actions', + 'NeuralModuleFactory', + 'DeploymentFormat', +] -from abc import ABC, abstractmethod import random -from typing import List, Optional import warnings - +from abc import ABC, abstractmethod from enum import Enum +from typing import List, Optional + import numpy as np import nemo + +from ..utils import ExpManager from .callbacks import ActionCallback, EvaluatorCallback from .neural_types import * -from ..utils import ExpManager class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" + AUTO = 0 PYTORCH = 1 TORCHSCRIPT = 2 @@ -64,11 +68,7 @@ class DeviceType(Enum): class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" - def __init__( - self, - local_rank, - global_rank, - optimization_level=Optimization.mxprO0): + def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxprO0): self._local_rank = local_rank self._global_rank = global_rank self._optim_level = optimization_level @@ -95,12 +95,12 @@ def global_rank(self): @abstractmethod def train( - self, - tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[ActionCallback]], - lr_policy=None, - batches_per_step=None, - stop_on_nan_loss=False + self, + tensors_to_optimize: List[NmTensor], + callbacks: Optional[List[ActionCallback]], + lr_policy=None, + batches_per_step=None, + stop_on_nan_loss=False, ): """This action executes training and (optionally) evaluation. @@ -162,11 +162,7 @@ def restore_state_from(self, path: str): pass @abstractmethod - def create_optimizer( - self, - optimizer, - things_to_optimize, - optimizer_params): + def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): """ Creates an optimizer object to be use in the train() method. 
@@ -184,55 +180,45 @@ def create_optimizer( def _perform_on_iteration_start(self, callbacks): # TODO: Most of these checks can be relaxed since we enforce callbacks # to be a list of ActionCallback objects - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.on_iteration_start() def _perform_on_iteration_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.on_iteration_end() def _perform_on_action_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.on_action_start() def _perform_on_action_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.on_action_end() def _perform_on_epoch_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.on_epoch_start() def _perform_on_epoch_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.on_epoch_end() def _init_callbacks(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback.action = self def _update_callbacks( - self, - callbacks=None, - registered_tensors=None, + self, callbacks=None, registered_tensors=None, ): # if self.local_rank is None or self.local_rank == 0: - if callbacks is not None and isinstance(callbacks, List) and len( - callbacks) > 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: callback._registered_tensors = registered_tensors @@ -274,20 +260,20 @@ class NeuralModuleFactory(object): """ def __init__( - self, - backend=Backend.PyTorch, - local_rank=None, - optimization_level=Optimization.mxprO0, - placement=None, - cudnn_benchmark=False, - random_seed=None, - set_default=True, - log_dir=None, - checkpoint_dir=None, - tensorboard_dir=None, - create_tb_writer=False, - files_to_copy=None, - add_time_to_log_dir=False + self, + backend=Backend.PyTorch, + local_rank=None, + optimization_level=Optimization.mxprO0, + placement=None, + cudnn_benchmark=False, + random_seed=None, + set_default=True, + log_dir=None, + checkpoint_dir=None, + tensorboard_dir=None, + create_tb_writer=False, + files_to_copy=None, + add_time_to_log_dir=False, ): self._local_rank = local_rank self._global_rank = None @@ -312,20 +298,22 @@ def __init__( if backend == Backend.PyTorch: # TODO: Move all framework specific code from this file import torch + if self._placement != DeviceType.CPU: if not torch.cuda.is_available(): - raise ValueError("You requested to use GPUs but CUDA is " - "not installed. 
You can try running using" - " CPU-only. To do this, instantiate your" - " factory with placement=DeviceType.CPU" - "\n" - "Note that this is slow and is not " - "well supported.") + raise ValueError( + "You requested to use GPUs but CUDA is " + "not installed. You can try running using" + " CPU-only. To do this, instantiate your" + " factory with placement=DeviceType.CPU" + "\n" + "Note that this is slow and is not " + "well supported." + ) torch.backends.cudnn.benchmark = cudnn_benchmark if random_seed is not None and cudnn_benchmark: - raise ValueError("cudnn_benchmark can not be set to True" - "when random_seed is not None.") + raise ValueError("cudnn_benchmark can not be set to True" "when random_seed is not None.") if random_seed is not None: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False @@ -334,9 +322,7 @@ def __init__( random.seed(random_seed) if self._local_rank is not None: - torch.distributed.init_process_group( - backend="nccl", init_method="env://" - ) + torch.distributed.init_process_group(backend="nccl", init_method="env://") cuda_set = True # Try to set cuda device. This should fail if self._local_rank @@ -353,13 +339,13 @@ def __init__( # Do an all_reduce to ensure all workers obtained a GPU # For the strangest reason, BAND doesn't work so I am resorting # to MIN. - torch.distributed.all_reduce( - cuda_set_t, op=torch.distributed.ReduceOp.MIN) + torch.distributed.all_reduce(cuda_set_t, op=torch.distributed.ReduceOp.MIN) if cuda_set_t.item() == 0: raise RuntimeError( "There was an error initializing distributed training." " Perhaps you specified more gpus than you have " - "available") + "available" + ) del cuda_set_t torch.cuda.empty_cache() @@ -374,11 +360,9 @@ def torch_broadcast_wrapper(str_len=None, string=None, src=0): """ # Create byte cuda torch tensor if string is not None: - string_tensor = torch.tensor( - list(string.encode()), dtype=torch.uint8).cuda() + string_tensor = torch.tensor(list(string.encode()), dtype=torch.uint8).cuda() else: - string_tensor = torch.tensor( - [0] * str_len, dtype=torch.uint8).cuda() + string_tensor = torch.tensor([0] * str_len, dtype=torch.uint8).cuda() # Run broadcast torch.distributed.broadcast(string_tensor, src) # turn byte tensor back to string @@ -388,8 +372,7 @@ def torch_broadcast_wrapper(str_len=None, string=None, src=0): broadcast_func = torch_broadcast_wrapper else: - raise NotImplementedError( - "Only Pytorch backend is currently supported.") + raise NotImplementedError("Only Pytorch backend is currently supported.") # Create ExpManager # if log_dir is None, only create logger @@ -403,7 +386,8 @@ def torch_broadcast_wrapper(str_len=None, string=None, src=0): files_to_copy=files_to_copy, add_time=add_time_to_log_dir, exist_ok=True, - broadcast_func=broadcast_func) + broadcast_func=broadcast_func, + ) self._tb_writer = self._exp_manager.tb_writer # Create trainer @@ -436,38 +420,25 @@ def __name_import(name): def __get_pytorch_module(self, name, params, collection, pretrained): params["factory"] = self - if collection == "toys" or collection == "tutorials" or collection \ - == "other": - constructor = NeuralModuleFactory.__name_import( - "nemo.backends.pytorch.tutorials." + name - ) + if collection == "toys" or collection == "tutorials" or collection == "other": + constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.tutorials." + name) elif collection == "nemo_nlp": - constructor = NeuralModuleFactory.__name_import( - "nemo_nlp." 
+ name - ) + constructor = NeuralModuleFactory.__name_import("nemo_nlp." + name) if name == "BERT" and pretrained is True: params["pretrained"] = True elif collection == "nemo_asr": - constructor = NeuralModuleFactory.__name_import( - "nemo_asr." + name - ) + constructor = NeuralModuleFactory.__name_import("nemo_asr." + name) elif collection == "nemo_lpr": - constructor = NeuralModuleFactory.__name_import( - "nemo_lpr." + name - ) + constructor = NeuralModuleFactory.__name_import("nemo_lpr." + name) elif collection == 'common': - constructor = NeuralModuleFactory.__name_import( - 'nemo.backends.pytorch.common.' + name - ) + constructor = NeuralModuleFactory.__name_import('nemo.backends.pytorch.common.' + name) elif collection == "torchvision": import torchvision.models as tv_models import nemo.backends.pytorch.module_wrapper as mw import torch.nn as nn if name == "ImageFolderDataLayer": - constructor = NeuralModuleFactory.__name_import( - "nemo.backends.pytorch.torchvision.data." + name - ) + constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.torchvision.data." + name) instance = constructor(**params) return instance else: @@ -483,21 +454,14 @@ def __get_pytorch_module(self, name, params, collection, pretrained): } ) } - output_ports = { - "output": NeuralType( - {0: AxisType(BatchTag), 1: AxisType(ChannelTag)} - ) - } + output_ports = {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} pt_model = tv_models.resnet18(pretrained=pretrained) num_classes = params.get("num_classes", None) if num_classes is not None: pt_model.fc = nn.Linear(512, params["num_classes"]) return mw.TrainableNeuralModuleWrapper( - pt_nn_module=pt_model, - input_ports_dict=input_ports, - output_ports_dict=output_ports, - **params, + pt_nn_module=pt_model, input_ports_dict=input_ports, output_ports_dict=output_ports, **params, ) elif _nm_name == "resnet50": input_ports = { @@ -510,21 +474,14 @@ def __get_pytorch_module(self, name, params, collection, pretrained): } ) } - output_ports = { - "output": NeuralType( - {0: AxisType(BatchTag), 1: AxisType(ChannelTag)} - ) - } + output_ports = {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} pt_model = tv_models.resnet50(pretrained=pretrained) num_classes = params.get("num_classes", None) if num_classes is not None: pt_model.fc = nn.Linear(2048, params["num_classes"]) return mw.TrainableNeuralModuleWrapper( - pt_nn_module=pt_model, - input_ports_dict=input_ports, - output_ports_dict=output_ports, - **params, + pt_nn_module=pt_model, input_ports_dict=input_ports, output_ports_dict=output_ports, **params, ) else: collection_path = "nemo.collections." + collection + "." + name @@ -559,10 +516,7 @@ def get_module(self, name, params, collection, pretrained=False): "Module's {0} requested optimization level {1} is" "different from the one specified by factory - {2}." 
"Using: {3} for this module".format( - name, - params["optimization_level"], - self._optim_level, - params["optimization_level"], + name, params["optimization_level"], self._optim_level, params["optimization_level"], ) ) else: @@ -571,35 +525,30 @@ def get_module(self, name, params, collection, pretrained=False): params["optimization_level"] = self._optim_level if self._backend == Backend.PyTorch: - return self.__get_pytorch_module( - name=name, params=params, collection=collection, - pretrained=pretrained - ) + return self.__get_pytorch_module(name=name, params=params, collection=collection, pretrained=pretrained,) else: return None - def create_optimizer(self, - optimizer, - things_to_optimize, - optimizer_params): + def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): return self._trainer.create_optimizer( - optimizer=optimizer, - things_to_optimize=things_to_optimize, - optimizer_params=optimizer_params) - - def train(self, - tensors_to_optimize, - optimizer=None, - optimization_params=None, - callbacks: Optional[List[ActionCallback]] = None, - lr_policy=None, - batches_per_step=None, - stop_on_nan_loss=False, - synced_batchnorm=False, - synced_batchnorm_groupsize=0, - gradient_predivide=False, - amp_max_loss_scale=2.**24, - reset=False): + optimizer=optimizer, things_to_optimize=things_to_optimize, optimizer_params=optimizer_params, + ) + + def train( + self, + tensors_to_optimize, + optimizer=None, + optimization_params=None, + callbacks: Optional[List[ActionCallback]] = None, + lr_policy=None, + batches_per_step=None, + stop_on_nan_loss=False, + synced_batchnorm=False, + synced_batchnorm_groupsize=0, + gradient_predivide=False, + amp_max_loss_scale=2.0 ** 24, + reset=False, + ): if reset: self.reset_trainer() return self._trainer.train( @@ -613,30 +562,22 @@ def train(self, synced_batchnorm=synced_batchnorm, synced_batchnorm_groupsize=synced_batchnorm_groupsize, gradient_predivide=gradient_predivide, - amp_max_loss_scale=amp_max_loss_scale) + amp_max_loss_scale=amp_max_loss_scale, + ) - def eval(self, - callbacks: List[EvaluatorCallback]): + def eval(self, callbacks: List[EvaluatorCallback]): if callbacks is None or len(callbacks) == 0: - raise ValueError(f"You need to provide at lease one evaluation" - f"callback to eval") + raise ValueError(f"You need to provide at lease one evaluation" f"callback to eval") for callback in callbacks: if not isinstance(callback, EvaluatorCallback): - raise TypeError(f"All callbacks passed to the eval action must" - f"be inherited from EvaluatorCallback") + raise TypeError(f"All callbacks passed to the eval action must" f"be inherited from EvaluatorCallback") self.train( - tensors_to_optimize=None, - optimizer='sgd', - callbacks=callbacks, - optimization_params={'num_epochs': 1} + tensors_to_optimize=None, optimizer='sgd', callbacks=callbacks, optimization_params={'num_epochs': 1}, ) - def deployment_export(self, - module, - output: str, - d_format: DeploymentFormat, - input_example=None, - output_example=None): + def deployment_export( + self, module, output: str, d_format: DeploymentFormat, input_example=None, output_example=None, + ): """Exports Neural Module instance for deployment. 
Args: @@ -668,18 +609,20 @@ def deployment_export(self, output=output, d_format=d_format, input_example=input_example, - output_example=output_example + output_example=output_example, ) - def infer(self, - tensors: List[NmTensor], - checkpoint_dir=None, - ckpt_pattern='', - verbose=True, - cache=False, - use_cache=False, - offload_to_cpu=True, - modules_to_restore=None): + def infer( + self, + tensors: List[NmTensor], + checkpoint_dir=None, + ckpt_pattern='', + verbose=True, + cache=False, + use_cache=False, + offload_to_cpu=True, + modules_to_restore=None, + ): """Runs inference to obtain values for tensors Args: @@ -716,7 +659,8 @@ def infer(self, cache=cache, use_cache=use_cache, offload_to_cpu=offload_to_cpu, - modules_to_restore=modules_to_restore) + modules_to_restore=modules_to_restore, + ) def clear_cache(self): """Helper function to clean inference cache.""" @@ -724,14 +668,13 @@ def clear_cache(self): def _get_trainer(self, tb_writer=None): if self._backend == Backend.PyTorch: - constructor = NeuralModuleFactory.__name_import( - "nemo.backends.pytorch.PtActions" - ) + constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.PtActions") instance = constructor( local_rank=self._local_rank, global_rank=self._global_rank, tb_writer=tb_writer, - optimization_level=self._optim_level) + optimization_level=self._optim_level, + ) return instance else: raise ValueError("Only PyTorch backend is currently supported.") @@ -742,11 +685,13 @@ def get_trainer(self, tb_writer=None): f"in future versions of NeMo." f"Please use .train(...), .eval(...), .infer(...) and " f".create_optimizer(...) directly methods from " - f"NeuralModuleFactory instance.") + f"NeuralModuleFactory instance." + ) if self._trainer: nemo.logging.warning( "The trainer instance was created during initialization of " - "Neural factory, using the already created instance.") + "Neural factory, using the already created instance." + ) return self._trainer return self._get_trainer(tb_writer) @@ -766,21 +711,21 @@ def sync_all_processes(self, status=True): message on its own and exit """ if self._world_size == 1: - nemo.logging.info("sync_all_processes does nothing if there is " - "one process") + nemo.logging.info("sync_all_processes does nothing if there is " "one process") return if self._backend == Backend.PyTorch: import torch + status_tensor = torch.cuda.IntTensor([status]) - torch.distributed.all_reduce( - status_tensor, op=torch.distributed.ReduceOp.MIN) + torch.distributed.all_reduce(status_tensor, op=torch.distributed.ReduceOp.MIN) if status_tensor.item() == 0: nemo.logging.error("At least one process had a failure") if status: raise ValueError( f"Process with global rank {self._global_rank} entered" " sync_all_processes with a passing status, but " - "another process indicated a failure") + "another process indicated a failure" + ) @property def world_size(self): @@ -800,8 +745,7 @@ def optim_level(self): @property def logger(self): - warnings.warn("This will be deprecated in future releases. Please use " - "nemo.logging instead") + warnings.warn("This will be deprecated in future releases. 
Please use " "nemo.logging instead") return nemo.logging @property diff --git a/nemo/core/neural_modules.py b/nemo/core/neural_modules.py index 9ef78d52c2e3..16001c2dc682 100644 --- a/nemo/core/neural_modules.py +++ b/nemo/core/neural_modules.py @@ -1,38 +1,40 @@ # Copyright (c) 2019 NVIDIA Corporation """This file contains NeuralModule and NmTensor classes.""" -__all__ = ['WeightShareTransform', - 'NeuralModule'] +__all__ = ['WeightShareTransform', 'NeuralModule'] +import collections +import uuid from abc import ABC, abstractmethod from collections import namedtuple from enum import Enum from inspect import getargvalues, stack -from typing import Optional, Dict, Set, Tuple, List -import uuid -import collections - +from typing import Dict, List, Optional, Set, Tuple from nemo.core import NeuralModuleFactory from nemo.utils.decorators.deprecated import deprecated -from .neural_factory import Optimization, DeviceType -from .neural_types import (CanNotInferResultNeuralType, - NeuralType, NeuralTypeComparisonResult, - NeuralPortNameMismatchError, - NeuralPortNmTensorMismatchError, - NmTensor) +from .neural_factory import DeviceType, Optimization +from .neural_types import ( + CanNotInferResultNeuralType, + NeuralPortNameMismatchError, + NeuralPortNmTensorMismatchError, + NeuralType, + NeuralTypeComparisonResult, + NmTensor, +) class WeightShareTransform(Enum): """When sharing parameters, what kind of transform to apply.""" + SAME = 0 TRANSPOSE = 1 -PretrainedModelInfo = namedtuple("PretrainedModleInfo", - ("pretrained_model_name", "description", - "parameters", "location")) +PretrainedModelInfo = namedtuple( + "PretrainedModleInfo", ("pretrained_model_name", "description", "parameters", "location"), +) class NeuralModule(ABC): @@ -50,11 +52,7 @@ class NeuralModule(ABC): """ def __init__( - self, *, - pretrained_model_name=None, - factory=None, - placement=None, - **kwargs + self, *, pretrained_model_name=None, factory=None, placement=None, **kwargs, ): self._pretrained_model_name = pretrained_model_name self._local_parameters = self.update_local_params() @@ -64,10 +62,8 @@ def __init__( factory = default_factory # Set module properties from factory else use defaults - self._placement = factory.placement if factory is not None\ - else DeviceType.GPU - self._opt_level = factory.optim_level if factory is not None\ - else Optimization.mxprO0 + self._placement = factory.placement if factory is not None else DeviceType.GPU + self._opt_level = factory.optim_level if factory is not None else Optimization.mxprO0 # Update module properties using overrides if overrides exist if placement is not None: @@ -89,7 +85,8 @@ def create_ports(**kwargs): """ Deprecated method, to be remoted in the next release.""" raise Exception( 'Deprecated method. 
Please implement ``inputs`` and ``outputs`` \ - properties to define module ports instead') + properties to define module ports instead' + ) @property @abstractmethod @@ -143,9 +140,7 @@ def __call__(self, **kwargs): input_nmtensors_are_of_same_type = True for port_name, tgv in kwargs.items(): if port_name not in input_port_defs.keys(): - raise NeuralPortNameMismatchError( - "Wrong input port name: {0}".format(port_name) - ) + raise NeuralPortNameMismatchError("Wrong input port name: {0}".format(port_name)) type_comatibility = input_port_defs[port_name].compare(tgv) @@ -155,23 +150,19 @@ def __call__(self, **kwargs): if first_input_nmtensor_type._axis2type is None: input_nmtensors_are_of_same_type = True else: - input_nmtensors_are_of_same_type = \ - first_input_nmtensor_type.compare(tgv) \ - == NeuralTypeComparisonResult.SAME and \ - len(first_input_nmtensor_type._axis2type) - if not (type_comatibility == NeuralTypeComparisonResult.SAME or - type_comatibility == NeuralTypeComparisonResult.GREATER): + input_nmtensors_are_of_same_type = first_input_nmtensor_type.compare( + tgv + ) == NeuralTypeComparisonResult.SAME and len(first_input_nmtensor_type._axis2type) + if not ( + type_comatibility == NeuralTypeComparisonResult.SAME + or type_comatibility == NeuralTypeComparisonResult.GREATER + ): raise NeuralPortNmTensorMismatchError( "\n\nIn {0}. \n" "Port: {1} and a NmTensor it was fed are \n" "of incompatible neural types:\n\n{2} \n\n and \n\n{3}" - "\n\nType comparison result: {4}" - .format( - self.__class__.__name__, - port_name, - input_port_defs[port_name], - tgv, - type_comatibility + "\n\nType comparison result: {4}".format( + self.__class__.__name__, port_name, input_port_defs[port_name], tgv, type_comatibility, ) ) if type_comatibility == NeuralTypeComparisonResult.LESS: @@ -185,14 +176,9 @@ def __call__(self, **kwargs): out_type = first_input_nmtensor_type else: raise CanNotInferResultNeuralType( - "Can't infer output neural type." - "Likely your inputs are of " - "different type." + "Can't infer output neural type." "Likely your inputs are of " "different type." ) - return NmTensor( - producer=self, producer_args=kwargs, name=out_name, - ntype=out_type - ) + return NmTensor(producer=self, producer_args=kwargs, name=out_name, ntype=out_type,) else: result = [] for out_port, n_type in output_port_defs.items(): @@ -202,26 +188,14 @@ def __call__(self, **kwargs): out_type = first_input_nmtensor_type else: raise CanNotInferResultNeuralType( - "Can't infer output neural type." - "Likely your inputs are of " - "different type." + "Can't infer output neural type." "Likely your inputs are of " "different type." ) - result.append( - NmTensor( - producer=self, - producer_args=kwargs, - name=out_port, - ntype=out_type, - ) - ) + result.append(NmTensor(producer=self, producer_args=kwargs, name=out_port, ntype=out_type,)) # Creating ad-hoc class for returning from module's forward pass. output_class_name = f'{self.__class__.__name__}Output' field_names = list(output_port_defs) - result_type = collections.namedtuple( - typename=output_class_name, - field_names=field_names, - ) + result_type = collections.namedtuple(typename=output_class_name, field_names=field_names,) # Tie tuple of output tensors with corresponding names. 
result = result_type(*result) @@ -241,10 +215,9 @@ def get_weights(self) -> Optional[Dict[(str, bool)]]: @abstractmethod def set_weights( - self, - name2weight: Dict[(str, Tuple[str, bool])], - name2name_and_transform: Dict[ - (str, Tuple[str, WeightShareTransform])] = None, + self, + name2weight: Dict[(str, Tuple[str, bool])], + name2name_and_transform: Dict[(str, Tuple[str, WeightShareTransform])] = None, ): """Sets weight from given values. For every named weight in name2weight, @@ -288,11 +261,10 @@ def get_config_dict_and_checkpoint(self, pretrained_model_name): @abstractmethod def tie_weights_with( - self, - module, - weight_names=List[str], - name2name_and_transform: Dict[ - (str, Tuple[str, WeightShareTransform])] = None, + self, + module, + weight_names=List[str], + name2name_and_transform: Dict[(str, Tuple[str, WeightShareTransform])] = None, ): """Ties weights between self and module. For every weight name in weight_names, if weight with the same name is found in self, it will @@ -438,11 +410,9 @@ def update_local_params(): for frame in stack()[1:]: posname, kwname, localvars = getargvalues(frame[0])[-3:] # Check if caller is a Neural Module - if ("self" in localvars and - isinstance(localvars["self"], NeuralModule)): + if "self" in localvars and isinstance(localvars["self"], NeuralModule): if posname is not None: - raise ValueError("NeuralModules cannot accept `*` " - "positional arguments.") + raise ValueError("NeuralModules cannot accept `*` " "positional arguments.") # Get func arg dict localvars.update(localvars.pop(kwname, [])) del localvars["self"] diff --git a/nemo/core/neural_types.py b/nemo/core/neural_types.py index 35ee9f5c5d99..38b606fc5b9e 100644 --- a/nemo/core/neural_types.py +++ b/nemo/core/neural_types.py @@ -6,32 +6,34 @@ An exception will be raised when a NmTensor and input port where it goes are of incompatible types. """ -__all__ = ['BaseTag', - 'BatchTag', - 'TimeTag', - 'ProcessedTimeTag', - 'ChannelTag', - 'EmbeddedTextTag', - 'SpectrogramSignalTag', - 'MelSpectrogramSignalTag', - 'MFCCSignalTag', - 'EncodedRepresentationTag', - 'ClassTag', - 'WidthTag', - 'HeightTag', - 'CategoricalTag', - 'RegressionTag', - 'NeuralTypeComparisonResult', - 'AxisType', - 'NeuralType', - 'NmTensor', - 'NeuralTypeError', - 'NeuralPortNameMismatchError', - 'NeuralPortNmTensorMismatchError', - 'CanNotInferResultNeuralType'] +__all__ = [ + 'BaseTag', + 'BatchTag', + 'TimeTag', + 'ProcessedTimeTag', + 'ChannelTag', + 'EmbeddedTextTag', + 'SpectrogramSignalTag', + 'MelSpectrogramSignalTag', + 'MFCCSignalTag', + 'EncodedRepresentationTag', + 'ClassTag', + 'WidthTag', + 'HeightTag', + 'CategoricalTag', + 'RegressionTag', + 'NeuralTypeComparisonResult', + 'AxisType', + 'NeuralType', + 'NmTensor', + 'NeuralTypeError', + 'NeuralPortNameMismatchError', + 'NeuralPortNmTensorMismatchError', + 'CanNotInferResultNeuralType', +] -from enum import Enum import uuid +from enum import Enum class BaseTag(object): @@ -155,9 +157,7 @@ class NeuralTypeComparisonResult(Enum): GREATER = 2 # B is A DIM_INCOMPATIBLE = 3 # Resize connector might fix incompatibility TRANSPOSE_SAME = 4 # A transpose will make them same - INCOMPATIBLE = ( - 5 - ) # A and B are incompatible. Can't fix incompatibility automatically + INCOMPATIBLE = 5 # A and B are incompatible. 
Can't fix incompatibility automatically class AxisType(object): @@ -165,18 +165,13 @@ class AxisType(object): It's semantics is a Neural Tag (inherited from BaseTag) dimension (dim) is (optional) int and descriptor is (optional) string""" - def __init__(self, semantics, dim: int = None, - descriptor: str = None): + def __init__(self, semantics, dim: int = None, descriptor: str = None): self._semantics = semantics self._dim = dim self._descriptor = descriptor def __eq__(self, other): - return ( - self.semantics == other.semantics - and self.dim == other.dim - and self.descriptor == other.descriptor - ) + return self.semantics == other.semantics and self.dim == other.dim and self.descriptor == other.descriptor def __str__(self): return "{0}:{1}:{2}".format(self.semantics, self.dim, self.descriptor) @@ -194,9 +189,7 @@ def compare_to(self, other): Returns: Results of a comparison (NeuralTypeComparisonResult) """ - if ( - self.dim is None or self.dim == other.dim - ) and self.descriptor == other.descriptor: + if (self.dim is None or self.dim == other.dim) and self.descriptor == other.descriptor: if self.semantics == other.semantics: return NeuralTypeComparisonResult.SAME elif issubclass(self.semantics, other.semantics): @@ -205,8 +198,7 @@ def compare_to(self, other): return NeuralTypeComparisonResult.GREATER else: return NeuralTypeComparisonResult.INCOMPATIBLE - elif self.descriptor == other.descriptor and self.semantics == \ - other.semantics: + elif self.descriptor == other.descriptor and self.semantics == other.semantics: return NeuralTypeComparisonResult.DIM_INCOMPATIBLE else: return NeuralTypeComparisonResult.INCOMPATIBLE @@ -275,16 +267,13 @@ def __init__(self, axis2type={}, optional=False): def __str__(self): if self._axis2type is None: - return "(Optional) " if self._optional else "" + "non-tensor " \ - "object" + return "(Optional) " if self._optional else "" + "non-tensor " "object" elif len(self._axis2type) == 0: return "(Optional) " if self._optional else "" + "Root NeuralType" return ( "(Optional)" if self._optional - else "" - + "\n".join(["{0}->{1}".format(axis, tag) for axis, tag in - self._axis2type.items()]) + else "" + "\n".join(["{0}->{1}".format(axis, tag) for axis, tag in self._axis2type.items()]) ) def compare(self, n_type2) -> NeuralTypeComparisonResult: @@ -312,10 +301,9 @@ def compare(self, n_type2) -> NeuralTypeComparisonResult: elif self._axis2type == n_type2._axis2type: return NeuralTypeComparisonResult.SAME # same set of keys and set of values => TRANSPOSE_SAME - elif set(self._axis2type.keys()) == set( - n_type2._axis2type.keys()) and set( - self._axis2type.values() - ) == set(n_type2._axis2type.values()): + elif set(self._axis2type.keys()) == set(n_type2._axis2type.keys()) and set(self._axis2type.values()) == set( + n_type2._axis2type.values() + ): return NeuralTypeComparisonResult.TRANSPOSE_SAME elif set(self._axis2type.keys()) == set(n_type2._axis2type.keys()): @@ -323,9 +311,7 @@ def compare(self, n_type2) -> NeuralTypeComparisonResult: comparison_result = 0 for key in self._axis2type.keys(): comparison_result = max( - self._axis2type[key].compare_to( - n_type2._axis2type[key]).value, - comparison_result, + self._axis2type[key].compare_to(n_type2._axis2type[key]).value, comparison_result, ) return NeuralTypeComparisonResult(comparison_result) else: diff --git a/nemo/package_info.py b/nemo/package_info.py index f69f40dc5cdc..e589578de875 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -34,5 +34,4 @@ __download_url__ = 
'https://github.com/NVIDIA/NeMo/releases' __description__ = 'NEMO core package. Necessary for all collections' __license__ = 'Apache2' -__keywords__ = 'deep learning, machine learning, gpu, NLP, NeMo, nvidia, ' \ - 'pytorch, torch, tts, speech, language ' +__keywords__ = 'deep learning, machine learning, gpu, NLP, NeMo, nvidia, ' 'pytorch, torch, tts, speech, language ' diff --git a/nemo/utils/argparse.py b/nemo/utils/argparse.py index 722d1a6f31c6..5bb1bf298672 100644 --- a/nemo/utils/argparse.py +++ b/nemo/utils/argparse.py @@ -16,48 +16,61 @@ class NemoArgParser(argparse.ArgumentParser): def __init__(self, **kwargs): super().__init__(**kwargs) # NeMo arguments - self.add_argument("--local_rank", default=None, type=int, - help="node rank for distributed training") - self.add_argument("--amp_opt_level", default="O0", type=str, - choices=["O0", "O1", "O2", "O3"], - help="apex/amp precision:" - " O0 - float32," - " O1 - mixed precision opt1," - " O2 - mixed precision opt2," - " O3 - float16" - "See: https://nvidia.github.io/apex/amp.html") - self.add_argument("--cudnn_benchmark", action="store_true", - help="If set to True it will use cudnnFind method to" - " find the best kernels instead of using " - "heuristics. If the shapes of your inputs are " - "constant this should help, for various shapes " - "it can slow things down.") + self.add_argument( + "--local_rank", default=None, type=int, help="node rank for distributed training", + ) + self.add_argument( + "--amp_opt_level", + default="O0", + type=str, + choices=["O0", "O1", "O2", "O3"], + help="apex/amp precision:" + " O0 - float32," + " O1 - mixed precision opt1," + " O2 - mixed precision opt2," + " O3 - float16" + "See: https://nvidia.github.io/apex/amp.html", + ) + self.add_argument( + "--cudnn_benchmark", + action="store_true", + help="If set to True it will use cudnnFind method to" + " find the best kernels instead of using " + "heuristics. 
If the shapes of your inputs are " + "constant this should help, for various shapes " + "it can slow things down.", + ) # self.add_argument("--random_seed", default=None, type=int, # help="random seed") # self.add_argument("--deterministic", action="store_true", # help="whether to enable determinism") # Model defintion - self.add_argument("--model_config", type=str, default=None, - help="model configuration file: model.yaml") - self.add_argument("--train_dataset", type=str, default=None, - help="training dataset path") - self.add_argument("--eval_datasets", type=str, nargs="*", - help="evaludation datasets paths") - self.add_argument("--batch_size", type=int, - help="train batch size per GPU") - self.add_argument("--eval_batch_size", type=int, - help="evaluation batch size per GPU") - self.add_argument("--eval_freq", default=1000, type=int, - help="evaluation frequency, steps") + self.add_argument( + "--model_config", type=str, default=None, help="model configuration file: model.yaml", + ) + self.add_argument( + "--train_dataset", type=str, default=None, help="training dataset path", + ) + self.add_argument( + "--eval_datasets", type=str, nargs="*", help="evaluation datasets paths", + ) + self.add_argument("--batch_size", type=int, help="train batch size per GPU") + self.add_argument( + "--eval_batch_size", type=int, help="evaluation batch size per GPU", + ) + self.add_argument( + "--eval_freq", default=1000, type=int, help="evaluation frequency, steps", + ) # Optimizer Choices - self.add_argument("--optimizer", type=str, - choices=["sgd", "adam", "fused_adam", "adam_w", - "novograd", "lamb"], - help="optimizer") - self.add_argument("--weight_decay", type=float, default=0., - help="weight decay") + self.add_argument( + "--optimizer", + type=str, + choices=["sgd", "adam", "fused_adam", "adam_w", "novograd", "lamb",], + help="optimizer", + ) + self.add_argument("--weight_decay", type=float, default=0.0, help="weight decay") # self.add_argument("--momentum", type=float, # help="SGD momentum") # self.add_argument("--beta1", type=float, @@ -66,34 +79,54 @@ def __init__(self, **kwargs): # help="Adam/AdamW/NovoGrad beta2") # Optimization Arguments - self.add_argument("--num_epochs", type=int, default=None, - help="number of epochs to train. You should specify" - "either num_epochs or max_steps") - self.add_argument("--max_steps", type=int, default=None, - help="max number of steps to train. You should " - "specify either num_epochs or max_steps") - self.add_argument("--lr", type=float, default=1e-3, - help="base learning rate") - self.add_argument("--lr_policy", type=str, default='WarmupAnnealing', - help="learning rate decay policy") + self.add_argument( + "--num_epochs", + type=int, + default=None, + help="number of epochs to train. You should specify " "either num_epochs or max_steps", + ) + self.add_argument( + "--max_steps", + type=int, + default=None, + help="max number of steps to train. 
You should " "specify either num_epochs or max_steps", + ) + self.add_argument("--lr", type=float, default=1e-3, help="base learning rate") + self.add_argument( + "--lr_policy", type=str, default='WarmupAnnealing', help="learning rate decay policy", + ) # self.add_argument("--warmup_steps", default=0, type=int, # help="number of learning rate warmup steps") - self.add_argument("--iter_per_step", default=1, type=int, - help="number of gradients accumulation iterations " - "per weights update step") + self.add_argument( + "--iter_per_step", + default=1, + type=int, + help="number of gradients accumulation iterations " "per weights update step", + ) # Logging arguments - self.add_argument("--work_dir", default=None, type=str, - help="working directory for experiment") - self.add_argument("--checkpoint_dir", default=None, type=str, - help="where to save checkpoints. If ckpt_dir is " - "None, the default behaviour is to put it under" - "{work_dir}/checkpoints") - self.add_argument("--create_tb_writer", action="store_true", - help="whether to log into Tensorboard") - self.add_argument("--tensorboard_dir", default=None, type=str, - help="If --create_tb_writer is enabled, specifies " - "the tensorboard directory. Defaults to " - "{work_dir}/checkpoints") - self.add_argument("--checkpoint_save_freq", default=1000, type=int, - help="checkpoint frequency, steps") + self.add_argument( + "--work_dir", default=None, type=str, help="working directory for experiment", + ) + self.add_argument( + "--checkpoint_dir", + default=None, + type=str, + help="where to save checkpoints. If ckpt_dir is " + "None, the default behaviour is to put it under" + "{work_dir}/checkpoints", + ) + self.add_argument( + "--create_tb_writer", action="store_true", help="whether to log into Tensorboard", + ) + self.add_argument( + "--tensorboard_dir", + default=None, + type=str, + help="If --create_tb_writer is enabled, specifies " + "the tensorboard directory. Defaults to " + "{work_dir}/checkpoints", + ) + self.add_argument( + "--checkpoint_save_freq", default=1000, type=int, help="checkpoint frequency, steps", + ) diff --git a/nemo/utils/decorators/deprecated.py b/nemo/utils/decorators/deprecated.py index 6bac2a307c44..3b471ea78852 100644 --- a/nemo/utils/decorators/deprecated.py +++ b/nemo/utils/decorators/deprecated.py @@ -45,6 +45,7 @@ def __call__(self, func): Method prints the adequate warning (only once per function) when required and calls the function func, passing the original arguments. """ + def wrapper(*args, **kwargs): """ Function prints the adequate warning and calls the function func, @@ -60,9 +61,7 @@ def wrapper(*args, **kwargs): # Optionally, add version and alternative. if self.version is not None: - msg = msg + \ - " It is going to be removed in version {}.".format( - self.version) + msg = msg + " It is going to be removed in version {}.".format(self.version) if self.explanation is not None: msg = msg + " " + self.explanation diff --git a/nemo/utils/exp_logging.py b/nemo/utils/exp_logging.py index 52987000d703..69868bc8c365 100644 --- a/nemo/utils/exp_logging.py +++ b/nemo/utils/exp_logging.py @@ -1,18 +1,17 @@ # Copyright (c) 2019 NVIDIA Corporation import logging import os -import time -from shutil import copyfile import subprocess import sys +import time import warnings +from shutil import copyfile import nemo def get_logger(unused): - warnings.warn("This function will be deprecated in the future. You " - "can just use nemo.logging instead") + warnings.warn("This function will be deprecated in the future. 
You " "can just use nemo.logging instead") return nemo.logging @@ -77,18 +76,20 @@ class ExpManager: the datetime suffix such that all ranks are consistent on work_dir name. """ + def __init__( - self, - work_dir=None, - local_rank=None, - global_rank=None, - use_tb=True, - exist_ok=True, - ckpt_dir=None, - tb_dir=None, - files_to_copy=None, - add_time=True, - broadcast_func=None): + self, + work_dir=None, + local_rank=None, + global_rank=None, + use_tb=True, + exist_ok=True, + ckpt_dir=None, + tb_dir=None, + files_to_copy=None, + add_time=True, + broadcast_func=None, + ): self.local_rank = local_rank if local_rank is not None else 0 self.global_rank = global_rank if global_rank is not None else 0 self.logger = None @@ -107,7 +108,8 @@ def __init__( if broadcast_func is None: raise ValueError( "local rank was not None, but ExpManager was not passed a " - "broadcast function to broadcast the datetime suffix") + "broadcast function to broadcast the datetime suffix" + ) if global_rank == 0: broadcast_func(string=tm_suf) else: @@ -130,15 +132,13 @@ def __init__( copyfile(file, os.path.join(self.work_dir, basename)) if self.global_rank == 0: # Create files for cmd args and git info - with open(os.path.join( - self.work_dir, f'cmd-args_{tm_suf}.log'), 'w') as f: + with open(os.path.join(self.work_dir, f'cmd-args_{tm_suf}.log'), 'w') as f: f.write(" ".join(sys.argv)) # Try to get git hash git_repo, git_hash = get_git_hash() if git_repo: - git_log_file = os.path.join( - self.work_dir, f'git-info_{tm_suf}.log') + git_log_file = os.path.join(self.work_dir, f'git-info_{tm_suf}.log') with open(git_log_file, 'w') as f: f.write(f'commit hash: {git_hash}') f.write(get_git_diff()) @@ -146,8 +146,7 @@ def __init__( # Create loggers self.create_logger(log_file=bool(work_dir)) if use_tb and not work_dir: - raise ValueError("ExpManager received use_tb as True but did not " - "receive a work_dir") + raise ValueError("ExpManager received use_tb as True but did not " "receive a work_dir") if ckpt_dir: self.ckpt_dir = ckpt_dir @@ -166,9 +165,7 @@ def create_logger(self, level=logging.INFO, log_file=True): logger.addHandler(ch) if log_file: - self.log_file = ( - f'{self.work_dir}/log_globalrank-{self.global_rank}_' - f'localrank-{self.local_rank}.txt') + self.log_file = f'{self.work_dir}/log_globalrank-{self.global_rank}_' f'localrank-{self.local_rank}.txt' fh = logging.FileHandler(self.log_file) fh.setLevel(level) fh.setFormatter(tmp) @@ -191,6 +188,7 @@ def get_tb_writer(self, tb_dir=None, exist_ok=True): try: from torch.utils.tensorboard import SummaryWriter + self.tb_writer = SummaryWriter(self.tb_dir) except ImportError: self.tb_writer = None @@ -203,8 +201,7 @@ def log_exp_info(self, params, print_everywhere=False): nemo.logging.info("NEMO MODEL'S PARAMETERS") for key in params: nemo.logging.info(f'{key}\t{params[key]}') - nemo.logging.info( - f'Experiment output is stored in {self.work_dir}') + nemo.logging.info(f'Experiment output is stored in {self.work_dir}') def reset_loggers(self): nemo.logging.handlers = [] @@ -212,15 +209,16 @@ def reset_loggers(self): def get_git_hash(): try: - return True, subprocess.check_output(['git', 'rev-parse', 'HEAD'], - stderr=subprocess.STDOUT).decode() + return ( + True, + subprocess.check_output(['git', 'rev-parse', 'HEAD'], stderr=subprocess.STDOUT).decode(), + ) except subprocess.CalledProcessError as e: return False, "{}\n".format(e.output.decode("utf-8")) def get_git_diff(): try: - return subprocess.check_output(['git', 'diff'], - stderr=subprocess.STDOUT).decode() + 
+        return subprocess.check_output(['git', 'diff'], stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as e:
        return "{}\n".format(e.output.decode("utf-8"))
diff --git a/nemo/utils/helpers.py b/nemo/utils/helpers.py
index 3104f76a8516..0a20a05f5a7e 100644
--- a/nemo/utils/helpers.py
+++ b/nemo/utils/helpers.py
@@ -45,8 +45,7 @@ def get_checkpoint_from_dir(module_names, cpkt_dir, ckpt_pattern=''):
        module_ckpts = glob.glob(f'{cpkt_dir}/{module}*{ckpt_pattern}*')
        if not module_ckpts:
-            raise ValueError(f'For module {module}, '
-                             f'no file matches {ckpt_pattern} in {cpkt_dir}')
+            raise ValueError(f'For module {module}, ' f'no file matches {ckpt_pattern} in {cpkt_dir}')

        # if multiple checkpoints match a pattern, take the latest one
        def step_from_checkpoint(checkpoint_name):
@@ -67,7 +66,7 @@ def _call_args_to_string(call_args):
    result = "(force_pt=True,"
    counter = 0
    for key, value in call_dict.items():
-        result += (f"{key}={value}" if counter == 0 else f", {key}={value}")
+        result += f"{key}={value}" if counter == 0 else f", {key}={value}"
        counter += 1
    result += ")"
    return result
diff --git a/nemo/utils/lr_policies.py b/nemo/utils/lr_policies.py
index ddf83e37b4b7..a3a9319324dd 100644
--- a/nemo/utils/lr_policies.py
+++ b/nemo/utils/lr_policies.py
@@ -1,13 +1,17 @@
 # Copyright (c) 2019 NVIDIA Corporation
-__all__ = ['WarmupPolicy', 'SquareAnnealing', 'CosineAnnealing',
-           'WarmupAnnealing', 'InverseSquareRootAnnealing',
-           'SquareRootAnnealing']
-
-import math
-from abc import ABC, abstractmethod
+__all__ = [
+    'WarmupPolicy',
+    'SquareAnnealing',
+    'CosineAnnealing',
+    'WarmupAnnealing',
+    'InverseSquareRootAnnealing',
+    'SquareRootAnnealing',
+]

 import inspect
+import math
 import sys
+from abc import ABC, abstractmethod


 class _LRPolicy(ABC):
@@ -43,15 +47,11 @@ class WarmupPolicy(_LRPolicy):
    """

-    def __init__(self,
-                 *,
-                 warmup_steps=None,
-                 warmup_ratio=None,
-                 total_steps=None):
-        assert not (warmup_steps is not None and warmup_ratio is not None), \
-            "Either use particular number of step or ratio"
-        assert warmup_ratio is None or total_steps is not None, \
-            "If there is a ratio, there should be a total steps"
+    def __init__(self, *, warmup_steps=None, warmup_ratio=None, total_steps=None):
+        assert not (
+            warmup_steps is not None and warmup_ratio is not None
+        ), "Either use particular number of step or ratio"
+        assert warmup_ratio is None or total_steps is not None, "If there is a ratio, there should be a total steps"

        super().__init__()

@@ -105,7 +105,7 @@ def _get_lr(self, initial_lr, step, epoch):
            initial_lr=initial_lr,
            step=step - self.warmup_steps,
            total_steps=self.total_steps - self.warmup_steps,
-            min_lr=self.min_lr
+            min_lr=self.min_lr,
        )

@@ -116,10 +116,7 @@ def __init__(self, total_steps, min_lr=0, **kwargs):

    def _get_lr(self, initial_lr, step, epoch):
        return _squareroot_annealing(
-            initial_lr=initial_lr,
-            step=step,
-            total_steps=self.total_steps,
-            min_lr=self.min_lr
+            initial_lr=initial_lr, step=step, total_steps=self.total_steps, min_lr=self.min_lr,
        )

@@ -130,13 +127,14 @@ def __init__(self, total_steps, min_lr=0, **kwargs):

    def _get_lr(self, initial_lr, step, epoch):
        if initial_lr < self.min_lr:
-            raise ValueError(f"{self} received an initial learning rate that "
-                             f"was lower than the minimum learning rate.")
+            raise ValueError(
+                f"{self} received an initial learning rate that " f"was lower than the minimum learning rate."
+            )
        return _cosine_annealing(
            initial_lr=initial_lr,
            step=step - self.warmup_steps,
            total_steps=self.total_steps - self.warmup_steps,
-            min_lr=self.min_lr
+            min_lr=self.min_lr,
        )

@@ -148,7 +146,7 @@ def _get_lr(self, initial_lr, step, epoch):
        progress = float(step / self.total_steps)
        warmup_ratio = float(self.warmup_steps / self.total_steps)

-        mult = max((progress - 1.) / (warmup_ratio - 1.), 0.)
+        mult = max((progress - 1.0) / (warmup_ratio - 1.0), 0.0)
        out_lr = initial_lr * mult
        return out_lr

@@ -177,6 +175,7 @@ def get_all_lr_classes():
def get_lr_policy(lr_policy, **kwargs):
    lr_classes = get_all_lr_classes()
    if lr_policy not in lr_classes:
-        raise ValueError(f'{lr_policy} is not a supported lr policy. '
-                         f'Supported lr policies are {lr_classes.keys()}.')
+        raise ValueError(
+            f'{lr_policy} is not a supported lr policy. ' f'Supported lr policies are {lr_classes.keys()}.'
+        )
    return lr_classes[lr_policy](**kwargs)
diff --git a/requirements/requirements_test.txt b/requirements/requirements_test.txt
index c4090fd237ab..9017d9f7536b 100644
--- a/requirements/requirements_test.txt
+++ b/requirements/requirements_test.txt
@@ -1,2 +1,4 @@
 parameterized
 pytest
+black
+isort[requirements]
\ No newline at end of file
diff --git a/scripts/build_lm_text.py b/scripts/build_lm_text.py
index 0799c1ff38c9..d3b48f2bc442 100644
--- a/scripts/build_lm_text.py
+++ b/scripts/build_lm_text.py
@@ -1,13 +1,11 @@
-import pandas as pd
-import os
 import argparse
+import os
+
+import pandas as pd

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Build N-gram LM model from text file')
-    parser.add_argument('text', metavar='text', type=str,
-                        help='text file')
+    parser = argparse.ArgumentParser(description='Build N-gram LM model from text file')
+    parser.add_argument('text', metavar='text', type=str, help='text file')
    parser.add_argument('--n', type=int, help='n for n-grams', default=3)
    args = parser.parse_args()
diff --git a/scripts/convert_iob_format_to_token_classification_format.py b/scripts/convert_iob_format_to_token_classification_format.py
index a41aa286de3b..f9602216e307 100644
--- a/scripts/convert_iob_format_to_token_classification_format.py
+++ b/scripts/convert_iob_format_to_token_classification_format.py
@@ -16,9 +16,7 @@
 import os


-def __convert_data(in_file,
-                   out_text,
-                   out_labels):
+def __convert_data(in_file, out_text, out_labels):
    """
    in_file should be in the IOB format, see example here:
    https://www.clips.uantwerpen.be/conll2003/ner/.
@@ -46,20 +44,24 @@ def __convert_data(in_file,

 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert data from IOB ' +
-                                     'format to the format compatible with ' +
-                                     'nlp/examples/token_classification.py')
+    parser = argparse.ArgumentParser(
+        description='Convert data from IOB '
+        + 'format to the format compatible with '
+        + 'nlp/examples/token_classification.py'
+    )
    parser.add_argument("--data_dir", required=True, type=str)
    args = parser.parse_args()

    for dataset in ['dev.txt', 'train.txt']:
        file_path = os.path.join(args.data_dir, dataset)
        if not os.path.exists(file_path):
-            raise FileNotFoundError("{file_path} not found in {args.data_dir}"
-                                    "For NER, CoNLL-2003 dataset"
-                                    "can be obtained at"
-                                    "https://github.com/kyzhouhzau/BERT"
-                                    "-NER/tree/master/data.")
+            raise FileNotFoundError(
+                "{file_path} not found in {args.data_dir}"
+                "For NER, CoNLL-2003 dataset"
+                "can be obtained at"
+                "https://github.com/kyzhouhzau/BERT"
+                "-NER/tree/master/data."
+            )
        print(f'Processing {dataset}')
        out_text = os.path.join(args.data_dir, 'text_' + dataset)
diff --git a/scripts/export_bert_to_trt.py b/scripts/export_bert_to_trt.py
index ac6c523c8ee7..e977d34b7a83 100644
--- a/scripts/export_bert_to_trt.py
+++ b/scripts/export_bert_to_trt.py
@@ -18,8 +18,9 @@
 import json
 import re

-import torch
 import numpy as np
+import torch
+
 import tensorrt as trt

 nvinfer = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
@@ -32,17 +33,14 @@
 TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
 trt.init_libnvinfer_plugins(TRT_LOGGER, "")
 plg_registry = trt.get_plugin_registry()
-qkv2_plg_creator = plg_registry.get_plugin_creator(
-    "CustomQKVToContextPluginDynamic", "1", "")
-skln_plg_creator = plg_registry.get_plugin_creator(
-    "CustomSkipLayerNormPluginDynamic", "1", "")
-gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic",
-                                                   "1", "")
-emln_plg_creator = plg_registry.get_plugin_creator(
-    "CustomEmbLayerNormPluginDynamic", "1", "")
-
-print("creators:", plg_registry, qkv2_plg_creator, skln_plg_creator,
-      gelu_plg_creator, emln_plg_creator)
+qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic", "1", "")
+skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic", "1", "")
+gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic", "1", "")
+emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic", "1", "")
+
+print(
+    "creators:", plg_registry, qkv2_plg_creator, skln_plg_creator, gelu_plg_creator, emln_plg_creator,
+)
 print("\n".join([x.name for x in plg_registry.plugin_creator_list]))

 """
@@ -99,12 +97,11 @@ def set_layer_name(layer, prefix, name, out_idx=0):
    set_tensor_name(layer.get_output(out_idx), prefix, name)


-def attention_layer_opt(prefix, config, init_dict, network, input_tensor,
-                        imask):
+def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask):
    """
    Add the attention layer
    """
-    assert (len(input_tensor.shape) == 5)
+    assert len(input_tensor.shape) == 5
    B, S, hidden_size, _, _ = input_tensor.shape
    num_heads = config.num_attention_heads
    head_size = int(hidden_size / num_heads)
@@ -112,25 +109,17 @@ attention_layer_opt(prefix, config, init_dict, network, input_tensor,
    Wall = init_dict[prefix + WQKV]
    Ball = init_dict[prefix + BQKV]

-    mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall,
-                                           Ball)
+    mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball)
    set_layer_name(mult_all, prefix, "qkv_mult")

    has_mask = imask is not None

-    pf_hidden_size = trt.PluginField("hidden_size",
-                                     np.array([hidden_size], np.int32),
-                                     trt.PluginFieldType.INT32)
-    pf_num_heads = trt.PluginField("num_heads",
-                                   np.array([num_heads], np.int32),
-                                   trt.PluginFieldType.INT32)
-    pf_S = trt.PluginField("S", np.array([S], np.int32),
-                           trt.PluginFieldType.INT32)
-    pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32),
-                                  trt.PluginFieldType.INT32)
-
-    pfc = trt.PluginFieldCollection(
-        [pf_hidden_size, pf_num_heads, pf_S, pf_has_mask])
+    pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32,)
+    pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32)
+    pf_S = trt.PluginField("S", np.array([S], np.int32), trt.PluginFieldType.INT32)
+    pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32)
+
+    pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_S, pf_has_mask])
    qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc)

    qkv_in = [mult_all.get_output(0), imask]
@@ -147,14 +136,11 @@ def skipln(prefix, init_dict, network, input_tensor, skip):
    assert len(idims) == 5
    hidden_size = idims[2]

-    pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32),
-                            trt.PluginFieldType.INT32)
+    pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
    wbeta = init_dict[prefix + "bias"]
-    pf_beta = trt.PluginField("beta", wbeta.numpy(),
-                              trt.PluginFieldType.FLOAT32)
+    pf_beta = trt.PluginField("beta", wbeta.numpy(), trt.PluginFieldType.FLOAT32)
    wgamma = init_dict[prefix + "weight"]
-    pf_gamma = trt.PluginField("gamma", wgamma.numpy(),
-                               trt.PluginFieldType.FLOAT32)
+    pf_gamma = trt.PluginField("gamma", wgamma.numpy(), trt.PluginFieldType.FLOAT32)

    pfc = trt.PluginFieldCollection([pf_ld, pf_beta, pf_gamma])
    skipln_plug = skln_plg_creator.create_plugin("skipln", pfc)
@@ -165,8 +151,7 @@ def skipln(prefix, init_dict, network, input_tensor, skip):
    return layer


-def transformer_layer_opt(prefix, config, init_dict, network, input_tensor,
-                          imask):
+def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask):
    """
    Add the transformer layer
    """
@@ -174,25 +159,23 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor,
    assert len(idims) == 5
    hidden_size = idims[2]

-    context_transposed = attention_layer_opt(prefix + "attention_self_",
-                                             config, init_dict, network,
-                                             input_tensor, imask)
+    context_transposed = attention_layer_opt(
+        prefix + "attention_self_", config, init_dict, network, input_tensor, imask,
+    )
    attention_heads = context_transposed.get_output(0)

    W_aout = init_dict[prefix + W_AOUT]
    B_aout = init_dict[prefix + B_AOUT]
-    attention_out_fc = network.add_fully_connected(attention_heads,
-                                                   hidden_size, W_aout, B_aout)
+    attention_out_fc = network.add_fully_connected(attention_heads, hidden_size, W_aout, B_aout)

-    skiplayer = skipln(prefix + "attention_output_layernorm_", init_dict,
-                       network, attention_out_fc.get_output(0), input_tensor)
+    skiplayer = skipln(
+        prefix + "attention_output_layernorm_", init_dict, network, attention_out_fc.get_output(0), input_tensor,
+    )
    attention_ln = skiplayer.get_output(0)

    W_mid = init_dict[prefix + W_MID]
    B_mid = init_dict[prefix + B_MID]
-    mid_dense = network.add_fully_connected(attention_ln,
-                                            config.intermediate_size, W_mid,
-                                            B_mid)
+    mid_dense = network.add_fully_connected(attention_ln, config.intermediate_size, W_mid, B_mid)

    mid_dense_out = mid_dense.get_output(0)

@@ -208,11 +191,9 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor,
    W_lout = init_dict[prefix + W_LOUT]
    B_lout = init_dict[prefix + B_LOUT]
-    out_dense = network.add_fully_connected(intermediate_act, hidden_size,
-                                            W_lout, B_lout)
+    out_dense = network.add_fully_connected(intermediate_act, hidden_size, W_lout, B_lout)
    set_layer_name(out_dense, prefix + "output_", "dense")

-    out_layer = skipln(prefix + "output_layernorm_", init_dict, network,
-                       out_dense.get_output(0), attention_ln)
+    out_layer = skipln(prefix + "output_layernorm_", init_dict, network, out_dense.get_output(0), attention_ln,)
    out_ln = out_layer.get_output(0)

    set_tensor_name(out_ln, prefix + "output_", "reshape")
@@ -227,8 +208,7 @@ def bert_model(config, init_dict, network, input_tensor, input_mask):
    prev_input = input_tensor
    for layer in range(0, config.num_hidden_layers):
        ss = "l{}_".format(layer)
-        prev_input = transformer_layer_opt(ss, config, init_dict, network,
-                                           prev_input, input_mask)
+        prev_input = transformer_layer_opt(ss, config, init_dict, network, prev_input, input_mask)

    return prev_input

@@ -237,6 +217,7 @@ def bert_model(config, init_dict, network, input_tensor, input_mask):
 #         config.hidden_size, config.hidden_size)
 #         pooled_output = self.activation(pooled_output), nn.tanh

+
def bert_pooler(prefix, init_dict, network, input_tensor):
    """
    pooler the bert output
    """
@@ -247,18 +228,15 @@ def bert_pooler(prefix, init_dict, network, input_tensor):
    shuf = network.add_shuffle(input_tensor)
    shuf.first_transpose = (2, 3, 0, 1)

-    first_token_tensor = network.add_slice(shuf.get_output(0),
-                                           start=(0, 0, 0, 0),
-                                           shape=(1, 1, 1, hidden_size),
-                                           stride=(1, 1, 1, 1))
+    first_token_tensor = network.add_slice(
+        shuf.get_output(0), start=(0, 0, 0, 0), shape=(1, 1, 1, hidden_size), stride=(1, 1, 1, 1),
+    )

    W_out = init_dict[prefix + POOL_W]
    B_out = init_dict[prefix + POOL_B]
-    pooler = network.add_fully_connected(first_token_tensor.get_output(0),
-                                         hidden_size, W_out, B_out)
+    pooler = network.add_fully_connected(first_token_tensor.get_output(0), hidden_size, W_out, B_out)

-    pooler = network.add_activation(pooler.get_output(0),
-                                    trt.ActivationType.TANH)
+    pooler = network.add_activation(pooler.get_output(0), trt.ActivationType.TANH)
    set_layer_name(pooler, prefix, "pooler")

    return pooler.get_output(0)
@@ -280,8 +258,7 @@ def squad_output(prefix, init_dict, network, input_tensor):
    return dense


-def sequence_class_output(prefix, init_dict, network, input_tensor,
-                          softmax=True):
+def sequence_class_output(prefix, init_dict, network, input_tensor, softmax=True):
    print(input_tensor.shape)
    seq_len = input_tensor.shape[1]
    hidden_size = input_tensor.shape[2]
@@ -293,33 +270,25 @@ def sequence_class_output(prefix, init_dict, network, input_tensor,
    in_shape_tensor = network.add_shape(shuf.get_output(0)).get_output(0)
    out_shape_tensor = network.add_gather(
        in_shape_tensor,
-        network.add_constant((5,),
-                             trt.Weights(
-                                 np.array(
-                                     [0, 1,
-                                      2, 2,
-                                      4]).astype(
-                                     np.int32))).get_output(
-            0), 0).get_output(0)
-
-    first_token_tensor = network.add_slice(shuf.get_output(0),
-                                           start=(0, 0, 0, 0, 0),
-                                           shape=(-1, 1, 1, 1, hidden_size),
-                                           stride=(1, 1, 1, 1, 1))
-    first_token_tensor.set_input(1, network.add_constant((5,), trt.Weights(
-        np.array([0, 0, 0, 0, 0]).astype(np.int32))).get_output(0))
+        network.add_constant((5,), trt.Weights(np.array([0, 1, 2, 2, 4]).astype(np.int32))).get_output(0),
+        0,
+    ).get_output(0)
+
+    first_token_tensor = network.add_slice(
+        shuf.get_output(0), start=(0, 0, 0, 0, 0), shape=(-1, 1, 1, 1, hidden_size), stride=(1, 1, 1, 1, 1),
+    )
+    first_token_tensor.set_input(
+        1, network.add_constant((5,), trt.Weights(np.array([0, 0, 0, 0, 0]).astype(np.int32))).get_output(0),
+    )
    first_token_tensor.set_input(2, out_shape_tensor)

    W_out = init_dict[prefix + "mlp.layer0." + SQD_W]
    B_out = init_dict[prefix + "mlp.layer0." + SQD_B]
-    dense = network.add_fully_connected(first_token_tensor.get_output(0),
-                                        W_out.shape[0], W_out, B_out)
-    dense_relu = network.add_activation(dense.get_output(0),
-                                        trt.ActivationType.RELU)
+    dense = network.add_fully_connected(first_token_tensor.get_output(0), W_out.shape[0], W_out, B_out)
+    dense_relu = network.add_activation(dense.get_output(0), trt.ActivationType.RELU)

    W_out = init_dict[prefix + "mlp.layer2." + SQD_W]
    B_out = init_dict[prefix + "mlp.layer2." + SQD_B]
-    classifier = network.add_fully_connected(dense_relu.get_output(0),
-                                             W_out.shape[0], W_out, B_out)
+    classifier = network.add_fully_connected(dense_relu.get_output(0), W_out.shape[0], W_out, B_out)

    if softmax:
        probs = network.add_softmax(classifier.get_output(0))
        probs.axes = 4  # last dimension
@@ -335,14 +304,11 @@ def sequence_class_output(prefix, init_dict, network, input_tensor,
def token_class_output(prefix, init_dict, network, input_tensor, softmax=True):
    W_out = init_dict[prefix + "mlp.layer0." + SQD_W]
    B_out = init_dict[prefix + "mlp.layer0." + SQD_B]
-    dense = network.add_fully_connected(input_tensor, W_out.shape[0], W_out,
-                                        B_out)
-    dense_relu = network.add_activation(dense.get_output(0),
-                                        trt.ActivationType.RELU)
+    dense = network.add_fully_connected(input_tensor, W_out.shape[0], W_out, B_out)
+    dense_relu = network.add_activation(dense.get_output(0), trt.ActivationType.RELU)

    W_out = init_dict[prefix + "mlp.layer2." + SQD_W]
    B_out = init_dict[prefix + "mlp.layer2." + SQD_B]
-    classifier = network.add_fully_connected(dense_relu.get_output(0),
-                                             W_out.shape[0], W_out, B_out)
+    classifier = network.add_fully_connected(dense_relu.get_output(0), W_out.shape[0], W_out, B_out)

    if softmax:
        probs = network.add_softmax(classifier.get_output(0))
@@ -368,15 +334,14 @@ def load_weights(inputbase):

    # There might be training-related variables in the checkpoint that
    # can be discarded
-    param_names = [key for key in sorted(tensor_dict) if
-                   'adam' not in key and 'global_step' not in key]
+    param_names = [key for key in sorted(tensor_dict) if 'adam' not in key and 'global_step' not in key]
    count = len(param_names)
    TRT_LOGGER.log(TRT_LOGGER.INFO, str(count))

    for pn in param_names:
        toks = pn.lower().split('.')
        if 'encoder' in pn:
-            assert ('layer' in pn)
+            assert 'layer' in pn
            lvar = (re.findall('\d+', pn))[0]  # nopep8
            outname = 'l{}_'.format(lvar) + '_'.join(toks[4:])
        else:
@@ -386,13 +351,12 @@ def load_weights(inputbase):
            tensor = tensor_dict[pn].numpy()
            shape = tensor.shape
            flat_tensor = tensor.flatten()
-            shape_str = '{} '.format(len(shape)) + ' '.join(
-                [str(d) for d in shape])
+            shape_str = '{} '.format(len(shape)) + ' '.join([str(d) for d in shape])
            weights_dict[outname] = trt.Weights(flat_tensor)

-            TRT_LOGGER.log(TRT_LOGGER.INFO,
-                           "Orig.name: {:}, TRT name: {:}, shape: {:}".format(
-                               pn, outname, shape_str))
+            TRT_LOGGER.log(
+                TRT_LOGGER.INFO, "Orig.name: {:}, TRT name: {:}, shape: {:}".format(pn, outname, shape_str),
+            )

    additional_dict = dict()
    for key, value in weights_dict.items():
@@ -414,12 +378,11 @@ def load_weights(inputbase):
            bcount = 3 * hidden_size
            Ball = np.zeros(bcount, np.float32)
            Wall[0:mat_size] = Wq_.numpy()[0:mat_size]
-            Wall[mat_size:2 * mat_size] = Wk_.numpy()[0:mat_size]
-            Wall[2 * mat_size:3 * mat_size] = Wv_.numpy()[0:mat_size]
+            Wall[mat_size : 2 * mat_size] = Wk_.numpy()[0:mat_size]
+            Wall[2 * mat_size : 3 * mat_size] = Wv_.numpy()[0:mat_size]
            Ball[0:hidden_size] = Bq_.numpy()[0:hidden_size]
-            Ball[hidden_size:2 * hidden_size] = Bk_.numpy()[0:hidden_size]
-            Ball[2 * hidden_size:3 * hidden_size] = Bv_.numpy()[
-                0:hidden_size]
+            Ball[hidden_size : 2 * hidden_size] = Bk_.numpy()[0:hidden_size]
+            Ball[2 * hidden_size : 3 * hidden_size] = Bv_.numpy()[0:hidden_size]

            additional_dict[prefix + WQKV] = trt.Weights(Wall)
            additional_dict[prefix + BQKV] = trt.Weights(Ball)
@@ -431,17 +394,25 @@ def load_weights(inputbase):
    return weights_dict


-def main(bert_weight_path, class_weight_path, B, S, config_path, outputbase,
-         min_batch=None, max_batch=None, seq_class_prefix=None,
-         tok_class_prefix=None):
+def main(
+    bert_weight_path,
+    class_weight_path,
+    B,
+    S,
+    config_path,
+    outputbase,
+    min_batch=None,
+    max_batch=None,
+    seq_class_prefix=None,
+    tok_class_prefix=None,
+):
    bert_config_path = config_path
    TRT_LOGGER.log(TRT_LOGGER.INFO, bert_config_path)
    config = BertConfig(bert_config_path)

    # Load weights from checkpoint file
    init_dict = load_weights(bert_weight_path)
-    classifiers_dict = {k: v.numpy() for k, v in torch.load(class_weight_path,
-                                                            map_location='cpu').items()}
+    classifiers_dict = {k: v.numpy() for k, v in torch.load(class_weight_path, map_location='cpu').items()}
    # import pdb;pdb.set_trace()

    with trt.Builder(TRT_LOGGER) as builder:
@@ -449,62 +420,44 @@ def main(bert_weight_path, class_weight_path, B, S, config_path, outputbase,
        # import pdb;pdb.set_trace()

        w = init_dict["bert_embeddings_layernorm_bias"]
-        wbeta = trt.PluginField("bert_embeddings_layernorm_beta", w.numpy(),
-                                ty)
+        wbeta = trt.PluginField("bert_embeddings_layernorm_beta", w.numpy(), ty)

        w = init_dict["bert_embeddings_layernorm_weight"]
-        wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", w.numpy(),
-                                 ty)
+        wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", w.numpy(), ty)

        w = init_dict["bert_embeddings_word_embeddings_weight"]
-        wwordemb = trt.PluginField("bert_embeddings_word_embeddings",
-                                   w.numpy(), ty)
+        wwordemb = trt.PluginField("bert_embeddings_word_embeddings", w.numpy(), ty)

        w = init_dict["bert_embeddings_token_type_embeddings_weight"]
-        wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings",
-                                  w.numpy(), ty)
+        wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", w.numpy(), ty)

        w = init_dict["bert_embeddings_position_embeddings_weight"]
-        wposemb = trt.PluginField("bert_embeddings_position_embeddings",
-                                  w.numpy(), ty)
+        wposemb = trt.PluginField("bert_embeddings_position_embeddings", w.numpy(), ty)

-        pfc = trt.PluginFieldCollection(
-            [wbeta, wgamma, wwordemb, wtokemb, wposemb])
+        pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb])
        fn = emln_plg_creator.create_plugin("embeddings", pfc)

-        explicit_batch_flag = 1 << int(
-            trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-        with builder.create_network(
-                explicit_batch_flag) as network, \
-                builder.create_builder_config() as builder_config:
-            builder_config.max_workspace_size = 5000 * (
-                1024 * 1024)  # 5000 MiB
+        explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+        with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config:
+            builder_config.max_workspace_size = 5000 * (1024 * 1024)  # 5000 MiB
            builder_config.set_flag(trt.BuilderFlag.FP16)

-            input_ids = network.add_input(name="input_ids", dtype=trt.int32,
-                                          shape=(-1, S,))
-            segment_ids = network.add_input(name="segment_ids",
-                                            dtype=trt.int32, shape=(-1, S,))
-            input_mask = network.add_input(name="input_mask", dtype=trt.int32,
-                                           shape=(-1, S,))
+            input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1, S,))
+            segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1, S,))
+            input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1, S,))

-            def set_profile_shape(profile, batch_size, min_batch=None,
-                                  max_batch=None):
+            def set_profile_shape(profile, batch_size, min_batch=None, max_batch=None):
                opt_shape = (batch_size, S)
                min_shape = (min_batch or batch_size, S)
                max_shape = (max_batch or batch_size, S)
-                profile.set_shape("input_ids", min=min_shape, opt=opt_shape,
-                                  max=max_shape)
-                profile.set_shape("segment_ids", min=min_shape, opt=opt_shape,
-                                  max=max_shape)
-                profile.set_shape("input_mask", min=min_shape, opt=opt_shape,
-                                  max=max_shape)
+                profile.set_shape("input_ids", min=min_shape, opt=opt_shape, max=max_shape)
+                profile.set_shape("segment_ids", min=min_shape, opt=opt_shape, max=max_shape)
+                profile.set_shape("input_mask", min=min_shape, opt=opt_shape, max=max_shape)

            # Specify only a single profile for now, even though this is
            # less optimal
            bs1_profile = builder.create_optimization_profile()
-            set_profile_shape(bs1_profile, B, min_batch=min_batch,
-                              max_batch=max_batch)
+            set_profile_shape(bs1_profile, B, min_batch=min_batch, max_batch=max_batch)
            builder_config.add_optimization_profile(bs1_profile)

            inputs = [input_ids, segment_ids, input_mask]
@@ -513,22 +466,17 @@ def set_profile_shape(profile, batch_size, min_batch=None,
            embeddings = emb_layer.get_output(0)
            mask_idx = emb_layer.get_output(1)

-            bert_out = bert_model(config, init_dict, network, embeddings,
-                                  mask_idx)
+            bert_out = bert_model(config, init_dict, network, embeddings, mask_idx)

            if tok_class_prefix is not None:
-                token_class = token_class_output(tok_class_prefix,
-                                                 classifiers_dict, network,
-                                                 bert_out)
+                token_class = token_class_output(tok_class_prefix, classifiers_dict, network, bert_out)
                token_class_logits_out = token_class.get_output(0)
                token_class_logits_out.name = "token_logits"
                token_class_logits_out.dtype = trt.DataType.FLOAT
                network.mark_output(token_class_logits_out)

            if seq_class_prefix is not None:
-                seq_class = sequence_class_output(seq_class_prefix,
-                                                  classifiers_dict, network,
-                                                  bert_out)
+                seq_class = sequence_class_output(seq_class_prefix, classifiers_dict, network, bert_out)
                seq_class_logits_out = seq_class.get_output(0)
                seq_class_logits_out.name = "seq_logits"
                seq_class_logits_out.dtype = trt.DataType.FLOAT
@@ -537,8 +485,7 @@ def set_profile_shape(profile, batch_size, min_batch=None,
            with builder.build_engine(network, builder_config) as engine:
                TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...")
                serialized_engine = engine.serialize()
-                TRT_LOGGER.log(TRT_LOGGER.INFO,
-                               "Saving Engine to {:}".format(outputbase))
+                TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(outputbase))
                with open(outputbase, 'wb') as fout:
                    fout.write(serialized_engine)
                TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.")
@@ -546,40 +493,59 @@ def set_profile_shape(profile, batch_size, min_batch=None,

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='TensorRT BERT Sample')
-    parser.add_argument('-bw', '--bert-weight', required=True,
-                        help='bert weight from nemo')
-    parser.add_argument('-cw', '--class-weight', required=True,
-                        help='classifier weight from nemo')
-
-    parser.add_argument('-t', '--token-classifier', required=False,
-                        default=None, help="Name of the token classifier")
-    parser.add_argument('-s', '--seq-classifier', required=False, default=None,
-                        help="Name of the sequence classifier")
-
-    parser.add_argument('-o', '--output', required=True,
-                        help='The bert engine file, ex bert.engine')
-    parser.add_argument('-b', '--batch-size', type=int, required=False,
-                        default=1, help='Preferred batch size (default = 1)')
-    parser.add_argument('--max-batch-size', type=int, required=False,
-                        default=None,
-                        help='Maximum batch size (default = same as '
-                             'batch-size)')
-    parser.add_argument('--min-batch-size', type=int, required=False,
-                        default=None,
-                        help='Minimum batch size (default = same as '
-                             'batch-size)')
-    parser.add_argument('-l', '--seq-length', type=int, required=False,
-                        default=128,
-                        help='Sequence length of the BERT model (default=128)')
-    parser.add_argument('-c', '--config', required=True,
-                        help='The folder containing the bert_config.json, '
-                             'which can be downloaded e.g. from '
-                             'https://github.com/google-research/bert#pre'
-                             '-trained-models or by running '
-                             'download_models.py in '
-                             'dle/TensorFlow/LanguageModeling/BERT/'
-                             'data/pretrained_models_google')
+    parser.add_argument('-bw', '--bert-weight', required=True, help='bert weight from nemo')
+    parser.add_argument(
+        '-cw', '--class-weight', required=True, help='classifier weight from nemo',
+    )
+
+    parser.add_argument(
+        '-t', '--token-classifier', required=False, default=None, help="Name of the token classifier",
+    )
+    parser.add_argument(
+        '-s', '--seq-classifier', required=False, default=None, help="Name of the sequence classifier",
+    )
+
+    parser.add_argument(
+        '-o', '--output', required=True, help='The bert engine file, ex bert.engine',
+    )
+    parser.add_argument(
+        '-b', '--batch-size', type=int, required=False, default=1, help='Preferred batch size (default = 1)',
+    )
+    parser.add_argument(
+        '--max-batch-size',
+        type=int,
+        required=False,
+        default=None,
+        help='Maximum batch size (default = same as ' 'batch-size)',
+    )
+    parser.add_argument(
+        '--min-batch-size',
+        type=int,
+        required=False,
+        default=None,
+        help='Minimum batch size (default = same as ' 'batch-size)',
+    )
+
+    parser.add_argument(
+        '-l',
+        '--seq-length',
+        type=int,
+        required=False,
+        default=128,
+        help='Sequence length of the BERT model (default=128)',
+    )
+    parser.add_argument(
+        '-c',
+        '--config',
+        required=True,
+        help='The folder containing the bert_config.json, '
+        'which can be downloaded e.g. from '
+        'https://github.com/google-research/bert#pre'
+        '-trained-models or by running '
+        'download_models.py in '
+        'dle/TensorFlow/LanguageModeling/BERT/'
+        'data/pretrained_models_google',
+    )

    opt = parser.parse_args()

@@ -587,9 +553,15 @@ def set_profile_shape(profile, batch_size, min_batch=None,
    config_path = opt.config
    print("token class:", opt.token_classifier)
    print("seq class: ", opt.seq_classifier)
-    main(opt.bert_weight, opt.class_weight, opt.batch_size, opt.seq_length,
-         config_path,
-         outputbase, min_batch=opt.min_batch_size,
-         max_batch=opt.max_batch_size,
-         tok_class_prefix=opt.token_classifier,
-         seq_class_prefix=opt.seq_classifier)
+    main(
+        opt.bert_weight,
+        opt.class_weight,
+        opt.batch_size,
+        opt.seq_length,
+        config_path,
+        outputbase,
+        min_batch=opt.min_batch_size,
+        max_batch=opt.max_batch_size,
+        tok_class_prefix=opt.token_classifier,
+        seq_class_prefix=opt.seq_classifier,
+    )
diff --git a/scripts/export_jasper_onnx_to_trt.py b/scripts/export_jasper_onnx_to_trt.py
index 1394a87832c0..130a629a6992 100644
--- a/scripts/export_jasper_onnx_to_trt.py
+++ b/scripts/export_jasper_onnx_to_trt.py
@@ -1,18 +1,26 @@
 import argparse

 import onnx
+
 import tensorrt as trt


-def build_engine(onnx_path, seq_len=192, max_seq_len=256, batch_size=8,
-                 max_batch_size=64, trt_fp16=True, verbose=True,
-                 max_workspace_size=None, encoder=True):
+def build_engine(
+    onnx_path,
+    seq_len=192,
+    max_seq_len=256,
+    batch_size=8,
+    max_batch_size=64,
+    trt_fp16=True,
+    verbose=True,
+    max_workspace_size=None,
+    encoder=True,
+):
    """Builds TRT engine from an ONNX file
    Note that network output 1 is unmarked so that the engine will not use
    vestigial length calculations associated with masked_fill
    """
-    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(
-        trt.Logger.WARNING)
+    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(TRT_LOGGER)
    builder.max_batch_size = max_batch_size

@@ -20,36 +28,33 @@ def build_engine(onnx_path, seq_len=192, max_seq_len=256, batch_size=8,
        model = model_fh.read()

    model_onnx = onnx.load_model_from_string(model)
-    input_feats = model_onnx.graph.input[0].type.tensor_type.shape.dim[
-        1].dim_value
+    input_feats = model_onnx.graph.input[0].type.tensor_type.shape.dim[1].dim_value

    if trt_fp16:
        builder.fp16_mode = True
        print("Optimizing for FP16")
-        config_flags = 1 << int(
-            trt.BuilderFlag.FP16)  # | 1 << int(trt.BuilderFlag.STRICT_TYPES)
+        config_flags = 1 << int(trt.BuilderFlag.FP16)  # | 1 << int(trt.BuilderFlag.STRICT_TYPES)
    else:
        config_flags = 0
-    builder.max_workspace_size = max_workspace_size if max_workspace_size \
-        else (
-            4 * 1024 * 1024 * 1024)
+    builder.max_workspace_size = max_workspace_size if max_workspace_size else (4 * 1024 * 1024 * 1024)

    config = builder.create_builder_config()
    config.flags = config_flags

    profile = builder.create_optimization_profile()
-    profile.set_shape("audio_signal" if encoder else "encoder_output",
-                      min=(1, input_feats, seq_len),
-                      opt=(batch_size, input_feats, seq_len),
-                      max=(max_batch_size, input_feats, max_seq_len))
+    profile.set_shape(
+        "audio_signal" if encoder else "encoder_output",
+        min=(1, input_feats, seq_len),
+        opt=(batch_size, input_feats, seq_len),
+        max=(max_batch_size, input_feats, max_seq_len),
+    )
    # if encoder:
    #     profile.set_shape("encoded_lengths",
    #                       min=(1,), opt=(batch_size,),
    #                       max=(max_batch_size,))
    config.add_optimization_profile(profile)

-    explicit_batch = 1 << (int)(
-        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)

    with trt.OnnxParser(network, TRT_LOGGER) as parser:
@@ -59,56 +64,63 @@ def build_engine(onnx_path, seq_len=192, max_seq_len=256, batch_size=8,

 def get_parser():
-    parser = argparse.ArgumentParser(
-        description="Convert Jasper ONNX model to TRT Plan")
+    parser = argparse.ArgumentParser(description="Convert Jasper ONNX model to TRT Plan")
    parser.add_argument(
-        "onnx_encoder", default=None, type=str,
-        help="Path to Jasper ONNX encoder")
+        "onnx_encoder", default=None, type=str, help="Path to Jasper ONNX encoder",
+    )
    parser.add_argument(
-        "trt_encoder", default=None, type=str,
-        help="Path to output Jasper TRT encoder")
+        "trt_encoder", default=None, type=str, help="Path to output Jasper TRT encoder",
+    )
    parser.add_argument(
-        "onnx_decoder", default=None, type=str,
-        help="Path to Jasper ONNX encoder")
+        "onnx_decoder", default=None, type=str, help="Path to Jasper ONNX encoder",
+    )
    parser.add_argument(
-        "trt_decoder", default=None, type=str,
-        help="Path to output Jasper TRT encoder")
+        "trt_decoder", default=None, type=str, help="Path to output Jasper TRT encoder",
+    )
    parser.add_argument(
-        "--max-seq-len", type=int, default=256,
-        help="Maximum sequence length of input")
+        "--max-seq-len", type=int, default=256, help="Maximum sequence length of input",
+    )
    parser.add_argument(
-        "--seq-len", type=int, default=192,
-        help="Preferred sequence length of input")
+        "--seq-len", type=int, default=192, help="Preferred sequence length of input",
+    )
    parser.add_argument(
-        "--max-batch-size", type=int, default=64,
-        help="Maximum sequence length of input")
+        "--max-batch-size", type=int, default=64, help="Maximum sequence length of input",
+    )
    parser.add_argument(
-        "--batch-size", type=int, default=8,
-        help="Preferred batch size of input")
+        "--batch-size", type=int, default=8, help="Preferred batch size of input",
+    )
    parser.add_argument(
-        "--no-fp16", action="store_true",
-        help="Disable fp16 model building, use fp32 instead")
+        "--no-fp16", action="store_true", help="Disable fp16 model building, use fp32 instead",
+    )
    return parser


 if __name__ == '__main__':
    args = get_parser().parse_args()
-    engine = build_engine(args.onnx_encoder, seq_len=args.seq_len,
-                          max_seq_len=args.max_seq_len,
-                          batch_size=args.batch_size,
-                          max_batch_size=args.max_batch_size,
-                          trt_fp16=not args.no_fp16, encoder=True)
+    engine = build_engine(
+        args.onnx_encoder,
+        seq_len=args.seq_len,
+        max_seq_len=args.max_seq_len,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        trt_fp16=not args.no_fp16,
+        encoder=True,
+    )
    if engine is not None:
        with open(args.trt_encoder, 'wb') as f:
            f.write(engine.serialize())
        print("TRT engine saved at " + args.trt_encoder + " ...")

-    engine = build_engine(args.onnx_decoder, seq_len=args.seq_len // 2,
-                          max_seq_len=args.max_seq_len // 2,
-                          batch_size=args.batch_size,
-                          max_batch_size=args.max_batch_size,
-                          trt_fp16=not args.no_fp16, encoder=False)
+    engine = build_engine(
+        args.onnx_decoder,
+        seq_len=args.seq_len // 2,
+        max_seq_len=args.max_seq_len // 2,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        trt_fp16=not args.no_fp16,
+        encoder=False,
+    )
    if engine is not None:
        with open(args.trt_decoder, 'wb') as f:
            f.write(engine.serialize())
diff --git a/scripts/export_jasper_to_onnx.py b/scripts/export_jasper_to_onnx.py
index f28e94515fbd..72a61503319b 100644
--- a/scripts/export_jasper_to_onnx.py
+++ b/scripts/export_jasper_to_onnx.py
@@ -1,39 +1,46 @@
 # Copyright (c) 2019 NVIDIA Corporation
 import argparse

-import nemo
-import nemo.collections.asr as nemo_asr
 import torch
 from ruamel.yaml import YAML

+import nemo
+import nemo.collections.asr as nemo_asr
+

 def get_parser():
-    parser = argparse.ArgumentParser(
-        description="Convert Jasper NeMo checkpoint to ONNX")
+    parser = argparse.ArgumentParser(description="Convert Jasper NeMo checkpoint to ONNX")
    parser.add_argument(
-        "--config", default=None, type=str, required=True,
-        help="Config from nemo")
+        "--config", default=None, type=str, required=True, help="Config from nemo",
+    )
    parser.add_argument(
-        "--nn_encoder", default=None, type=str, required=True,
-        help="Path to the nn encoder checkpoint.")
+        "--nn_encoder", default=None, type=str, required=True, help="Path to the nn encoder checkpoint.",
+    )
    parser.add_argument(
-        "--nn_decoder", default=None, type=str, required=True,
-        help="Path to the nn encoder checkpoint.")
+        "--nn_decoder", default=None, type=str, required=True, help="Path to the nn encoder checkpoint.",
+    )
    parser.add_argument(
-        "--onnx_encoder", default=None, type=str, required=True,
-        help="Path to the onnx encoder output.")
+        "--onnx_encoder", default=None, type=str, required=True, help="Path to the onnx encoder output.",
+    )
    parser.add_argument(
-        "--onnx_decoder", default=None, type=str, required=True,
-        help="Path to the onnx decoder output.")
+        "--onnx_decoder", default=None, type=str, required=True, help="Path to the onnx decoder output.",
+    )
    parser.add_argument(
-        "--pre-v09-model", action="store_true",
-        help="Use if checkpoints were generated from NeMo < v0.9")
+        "--pre-v09-model", action="store_true", help="Use if checkpoints were generated from NeMo < v0.9",
+    )
    return parser


-def main(config_file, nn_encoder, nn_decoder, nn_onnx_encoder,
-         nn_onnx_decoder, pre_v09_model=False,
-         batch_size=1, time_steps=256):
+def main(
+    config_file,
+    nn_encoder,
+    nn_decoder,
+    nn_onnx_encoder,
+    nn_onnx_decoder,
+    pre_v09_model=False,
+    batch_size=1,
+    time_steps=256,
+):
    yaml = YAML(typ="safe")

    print("Loading config file...")
@@ -42,28 +49,22 @@ def main(config_file, nn_encoder, nn_decoder, nn_onnx_encoder,

    print("Determining model shape...")
    if 'AudioPreprocessing' in jasper_model_definition:
-        num_encoder_input_features = \
-            jasper_model_definition['AudioPreprocessing']['features']
+        num_encoder_input_features = jasper_model_definition['AudioPreprocessing']['features']
    elif 'AudioToMelSpectrogramPreprocessor' in jasper_model_definition:
-        num_encoder_input_features = \
-            jasper_model_definition['AudioToMelSpectrogramPreprocessor'][
-                'features']
+        num_encoder_input_features = jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features']
    else:
        num_encoder_input_features = 64
-    num_decoder_input_features = \
-        jasper_model_definition['JasperEncoder']['jasper'][-1]['filters']
-    print(
-        "  Num encoder input features: {}".format(num_encoder_input_features))
-    print(
-        "  Num decoder input features: {}".format(num_decoder_input_features))
+    num_decoder_input_features = jasper_model_definition['JasperEncoder']['jasper'][-1]['filters']
+    print("  Num encoder input features: {}".format(num_encoder_input_features))
+    print("  Num decoder input features: {}".format(num_decoder_input_features))

    print("Initializing models...")
    jasper_encoder = nemo_asr.JasperEncoder(
-        feat_in=num_encoder_input_features,
-        **jasper_model_definition['JasperEncoder'])
+        feat_in=num_encoder_input_features, **jasper_model_definition['JasperEncoder']
+    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
-        feat_in=num_decoder_input_features,
-        num_classes=len(jasper_model_definition['labels']))
+        feat_in=num_decoder_input_features, num_classes=len(jasper_model_definition['labels']),
+    )

    # This is necessary if you are using checkpoints trained with NeMo
    # version before 0.9
@@ -84,23 +85,29 @@ def main(config_file, nn_encoder, nn_decoder, nn_onnx_encoder,
    nf = nemo.core.NeuralModuleFactory(create_tb_writer=False)

    print("Exporting encoder...")
-    nf.deployment_export(jasper_encoder, nn_onnx_encoder,
-                         nemo.core.neural_factory.DeploymentFormat.ONNX,
-                         torch.zeros(batch_size,
-                                     num_encoder_input_features,
-                                     time_steps,
-                                     dtype=torch.float, device="cuda:0"))
+    nf.deployment_export(
+        jasper_encoder,
+        nn_onnx_encoder,
+        nemo.core.neural_factory.DeploymentFormat.ONNX,
+        torch.zeros(batch_size, num_encoder_input_features, time_steps, dtype=torch.float, device="cuda:0",),
+    )

    print("Exporting decoder...")
-    nf.deployment_export(jasper_decoder, nn_onnx_decoder,
-                         nemo.core.neural_factory.DeploymentFormat.ONNX,
-                         (torch.zeros(batch_size,
-                                      num_decoder_input_features,
-                                      time_steps // 2,
-                                      dtype=torch.float, device="cuda:0")))
+    nf.deployment_export(
+        jasper_decoder,
+        nn_onnx_decoder,
+        nemo.core.neural_factory.DeploymentFormat.ONNX,
+        (torch.zeros(batch_size, num_decoder_input_features, time_steps // 2, dtype=torch.float, device="cuda:0",)),
+    )

    print("Export completed successfully.")


 if __name__ == "__main__":
    args = get_parser().parse_args()
-    main(args.config, args.nn_encoder, args.nn_decoder, args.onnx_encoder,
-         args.onnx_decoder, pre_v09_model=args.pre_v09_model)
+    main(
+        args.config,
+        args.nn_encoder,
+        args.nn_decoder,
+        args.onnx_encoder,
+        args.onnx_decoder,
+        pre_v09_model=args.pre_v09_model,
+    )
diff --git a/scripts/fisher_audio_to_wav.py b/scripts/fisher_audio_to_wav.py
index ac9bf017444e..94507b5451ce 100644
--- a/scripts/fisher_audio_to_wav.py
+++ b/scripts/fisher_audio_to_wav.py
@@ -17,11 +17,11 @@

 parser = argparse.ArgumentParser(description='Convert Fisher .sph to .wav')
 parser.add_argument(
-    "--data_root", default=None, type=str, required=True,
-    help="The path to the root Fisher dataset folder.")
+    "--data_root", default=None, type=str, required=True, help="The path to the root Fisher dataset folder.",
+)
 parser.add_argument(
-    "--dest_root", default=None, type=str, required=True,
-    help="Path to the destination root directory.")
+    "--dest_root", default=None, type=str, required=True, help="Path to the destination root directory.",
+)

 args = parser.parse_args()

@@ -72,21 +72,15 @@ def main():

    print("\n\nConverting audio for Part 1")
    __process_set(
-        os.path.join(
-            data_root, "LDC2004S13-Part1", "fisher_eng_tr_sp_d*",
-            "audio", "*", "*.sph"),
-        os.path.join(
-            dest_root, "LDC2004S13-Part1", "audio_wav")
-    )
+        os.path.join(data_root, "LDC2004S13-Part1", "fisher_eng_tr_sp_d*", "audio", "*", "*.sph",),
+        os.path.join(dest_root, "LDC2004S13-Part1", "audio_wav"),
+    )

    print("\n\nConverting audio for Part 2")
    __process_set(
-        os.path.join(
-            data_root, "LDC2005S13-Part2", "fe_03_p2_sph*",
-            "audio", "*", "*.sph"),
-        os.path.join(
-            dest_root, "LDC2005S13-Part2", "audio_wav")
-    )
+        os.path.join(data_root, "LDC2005S13-Part2", "fe_03_p2_sph*", "audio", "*", "*.sph",),
+        os.path.join(dest_root, "LDC2005S13-Part2", "audio_wav"),
+    )


 if __name__ == '__main__':
diff --git a/scripts/get_aishell_data.py b/scripts/get_aishell_data.py
index 9991d1e73cfd..5e8c00c7cdae 100644
--- a/scripts/get_aishell_data.py
+++ b/scripts/get_aishell_data.py
@@ -3,11 +3,11 @@
 # USAGE: python get_aishell_data.py --data_root=

 import argparse
+import json
 import os
-import urllib.request
-import tarfile
 import subprocess
-import json
+import tarfile
+import urllib.request

 parser = argparse.ArgumentParser(description='Aishell Data download')
 parser.add_argument("--data_root", required=True, default=None, type=str)
@@ -71,9 +71,7 @@ def __process_data(data_folder: str, dst_folder: str):
    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)

-    transcript_file = os.path.join(data_folder,
-                                   'transcript',
-                                   'aishell_transcript_v0.8.txt')
+    transcript_file = os.path.join(data_folder, 'transcript', 'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    with open(transcript_file, 'r', encoding='utf-8') as f:
        for line in f:
@@ -97,17 +95,14 @@ def __process_data(data_folder: str, dst_folder: str):
            text = transcript_dict[audio_id]
            for li in text:
                vocab_count[li] = vocab_count.get(li, 0) + 1
-            duration = subprocess.check_output(
-                'soxi -D {0}'.format(audio_path), shell=True)
+            duration = subprocess.check_output('soxi -D {0}'.format(audio_path), shell=True)
            duration = float(duration)
            json_lines.append(
                json.dumps(
-                    {
-                        'audio_filepath': os.path.abspath(audio_path),
-                        'duration': duration,
-                        'text': text
-                    },
-                    ensure_ascii=False))
+                    {'audio_filepath': os.path.abspath(audio_path), 'duration': duration, 'text': text,},
+                    ensure_ascii=False,
+                )
+            )
    manifest_path = os.path.join(dst_folder, dt + '.json')
    with open(manifest_path, 'w', encoding='utf-8') as fout:
diff --git a/scripts/get_databaker_data.py b/scripts/get_databaker_data.py
index bf2ef8e2695e..bc044f1a0576 100644
--- a/scripts/get_databaker_data.py
+++ b/scripts/get_databaker_data.py
@@ -18,22 +18,18 @@

 import argparse
 import glob
+import json
 import os
+import random
 import urllib.request
-
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
-from pypinyin import lazy_pinyin, Style
+
 import librosa
-import json
-import random
+from pypinyin import Style, lazy_pinyin
 from tqdm import tqdm

-
-URLS = {
-    'DATABAKER_CSMSC':
-    "https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar"
-}
+URLS = {'DATABAKER_CSMSC': "https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar"}


 def __maybe_download_file(destination, source):
@@ -68,14 +64,15 @@ def __extract_rar(rar_path, dest_dir):
    """
    if not os.path.exists(dest_dir):
        if os.system("which unrar > /dev/null") != 0:
-            message = "Please install unrar and run the script again.\n" \
-                      "On Ubuntu/Debian, run: sudo apt-get install unrar -y"
+            message = (
+                "Please install unrar and run the script again.\n"
+                "On Ubuntu/Debian, run: sudo apt-get install unrar -y"
+            )
            print(message)
            exit(1)
        os.makedirs(dest_dir)
        print("Extracting... This might take a few minutes.", flush=True)
-        status = os.system(
-            "unrar x {0} {1} > /dev/null".format(rar_path, dest_dir))
+        status = os.system("unrar x {0} {1} > /dev/null".format(rar_path, dest_dir))
        if status != 0:
            print("Extraction failed.")
            exit(1)
@@ -123,9 +120,7 @@ def __convert_transcript(raw_transcript):
    wavename = waveid + ".wav"
    symbols = ",.!?"
    # For simplicity, we only retain the Chinese chars and symbols
-    trans = ''.join(
-        [_char for _char in __replace_symbols(raw_trans) if __is_chinese(
-            _char) or _char in symbols])
+    trans = ''.join([_char for _char in __replace_symbols(raw_trans) if __is_chinese(_char) or _char in symbols])
    pinyin_trans = []
    for pinyin in lazy_pinyin(trans, style=Style.TONE3):
        if pinyin not in symbols and not pinyin[-1].isdigit():
@@ -142,19 +137,22 @@ def __prepare_databaker_csmsc(data_root, train_size, sr=22050):
    Generate train manifest json and eval manifest json.
    """
    dataset_name = "DATABAKER_CSMSC"
-    copyright_statement = "Chinese Standard Mandarin Speech Copus and its " \
-        "download link is provided by Databaker (Beijing) Technology Co.," \
-        "Ltd. Supports Non-Commercial use only. \nFor more info about this" \
+    copyright_statement = (
+        "Chinese Standard Mandarin Speech Copus and its "
+        "download link is provided by Databaker (Beijing) Technology Co.,"
+        "Ltd. Supports Non-Commercial use only. \nFor more info about this"
        " dataset, visit: https://www.data-baker.com/open_source.html"
+    )
    print(copyright_statement)
-    rar_path = os.path.join(data_root, dataset_name+'.rar')
+    rar_path = os.path.join(data_root, dataset_name + '.rar')
    dataset_dir = os.path.join(data_root, dataset_name)
    __maybe_download_file(rar_path, dataset_name)
    __extract_rar(rar_path, dataset_dir)
    wavedir = os.path.join(dataset_dir, "Wave")
    wavepaths = glob.glob(os.path.join(wavedir, "*.wav"))
-    print("Found {} wav files, converting them to {} HZ sample rate...".format(
-        len(wavepaths), sr), flush=True)
+    print(
+        "Found {} wav files, converting them to {} HZ sample rate...".format(len(wavepaths), sr), flush=True,
+    )
    converted_wavedir = os.path.join(dataset_dir, str(sr))
    if not os.path.exists(converted_wavedir):
        os.mkdir(converted_wavedir)
@@ -163,24 +161,20 @@ def __prepare_databaker_csmsc(data_root, train_size, sr=22050):
    duration_dict = {}
    for wavepath in wavepaths:
        wavename = os.path.basename(wavepath)
-        durations.append(executor.submit(
-            partial(
-                __convert_waves, wavedir, converted_wavedir, wavename, sr)))
+        durations.append(executor.submit(partial(__convert_waves, wavedir, converted_wavedir, wavename, sr)))
    for duration in tqdm(durations):
        wavename, dur = duration.result()
        duration_dict[wavename] = dur
    del durations

    print("Phoneticizing transcripts...", flush=True)
-    transcriptfile = os.path.join(
-        dataset_dir, "ProsodyLabeling", "000001-010000.txt")
+    transcriptfile = os.path.join(dataset_dir, "ProsodyLabeling", "000001-010000.txt")
    with open(transcriptfile, "r", encoding="utf-8") as f:
        all_lines = f.readlines()
    raw_transcripts = all_lines[::2]
    pinyin_transcripts = []
    pinyin_transcripts_dist = {}
    for raw_transcript in raw_transcripts:
-        pinyin_transcripts.append(executor.submit(
-            partial(__convert_transcript, raw_transcript)))
+        pinyin_transcripts.append(executor.submit(partial(__convert_transcript, raw_transcript)))
    for pinyin_transcript in tqdm(pinyin_transcripts):
        wavename, pinyin_trans = pinyin_transcript.result()
        pinyin_transcripts_dist[wavename] = pinyin_trans
@@ -211,21 +205,18 @@ def __prepare_databaker_csmsc(data_root, train_size, sr=22050):
    tr_mani = "databaker_csmsc_train.json"
    ev_mani = "databaker_csmsc_eval.json"
    if len(train_lines) > 0:
-        with open(os.path.join(data_root, tr_mani), "w", encoding="utf-8") \
-                as f:
+        with open(os.path.join(data_root, tr_mani), "w", encoding="utf-8") as f:
            for line in train_lines:
                f.write("%s\n" % line)
    if len(eval_lines) > 0:
"w", encoding="utf-8") \ - as f: + with open(os.path.join(data_root, ev_mani), "w", encoding="utf-8") as f: for line in eval_lines: f.write("%s\n" % line) print("Complete.") def main(): - parser = argparse.ArgumentParser( - description="Prepare Databaker Mandarin TTS Dataset") + parser = argparse.ArgumentParser(description="Prepare Databaker Mandarin TTS Dataset") parser.add_argument("--dataset_name", default='databaker_csmsc', type=str) parser.add_argument("--data_root", required=True, type=str) parser.add_argument("--train_size", type=float, default=0.9) diff --git a/scripts/get_decoder_params_from_bert.py b/scripts/get_decoder_params_from_bert.py index 654ec4d6bb6d..df4cd7c1d2f0 100644 --- a/scripts/get_decoder_params_from_bert.py +++ b/scripts/get_decoder_params_from_bert.py @@ -1,18 +1,16 @@ +import argparse + import torch from transformers import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.file_utils import cached_path -import argparse state_dict_mappings = { 'gamma': 'weight', 'beta': 'bias', 'bert.encoder.layer': 'encoder.layers', - 'bert.embeddings.word_embeddings.weight': 'embedding_layer.word_embedding.' - 'weight', - 'bert.embeddings.position_embeddings.weight': 'embedding_layer.' - 'position_embedding.weight', - 'bert.embeddings.token_type_embeddings.weight': 'embedding_layer.token_' - 'type_embedding.weight', + 'bert.embeddings.word_embeddings.weight': 'embedding_layer.word_embedding.' 'weight', + 'bert.embeddings.position_embeddings.weight': 'embedding_layer.' 'position_embedding.weight', + 'bert.embeddings.token_type_embeddings.weight': 'embedding_layer.token_' 'type_embedding.weight', 'bert.embeddings.LayerNorm.weight': 'embedding_layer.layer_norm.weight', 'bert.embeddings.LayerNorm.bias': 'embedding_layer.layer_norm.bias', 'attention.self.query.weight': 'first_sub_layer.query_net.weight', @@ -30,14 +28,15 @@ 'output.dense.weight': 'second_sub_layer.dense_out.weight', 'output.dense.bias': 'second_sub_layer.dense_out.bias', 'output.LayerNorm.weight': 'second_sub_layer.layer_norm.weight', - 'output.LayerNorm.bias': 'second_sub_layer.layer_norm.bias' + 'output.LayerNorm.bias': 'second_sub_layer.layer_norm.bias', } decoder_keys = [ 'embedding_layer.token_embedding.weight', 'embedding_layer.position_embedding.weight', 'embedding_layer.token_type_embedding.weight', - 'embedding_layer.layer_norm.weight', 'embedding_layer.layer_norm.bias' + 'embedding_layer.layer_norm.weight', + 'embedding_layer.layer_norm.bias', ] parser = argparse.ArgumentParser(description="BERT parameters to decoder") @@ -74,17 +73,12 @@ tmp = key.split(".") cur_layer = int(tmp[2]) if "first" in key: - key_first = ".".join( - ["decoder", "layers", str(cur_layer)] + tmp[3:]) - key_second = ".".join( - ["decoder", "layers", - str(cur_layer), "second_sub_layer"] + tmp[4:]) + key_first = ".".join(["decoder", "layers", str(cur_layer)] + tmp[3:]) + key_second = ".".join(["decoder", "layers", str(cur_layer), "second_sub_layer"] + tmp[4:]) decoder_from_bert[key_first] = bert_keys[i] decoder_from_bert[key_second] = bert_keys[i] elif "second" in key: - key_third = ".".join( - ["decoder", "layers", - str(cur_layer), "third_sub_layer"] + tmp[4:]) + key_third = ".".join(["decoder", "layers", str(cur_layer), "third_sub_layer"] + tmp[4:]) decoder_from_bert[key_third] = bert_keys[i] new_decoder_weights = {} @@ -93,13 +87,11 @@ # Add zeros to make vocab_size divisible by 8 for fast training in # mixed precision -vocab_size, d_model = new_decoder_weights[ - "embedding_layer.token_embedding.weight"].size() +vocab_size, 
+vocab_size, d_model = new_decoder_weights["embedding_layer.token_embedding.weight"].size()
 tokens_to_add = 8 - vocab_size % 8

 zeros = torch.zeros((tokens_to_add, d_model)).to(device="cpu")
-tmp = torch.cat(
-    (new_decoder_weights['embedding_layer.token_embedding.weight'], zeros))
+tmp = torch.cat((new_decoder_weights['embedding_layer.token_embedding.weight'], zeros))
 new_decoder_weights['embedding_layer.token_embedding.weight'] = tmp

 torch.save(new_decoder_weights, args.save_to + args.model_name + "_decoder.pt")
diff --git a/scripts/get_librispeech_data.py b/scripts/get_librispeech_data.py
index d30a660f6832..b0b8448a4239 100755
--- a/scripts/get_librispeech_data.py
+++ b/scripts/get_librispeech_data.py
@@ -7,14 +7,15 @@
 # You can also put more than one data_set comma-separated:
 # --data_set=dev_clean,train_clean_100
 import argparse
-import os
-import urllib.request
-import tarfile
 import fnmatch
-import subprocess
 import json
-from tqdm import tqdm
+import os
+import subprocess
+import tarfile
+import urllib.request
+
 from sox import Transformer
+from tqdm import tqdm

 parser = argparse.ArgumentParser(description='LibriSpeech Data download')
 parser.add_argument("--data_root", required=True, default=None, type=str)
@@ -22,16 +23,13 @@
 args = parser.parse_args()

 URLS = {
-    'TRAIN_CLEAN_100': ("http://www.openslr.org/resources/12/train-clean-100"
-                        ".tar.gz"),
-    'TRAIN_CLEAN_360': ("http://www.openslr.org/resources/12/train-clean-360"
-                        ".tar.gz"),
-    'TRAIN_OTHER_500': ("http://www.openslr.org/resources/12/train-other-500"
-                        ".tar.gz"),
+    'TRAIN_CLEAN_100': ("http://www.openslr.org/resources/12/train-clean-100" ".tar.gz"),
+    'TRAIN_CLEAN_360': ("http://www.openslr.org/resources/12/train-clean-360" ".tar.gz"),
+    'TRAIN_OTHER_500': ("http://www.openslr.org/resources/12/train-other-500" ".tar.gz"),
    'DEV_CLEAN': "http://www.openslr.org/resources/12/dev-clean.tar.gz",
    'DEV_OTHER': "http://www.openslr.org/resources/12/dev-other.tar.gz",
    'TEST_CLEAN': "http://www.openslr.org/resources/12/test-clean.tar.gz",
-    'TEST_OTHER': "http://www.openslr.org/resources/12/test-other.tar.gz"
+    'TEST_OTHER': "http://www.openslr.org/resources/12/test-other.tar.gz",
 }

@@ -91,8 +89,7 @@ def __process_data(data_folder: str, dst_folder: str, manifest_file: str):
    for transcripts_file, root in tqdm(files):
        with open(transcripts_file, encoding="utf-8") as fin:
            for line in fin:
-                id, text = line[:line.index(" ")], line[
-                    line.index(" ") + 1:]
+                id, text = line[: line.index(" ")], line[line.index(" ") + 1 :]
                transcript_text = text.lower().strip()

                # Convert FLAC file to WAV
@@ -101,8 +98,7 @@ def __process_data(data_folder: str, dst_folder: str, manifest_file: str):
                if not os.path.exists(wav_file):
                    Transformer().build(flac_file, wav_file)
                # check duration
-                duration = subprocess.check_output(
-                    "soxi -D {0}".format(wav_file), shell=True)
+                duration = subprocess.check_output("soxi -D {0}".format(wav_file), shell=True)

                entry = dict()
                entry['audio_filepath'] = os.path.abspath(wav_file)
@@ -121,8 +117,7 @@ def main():
    data_sets = args.data_sets

    if data_sets == "ALL":
-        data_sets = "dev_clean,dev_other,train_clean_100,train_clean_360," \
-                    "train_other_500,test_clean,test_other"
+        data_sets = "dev_clean,dev_other,train_clean_100,train_clean_360," "train_other_500,test_clean,test_other"

    for data_set in data_sets.split(','):
        print("\n\nWorking on: {0}".format(data_set))
@@ -132,11 +127,11 @@ def main():
            print("Extracting {0}".format(data_set))
            __extract_file(filepath, data_root)
        print("Processing {0}".format(data_set))
__process_data(os.path.join(os.path.join(data_root, "LibriSpeech"), - data_set.replace("_", "-")), - os.path.join(os.path.join(data_root, "LibriSpeech"), - data_set.replace("_", "-")) + "-processed", - os.path.join(data_root, data_set + ".json")) + __process_data( + os.path.join(os.path.join(data_root, "LibriSpeech"), data_set.replace("_", "-"),), + os.path.join(os.path.join(data_root, "LibriSpeech"), data_set.replace("_", "-"),) + "-processed", + os.path.join(data_root, data_set + ".json"), + ) print('Done!') diff --git a/scripts/get_ljspeech_data.py b/scripts/get_ljspeech_data.py index a6a94766c998..7ca160921a95 100644 --- a/scripts/get_ljspeech_data.py +++ b/scripts/get_ljspeech_data.py @@ -8,6 +8,7 @@ import random import tarfile import urllib.request + from scipy.io.wavfile import read URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2" diff --git a/scripts/get_tatoeba_data.py b/scripts/get_tatoeba_data.py index de25bc69ddeb..e714c3ee43a3 100644 --- a/scripts/get_tatoeba_data.py +++ b/scripts/get_tatoeba_data.py @@ -18,7 +18,6 @@ import re import string import urllib.request - from collections import Counter URL = {'tatoeba': 'https://downloads.tatoeba.org/exports/sentences.csv'} @@ -35,20 +34,18 @@ def __maybe_download_file(destination: str, source: str): source = URL[source] if not os.path.exists(destination): print(f'Downloading {source}') - print(f'Downloading could take a long time ' + - 'To get the data faster consider running in a terminal:\n' + - 'wget https://downloads.tatoeba.org/exports/sentences.csv\n' + - 'grep -P "\teng\t" sentences.csv > eng_sentences.csv\n' + - 'mv eng_sentences.csv sentences.csv\n' + - 'And then rerun this script to preprocess the data.') + print( + f'Downloading could take a long time ' + + 'To get the data faster consider running in a terminal:\n' + + 'wget https://downloads.tatoeba.org/exports/sentences.csv\n' + + 'grep -P "\teng\t" sentences.csv > eng_sentences.csv\n' + + 'mv eng_sentences.csv sentences.csv\n' + + 'And then rerun this script to preprocess the data.' + ) urllib.request.urlretrieve(source, filename=destination) -def __process_english_sentences(in_file, - out_file, - percent_to_cut=0, - num_to_combine=1, - num_samples=-1): +def __process_english_sentences(in_file, out_file, percent_to_cut=0, num_to_combine=1, num_samples=-1): """ Extract English sentences from the Tatoeba dataset. Expected in_file format @@ -86,7 +83,7 @@ def __process_english_sentences(in_file, if percent_to_cut > 0: line = line.split() if random.random() < percent_to_cut: - line = line[:len(line)//2] + line = line[: len(line) // 2] line = ' '.join(line) # combine multiple sentences into a single example @@ -99,15 +96,11 @@ def __process_english_sentences(in_file, samples_count += 1 lines_to_combine.append(line) - if len(lines_to_combine) > 0 and \ - (samples_count < num_samples or num_samples < 0): + if len(lines_to_combine) > 0 and (samples_count < num_samples or num_samples < 0): out_file.write(' '.join(lines_to_combine) + '\n') -def __split_into_train_dev(in_file, - train_file, - dev_file, - percent_dev): +def __split_into_train_dev(in_file, train_file, dev_file, percent_dev): """ Create train and dev split of the dataset. 
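    Assuming each line of in_file holds one example, roughly a percent_dev
    fraction of the lines should end up in dev_file and the rest in
    train_file.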
    Args:
@@ -134,12 +127,10 @@ def remove_punctuation(word):
     that is often a part of word: don't, it's, and so on
     """
     all_punct_marks = string.punctuation.replace("'", '')
-    return re.sub('['+all_punct_marks+']', '', word)
+    return re.sub('[' + all_punct_marks + ']', '', word)
 
 
-def __create_text_and_labels(data_dir,
-                             file,
-                             punct_marks=',.?'):
+def __create_text_and_labels(data_dir, file, punct_marks=',.?'):
     '''
     Create datasets for training and evaluation.
     The data will be split into 2 files: text.txt and labels.txt. \
@@ -190,14 +181,18 @@ def __delete_file(file_to_del):
     parser = argparse.ArgumentParser(description='Prepare tatoeba dataset')
     parser.add_argument("--data_dir", required=True, type=str)
     parser.add_argument("--dataset", default='tatoeba', type=str)
-    parser.add_argument("--num_samples", default=-1, type=int,
-                        help='-1 to use the whole dataset')
-    parser.add_argument("--percent_to_cut", default=0, type=float,
-                        help='Percent of sentences to cut in the middle')
-    parser.add_argument("--num_lines_to_combine", default=1, type=int,
-                        help='Number of lines to combine into single example')
-    parser.add_argument("--percent_dev", default=0.2, type=float,
-                        help='Size of the dev set, float')
+    parser.add_argument(
+        "--num_samples", default=-1, type=int, help='-1 to use the whole dataset',
+    )
+    parser.add_argument(
+        "--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle',
+    )
+    parser.add_argument(
+        "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example',
+    )
+    parser.add_argument(
+        "--percent_dev", default=0.2, type=float, help='Size of the dev set, float',
+    )
     parser.add_argument("--clean_dir", action='store_true')
 
     args = parser.parse_args()
@@ -212,23 +207,16 @@ def __delete_file(file_to_del):
     __maybe_download_file(tatoeba_dataset, args.dataset)
 
     print(f'Processing English sentences...')
-    clean_eng_sentences = os.path.join(args.data_dir,
-                                       'clean_eng_sentences.txt')
-    __process_english_sentences(tatoeba_dataset,
-                                clean_eng_sentences,
-                                args.percent_to_cut,
-                                args.num_lines_to_combine,
-                                args.num_samples)
+    clean_eng_sentences = os.path.join(args.data_dir, 'clean_eng_sentences.txt')
+    __process_english_sentences(
+        tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples,
+    )
 
     train_file = os.path.join(args.data_dir, 'train.txt')
     dev_file = os.path.join(args.data_dir, 'dev.txt')
 
-    print(f'Splitting the {args.dataset} dataset into train and dev sets' +
-          ' and creating labels and text files')
-    __split_into_train_dev(clean_eng_sentences,
-                           train_file,
-                           dev_file,
-                           args.percent_dev)
+    print(f'Splitting the {args.dataset} dataset into train and dev sets' + ' and creating labels and text files')
+    __split_into_train_dev(clean_eng_sentences, train_file, dev_file, args.percent_dev)
 
     print(f'Creating text and label files for training')
     __create_text_and_labels(args.data_dir, 'train.txt')
diff --git a/scripts/get_timit_data.py b/scripts/get_timit_data.py
index 13f0001395eb..361710d8aaeb 100644
--- a/scripts/get_timit_data.py
+++ b/scripts/get_timit_data.py
@@ -3,15 +3,16 @@
 # --data_new_root=
 
 import argparse
-import os
-import urllib.request
-import tarfile
 import fnmatch
-import subprocess
 import json
-from tqdm import tqdm
-from sox import Transformer
-import sys
+import os
+import subprocess
+import sys
+import tarfile
+import urllib.request
+
+from sox import Transformer
+from tqdm import tqdm
 
 #
 # This script proposes to create the *json manifest necessary to use the
TIMIT @@ -30,17 +31,69 @@ # PHN_DICT = { - "aa": "aa", "ae": "ae", "ah": "ah", "ao": "aa", "aw": "aw", "ax": "ah", - "ax-h": "ah", "axr": "er", "ay": "ay", "b": "b", "bcl" : "sil", - "ch": "ch", "d": "d", "dcl" : "sil", "dh": "dh", "dx": "dx", "eh": "eh", - "el": "l", "em": "m", "en": "n", "eng": "ng", "epi": "sil", "er": "er", - "ey": "ey", "f": "f", "g": "g", "gcl": "sil", "h#": "sil", "hh": "hh", - "hv": "hh", "ih": "ih", "ix": "ih", "iy": "iy", "jh": "jh", "k": "k", - "kcl": "sil", "l": "l", "m" : "m", "n": "n", "ng": "ng", "nx": "n", - "ow": "ow", "oy": "oy", "p": "p", "pau": "sil", "pcl": "sil", "q": "", - "r" : "r", "s": "s", "sh": "sh", "t": "t", "tcl": "sil", "th": "th", - "uh": "uh", "uw": "uw", "ux": "uw", "v": "v", "w": "w", "y": "y", - "z" : "z", "zh": "sh", "h#": ""} + "aa": "aa", + "ae": "ae", + "ah": "ah", + "ao": "aa", + "aw": "aw", + "ax": "ah", + "ax-h": "ah", + "axr": "er", + "ay": "ay", + "b": "b", + "bcl": "sil", + "ch": "ch", + "d": "d", + "dcl": "sil", + "dh": "dh", + "dx": "dx", + "eh": "eh", + "el": "l", + "em": "m", + "en": "n", + "eng": "ng", + "epi": "sil", + "er": "er", + "ey": "ey", + "f": "f", + "g": "g", + "gcl": "sil", + "h#": "sil", + "hh": "hh", + "hv": "hh", + "ih": "ih", + "ix": "ih", + "iy": "iy", + "jh": "jh", + "k": "k", + "kcl": "sil", + "l": "l", + "m": "m", + "n": "n", + "ng": "ng", + "nx": "n", + "ow": "ow", + "oy": "oy", + "p": "p", + "pau": "sil", + "pcl": "sil", + "q": "", + "r": "r", + "s": "s", + "sh": "sh", + "t": "t", + "tcl": "sil", + "th": "th", + "uh": "uh", + "uw": "uw", + "ux": "uw", + "v": "v", + "w": "w", + "y": "y", + "z": "z", + "zh": "sh", + "h#": "", +} # # DEFINE STANDARD SPEAKERS LISTS @@ -48,20 +101,84 @@ # DEV_LIST = { - 'faks0', 'fdac1', 'fjem0', 'mgwt0', 'mjar0', 'mmdb1', 'mmdm2', - 'mpdf0', 'fcmh0', 'fkms0', 'mbdg0', 'mbwm0', 'mcsh0', 'fadg0', - 'fdms0', 'fedw0', 'mgjf0', 'mglb0', 'mrtk0', 'mtaa0', 'mtdt0', - 'mthc0', 'mwjg0', 'fnmr0', 'frew0', 'fsem0', 'mbns0', 'mmjr0', - 'mdls0', 'mdlf0', 'mdvc0', 'mers0', 'fmah0', 'fdrw0', 'mrcs0', - 'mrjm4', 'fcal1', 'mmwh0', 'fjsj0', 'majc0', 'mjsw0', 'mreb0', - 'fgjd0', 'fjmg0', 'mroa0', 'mteb0', 'mjfc0', 'mrjr0', 'fmml0', - 'mrws1'} + 'faks0', + 'fdac1', + 'fjem0', + 'mgwt0', + 'mjar0', + 'mmdb1', + 'mmdm2', + 'mpdf0', + 'fcmh0', + 'fkms0', + 'mbdg0', + 'mbwm0', + 'mcsh0', + 'fadg0', + 'fdms0', + 'fedw0', + 'mgjf0', + 'mglb0', + 'mrtk0', + 'mtaa0', + 'mtdt0', + 'mthc0', + 'mwjg0', + 'fnmr0', + 'frew0', + 'fsem0', + 'mbns0', + 'mmjr0', + 'mdls0', + 'mdlf0', + 'mdvc0', + 'mers0', + 'fmah0', + 'fdrw0', + 'mrcs0', + 'mrjm4', + 'fcal1', + 'mmwh0', + 'fjsj0', + 'majc0', + 'mjsw0', + 'mreb0', + 'fgjd0', + 'fjmg0', + 'mroa0', + 'mteb0', + 'mjfc0', + 'mrjr0', + 'fmml0', + 'mrws1', +} TEST_LIST = { - 'mdab0', 'mwbt0', 'felc0', 'mtas1', 'mwew0', 'fpas0', 'mjmp0', - 'mlnt0', 'fpkt0', 'mlll0', 'mtls0', 'fjlm0', 'mbpm0', 'mklt0', - 'fnlp0', 'mcmj0', 'mjdh0', 'fmgd0', 'mgrt0', 'mnjm0', 'fdhc0', - 'mjln0', 'mpam0', 'fmld0'} + 'mdab0', + 'mwbt0', + 'felc0', + 'mtas1', + 'mwew0', + 'fpas0', + 'mjmp0', + 'mlnt0', + 'fpkt0', + 'mlll0', + 'mtls0', + 'fjlm0', + 'mbpm0', + 'mklt0', + 'fnlp0', + 'mcmj0', + 'mjdh0', + 'fmgd0', + 'mgrt0', + 'mnjm0', + 'fdhc0', + 'mjln0', + 'mpam0', + 'fmld0', +} parser = argparse.ArgumentParser(description='TIMIT data processing') parser.add_argument("--data_root", required=True, default=None, type=str) @@ -106,7 +223,7 @@ def __process_data(data_folder: str, dst_folder: str): for data_set in ['train', 'dev', 'test']: - print("Processing: "+data_set) + print("Processing: " 
+ data_set) entries = [] if data_set == 'train': files = files_train @@ -127,8 +244,7 @@ def __process_data(data_folder: str, dst_folder: str): phn_transcript += mapped + " " wav_file = transcripts_file.split(".")[0] + ".WAV" - duration = subprocess.check_output( - "soxi -D {0}".format(wav_file), shell=True) + duration = subprocess.check_output("soxi -D {0}".format(wav_file), shell=True) entry = dict() entry['audio_filepath'] = os.path.abspath(wav_file) entry['duration'] = float(duration) diff --git a/scripts/process_aishell2_data.py b/scripts/process_aishell2_data.py index 92425f350b64..3cab5b55950a 100644 --- a/scripts/process_aishell2_data.py +++ b/scripts/process_aishell2_data.py @@ -4,17 +4,18 @@ # --audio_folder= # --dest_folder= import argparse -import os -import sys import json +import os import subprocess +import sys + parser = argparse.ArgumentParser(description="Processing Aishell2 Data") parser.add_argument( - "--audio_folder", default=None, type=str, required=True, - help="Audio (wav) data directory.") + "--audio_folder", default=None, type=str, required=True, help="Audio (wav) data directory.", +) parser.add_argument( - "--dest_folder", default=None, type=str, required=True, - help="Destination directory.") + "--dest_folder", default=None, type=str, required=True, help="Destination directory.", +) args = parser.parse_args() @@ -31,11 +32,10 @@ def __process_data(data_folder: str, dst_folder: str): os.makedirs(dst_folder) data_type = ['dev', 'test', 'train'] for data in data_type: - dst_file = os.path.join(dst_folder, data+".json") + dst_file = os.path.join(dst_folder, data + ".json") uttrances = [] wav_dir = os.path.join(data_folder, "wav", data) - transcript_file = os.path.join( - data_folder, "transcript", data, "trans.txt") + transcript_file = os.path.join(data_folder, "transcript", data, "trans.txt") trans_text = {} with open(transcript_file, "r", encoding='utf-8') as f: for line in f: @@ -48,18 +48,17 @@ def __process_data(data_folder: str, dst_folder: str): for wavs in os.listdir(cur_dir): audio_id = wavs.strip(".wav") audio_filepath = os.path.abspath(os.path.join(cur_dir, wavs)) - duration = subprocess.check_output( - 'soxi -D {0}'.format(audio_filepath), shell=True) + duration = subprocess.check_output('soxi -D {0}'.format(audio_filepath), shell=True) duration = float(duration) text = trans_text[audio_id] uttrances.append( - json.dumps({ - "audio_filepath": audio_filepath, - "duration": duration, - "text": text}, ensure_ascii=False)) + json.dumps( + {"audio_filepath": audio_filepath, "duration": duration, "text": text,}, ensure_ascii=False, + ) + ) with open(dst_file, "w") as f: for line in uttrances: - f.write(line+"\n") + f.write(line + "\n") def __get_vocab(data_folder: str, des_dir: str): @@ -87,7 +86,7 @@ def __get_vocab(data_folder: str, des_dir: str): vocab = os.path.join(des_dir, "vocab.txt") vocab = open(vocab, "w", encoding='utf-8') for k in vocab_dict: - vocab.write(k[0]+"\n") + vocab.write(k[0] + "\n") vocab.close() diff --git a/scripts/process_an4_data.py b/scripts/process_an4_data.py index 0b74f8b5af64..69e67b7ae69a 100644 --- a/scripts/process_an4_data.py +++ b/scripts/process_an4_data.py @@ -14,14 +14,14 @@ import argparse import glob import json -import librosa import os import subprocess import tarfile + +import librosa import wget -parser = argparse.ArgumentParser( - description="AN4 dataset download and processing") +parser = argparse.ArgumentParser(description="AN4 dataset download and processing") parser.add_argument("--data_root", required=True, 
default=None, type=str)
 args = parser.parse_args()
 
@@ -32,15 +32,14 @@ def build_manifest(data_root, transcripts_path, manifest_path, wav_path):
         for line in fin:
             # Lines look like this:
             # <s> transcript </s> (fileID)
-            transcript = line[: line.find('(')-1].lower()
+            transcript = line[: line.find('(') - 1].lower()
             transcript = transcript.replace('<s>', '').replace('</s>', '')
             transcript = transcript.strip()
 
-            file_id = line[line.find('(')+1: -2]  # e.g. "cen4-fash-b"
+            file_id = line[line.find('(') + 1 : -2]  # e.g. "cen4-fash-b"
             audio_path = os.path.join(
-                data_root, wav_path,
-                file_id[file_id.find('-')+1: file_id.rfind('-')],
-                file_id + '.wav')
+                data_root, wav_path, file_id[file_id.find('-') + 1 : file_id.rfind('-')], file_id + '.wav',
+            )
 
             duration = librosa.core.get_duration(filename=audio_path)
 
@@ -48,7 +47,7 @@ def main():
             metadata = {
                 "audio_filepath": audio_path,
                 "duration": duration,
-                "text": transcript
+                "text": transcript,
             }
             json.dump(metadata, fout)
             fout.write('\n')
@@ -59,9 +58,7 @@ def main():
 
     # Convert from .sph to .wav
     print("Converting audio files to .wav...")
-    sph_list = glob.glob(
-        os.path.join(data_root, 'an4/**/*.sph'),
-        recursive=True)
+    sph_list = glob.glob(os.path.join(data_root, 'an4/**/*.sph'), recursive=True)
     for sph_path in sph_list:
         wav_path = sph_path[:-4] + '.wav'
         cmd = ['sox', sph_path, wav_path]
@@ -70,22 +67,16 @@ def main():
 
     # Build manifests
     print("Building training manifest...")
-    train_transcripts = os.path.join(
-        data_root, 'an4/etc/an4_train.transcription')
-    train_manifest = os.path.join(
-        data_root, 'an4/train_manifest.json')
-    train_wavs = os.path.join(
-        data_root, 'an4/wav/an4_clstk')
+    train_transcripts = os.path.join(data_root, 'an4/etc/an4_train.transcription')
+    train_manifest = os.path.join(data_root, 'an4/train_manifest.json')
+    train_wavs = os.path.join(data_root, 'an4/wav/an4_clstk')
     build_manifest(data_root, train_transcripts, train_manifest, train_wavs)
     print("Training manifests created.")
 
     print("Building test manifest...")
-    test_transcripts = os.path.join(
-        data_root, 'an4/etc/an4_test.transcription')
-    test_manifest = os.path.join(
-        data_root, 'an4/test_manifest.json')
-    test_wavs = os.path.join(
-        data_root, 'an4/wav/an4test_clstk')
+    test_transcripts = os.path.join(data_root, 'an4/etc/an4_test.transcription')
+    test_manifest = os.path.join(data_root, 'an4/test_manifest.json')
+    test_wavs = os.path.join(data_root, 'an4/wav/an4test_clstk')
     build_manifest(data_root, test_transcripts, test_manifest, test_wavs)
     print("Test manifest created.")
diff --git a/scripts/process_fisher_data.py b/scripts/process_fisher_data.py
index 4693c13bd6b1..a0d782ee6e83 100644
--- a/scripts/process_fisher_data.py
+++ b/scripts/process_fisher_data.py
@@ -20,36 +20,40 @@
 import json
 import os
 import re
+from math import ceil, floor
 
-from math import floor, ceil
 import numpy as np
 import scipy.io.wavfile as wavfile
 from tqdm import tqdm
 
 parser = argparse.ArgumentParser(description="Fisher Data Processing")
 parser.add_argument(
-    "--audio_root", default=None, type=str, required=True,
-    help="The path to the root of the audio (wav) data folder.")
+    "--audio_root", default=None, type=str, required=True, help="The path to the root of the audio (wav) data folder.",
+)
 parser.add_argument(
-    "--transcript_root", default=None, type=str, required=True,
-    help="The path to the root of the transcript data folder.")
+    "--transcript_root",
+    default=None,
+    type=str,
+    required=True,
+    help="The path to the root of the transcript data folder.",
+)
 parser.add_argument(
-    "--dest_root", default=None, type=str, required=True,
-    help="Path to the destination root directory.")
+    "--dest_root", default=None, type=str, required=True, help="Path to the destination root directory.",
+)
 
 # Optional arguments
 parser.add_argument(
-    "--min_slice_duration", default=10.0, type=float,
-    help="Minimum audio slice duration after processing.")
+    "--min_slice_duration", default=10.0, type=float, help="Minimum audio slice duration after processing.",
+)
 parser.add_argument(
-    "--keep_low_conf", action="store_true",
-    help="Keep all utterances with low confidence transcripts")
+    "--keep_low_conf", action="store_true", help="Keep all utterances with low confidence transcripts",
+)
 parser.add_argument(
-    "--remove_noises", action="store_true",
-    help="Removes transcribed noises such as [laughter].")
+    "--remove_noises", action="store_true", help="Removes transcribed noises such as [laughter].",
+)
 parser.add_argument(
-    "--noises_to_emoji", action="store_true",
-    help="Converts transcripts for noises to an emoji character.")
+    "--noises_to_emoji", action="store_true", help="Converts transcripts for noises to an emoji character.",
+)
 args = parser.parse_args()
 
 # Total number of files before segmenting, and train/val/test splits
@@ -62,9 +66,9 @@
     "fe_03_00265-B-3353-3381": "correct",
     "fe_03_00991-B-52739-52829": "that's one of those",
     "fe_03_10282-A-34442-34484.wav": "they don't want",
-    "fe_03_10677-B-10104-10641": "uh my mine yeah the german shepherd " +
-                                 "pitbull mix he snores almost as loud " +
-                                 "as i do",
+    "fe_03_10677-B-10104-10641": "uh my mine yeah the german shepherd "
+    + "pitbull mix he snores almost as loud "
+    + "as i do",
     "fe_03_00027-B-39380-39405": None,
     "fe_03_11487-B-3109-23406": None,
     "fe_03_01326-A-30742-30793": None,
@@ -85,7 +89,7 @@
     "dc3s": "d c threes",
     "book 2": "book two",
     "s2b": "s two b",
-    "3d": "three d"
+    "3d": "three d",
 }
 
 TAG_MAP = {
@@ -99,13 +103,11 @@
     "[lipsmack]": "😕",
     "[[skip]]": "",
     "[pause]": "",
-    "[sneeze]": "😕"
+    "[sneeze]": "😕",
 }
 
 
-def __write_sample(
-        dest, file_id, count, file_count, sample_rate,
-        audio, duration, transcript):
+def __write_sample(dest, file_id, count, file_count, sample_rate, audio, duration, transcript):
     """
     Writes one slice to the given target directory.
     Args:
@@ -128,7 +130,7 @@ def __write_sample(
     transcript = {
         "audio_filepath": audio_path,
         "duration": duration,
-        "text": transcript
+        "text": transcript,
     }
 
     # Append to manifest
@@ -140,22 +142,23 @@ def __normalize(utt):
     replace_table = str.maketrans(dict.fromkeys('()*;:"!&{},.-?'))
-    utt = utt.lower() \
-        .replace('[uh]', 'uh') \
-        .replace('[um]', 'um') \
-        .replace('<noise>', '[noise]') \
-        .replace('<spoken_noise>', '[vocalized-noise]') \
-        .replace('.period', 'period') \
-        .replace('.dot', 'dot') \
-        .replace('-hyphen', 'hyphen') \
-        .replace('._', ' ') \
-        .translate(replace_table)
+    utt = (
+        utt.lower()
+        .replace('[uh]', 'uh')
+        .replace('[um]', 'um')
+        .replace('<noise>', '[noise]')
+        .replace('<spoken_noise>', '[vocalized-noise]')
+        .replace('.period', 'period')
+        .replace('.dot', 'dot')
+        .replace('-hyphen', 'hyphen')
+        .replace('._', ' ')
+        .translate(replace_table)
+    )
     utt = re.sub(r"'([a-z]+)'", r'\1', utt)  # Unquote quoted words
     return utt
 
 
-def __process_utterance(file_id, trans_path, line,
-                        keep_low_conf, rem_noises, emojify):
+def __process_utterance(file_id, trans_path, line, keep_low_conf, rem_noises, emojify):
     """
     Processes one utterance (one line of a transcript).
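    A transcript line is assumed to look like '16.94 19.21 A: yeah', i.e.
    start and end times, the channel, then the spoken words; that layout is
    what the field parsing below relies on.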
Args: @@ -167,9 +170,7 @@ def __process_utterance(file_id, trans_path, line, emojify: whether to convert noise symbols to emoji, lower precedence """ # Check for lines to skip (comments, empty, low confidence) - if line.startswith('#') \ - or not line.strip() \ - or (not keep_low_conf and '((' in line): + if line.startswith('#') or not line.strip() or (not keep_low_conf and '((' in line): return None, None, None, None # Data and sanity checks @@ -188,7 +189,7 @@ def __process_utterance(file_id, trans_path, line, return None, None, None, None # Replacements as necessary - line_id = '-'.join([file_id, channel[0], str(t_start*10), str(t_end*10)]) + line_id = '-'.join([file_id, channel[0], str(t_start * 10), str(t_end * 10)]) content = TRANSCRIPT_BUGS.get(line_id, ' '.join(line[3:])) @@ -211,9 +212,17 @@ def __process_utterance(file_id, trans_path, line, def __process_one_file( - trans_path, sample_rate, audio_data, file_id, - dst_root, min_slice_duration, file_count, - keep_low_conf, rem_noises, emojify): + trans_path, + sample_rate, + audio_data, + file_id, + dst_root, + min_slice_duration, + file_count, + keep_low_conf, + rem_noises, + emojify, +): """ Creates one block of audio slices and their corresponding transcripts. Args: @@ -240,8 +249,8 @@ def __process_one_file( for line in fin: t_start, t_end, idx, content = __process_utterance( - file_id, trans_path, line, - keep_low_conf, rem_noises, emojify) + file_id, trans_path, line, keep_low_conf, rem_noises, emojify + ) if content is None or not content: continue @@ -251,9 +260,8 @@ def __process_one_file( # Append utterance to buffer transcript_buffers[idx] += content audio_buffers[idx].append( - audio_data[floor(t_start*sample_rate): - ceil(t_end*sample_rate), - idx]) + audio_data[floor(t_start * sample_rate) : ceil(t_end * sample_rate), idx,] + ) buffer_durations[idx] += duration if buffer_durations[idx] < min_slice_duration: @@ -269,7 +277,7 @@ def __process_one_file( sample_rate, np.concatenate(audio_buffers[idx], axis=0), buffer_durations[idx], - transcript_buffers[idx] + transcript_buffers[idx], ) # Clear buffers @@ -290,8 +298,9 @@ def __partition_name(file_count): return "train" -def __process_data(audio_root, transcript_root, dst_root, min_slice_duration, - file_count, keep_low_conf, rem_noises, emojify): +def __process_data( + audio_root, transcript_root, dst_root, min_slice_duration, file_count, keep_low_conf, rem_noises, emojify, +): """ Converts Fisher wav files to numpy arrays, segments audio and transcripts. Args: @@ -308,8 +317,7 @@ def __process_data(audio_root, transcript_root, dst_root, min_slice_duration, 1. There is exactly one transcripts directory in data_folder 2. 
Audio files are all: /audio-wav/fe_03_xxxxx.wav """ - transcript_list = glob.glob(os.path.join( - transcript_root, "fe_03_p*_tran*", "data", "trans", "*", "*.txt")) + transcript_list = glob.glob(os.path.join(transcript_root, "fe_03_p*_tran*", "data", "trans", "*", "*.txt")) print("Found {} transcripts.".format(len(transcript_list))) count = file_count @@ -317,15 +325,23 @@ def __process_data(audio_root, transcript_root, dst_root, min_slice_duration, # Grab audio file associated with each transcript, and slice for trans_path in tqdm(transcript_list, desc="Matching and segmenting"): file_id, _ = os.path.splitext(os.path.basename(trans_path)) - audio_path = os.path.join(audio_root, "audio_wav", file_id+".wav") + audio_path = os.path.join(audio_root, "audio_wav", file_id + ".wav") sample_rate, audio_data = wavfile.read(audio_path) # Create a set of segments (a block) for each file __process_one_file( - trans_path, sample_rate, audio_data, file_id, - dst_root, min_slice_duration, count, keep_low_conf, - rem_noises, emojify) + trans_path, + sample_rate, + audio_data, + file_id, + dst_root, + min_slice_duration, + count, + keep_low_conf, + rem_noises, + emojify, + ) count += 1 return count @@ -363,14 +379,15 @@ def main(): for data_set in ['LDC2004S13-Part1', 'LDC2005S13-Part2']: print(f"\n\nWorking on dataset: {data_set}") file_count = __process_data( - os.path.join(audio_root, data_set), - os.path.join(transcript_root, data_set), - dest_root, - min_slice_duration, - file_count, - keep_low_conf, - rem_noises, - emojify) + os.path.join(audio_root, data_set), + os.path.join(transcript_root, data_set), + dest_root, + min_slice_duration, + file_count, + keep_low_conf, + rem_noises, + emojify, + ) print(f"Total file count so far: {file_count}") diff --git a/scripts/process_hub5_data.py b/scripts/process_hub5_data.py index 31bf7c849354..04096b8b01fe 100644 --- a/scripts/process_hub5_data.py +++ b/scripts/process_hub5_data.py @@ -19,55 +19,47 @@ import re import subprocess import sys - from collections import namedtuple -from math import floor, ceil -import numpy as np +from math import ceil, floor from operator import attrgetter + +import numpy as np import scipy.io.wavfile as wavfile from tqdm import tqdm -parser = argparse.ArgumentParser( - description="Prepare HUB5 data for training/eval") +parser = argparse.ArgumentParser(description="Prepare HUB5 data for training/eval") parser.add_argument( - "--data_root", default=None, type=str, required=True, - help="The path to the root LDC HUB5 dataset directory.") + "--data_root", default=None, type=str, required=True, help="The path to the root LDC HUB5 dataset directory.", +) parser.add_argument( - "--dest_root", default=None, type=str, required=True, - help="Path to the destination root directory for processed files.") + "--dest_root", + default=None, + type=str, + required=True, + help="Path to the destination root directory for processed files.", +) # Optional arguments parser.add_argument( - "--min_slice_duration", default=10.0, type=float, - help="Minimum audio slice duration after processing.") + "--min_slice_duration", default=10.0, type=float, help="Minimum audio slice duration after processing.", +) args = parser.parse_args() StmUtterance = namedtuple( - 'StmUtterance', - ['filename', 'channel', 'speaker_id', 'begin', 'end', - 'label', 'transcript'] -) -STM_LINE_FMT = re.compile( - r"^(\w+)\s+(\w+)\s+(\w+)\s+([0-9.]+)\s+([0-9.]+)\s+(<.*>)?\s+(.+)$" + 'StmUtterance', ['filename', 'channel', 'speaker_id', 'begin', 'end', 'label', 'transcript',], ) 
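# As a rough illustration (an assumed sample, not taken from the corpus), an
# STM reference line such as
#     en_4156 A en_4156_A 301.85 302.48 <O,en,F,en-F> oh yeah
# is matched by STM_LINE_FMT below, whose seven groups line up with the
# StmUtterance fields: filename, channel, speaker_id, begin, end, label,
# transcript.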
+STM_LINE_FMT = re.compile(r"^(\w+)\s+(\w+)\s+(\w+)\s+([0-9.]+)\s+([0-9.]+)\s+(<.*>)?\s+(.+)$") # Transcription errors and their fixes -TRANSCRIPT_BUGS = { - "en_4622-B-12079-12187": "KIND OF WEIRD BUT" -} +TRANSCRIPT_BUGS = {"en_4622-B-12079-12187": "KIND OF WEIRD BUT"} def get_utt_id(segment): """ Gives utterance IDs in a form like: en_4156-a-36558-37113 """ - return "{}-{}-{}-{}".format( - segment.filename, - segment.channel, - int(segment.begin * 100), - int(segment.end * 100) - ) + return "{}-{}-{}-{}".format(segment.filename, segment.channel, int(segment.begin * 100), int(segment.end * 100),) def convert_utterances(sph_path, wav_path): @@ -97,10 +89,7 @@ def process_transcripts(dataset_root): """ Reads in transcripts for each audio segment and processes them. """ - stm_path = os.path.join(dataset_root, - "2000_hub5_eng_eval_tr", - "reference", - "hub5e00.english.000405.stm") + stm_path = os.path.join(dataset_root, "2000_hub5_eng_eval_tr", "reference", "hub5e00.english.000405.stm",) results = [] chars = set() @@ -138,8 +127,7 @@ def process_transcripts(dataset_root): return results, chars -def write_one_segment( - dest_root, speaker_id, count, audio, sr, duration, transcript): +def write_one_segment(dest_root, speaker_id, count, audio, sr, duration, transcript): """ Writes out one segment of audio, and writes its corresponding transcript in the manifest. @@ -153,8 +141,7 @@ def write_one_segment( duration: duration of the audio transcript: the corresponding transcript """ - audio_path = os.path.join( - dest_root, "audio", f"{speaker_id}_{count:03}.wav") + audio_path = os.path.join(dest_root, "audio", f"{speaker_id}_{count:03}.wav") manifest_path = os.path.join(dest_root, "manifest_hub5.json") @@ -165,7 +152,7 @@ def write_one_segment( transcript = { "audio_filepath": audio_path, "duration": duration, - "text": transcript + "text": transcript, } with open(manifest_path, 'a') as f: json.dump(transcript, f) @@ -199,10 +186,7 @@ def segment_audio(info_list, dest_root, min_slice_duration): prev_id = info.speaker_id id_count = 0 - sample_rate, audio_data = wavfile.read( - os.path.join( - dest_root, 'full_audio_wav', info.filename + '.wav') - ) + sample_rate, audio_data = wavfile.read(os.path.join(dest_root, 'full_audio_wav', info.filename + '.wav')) transcript_buffer = '' audio_buffer = [] buffer_duration = 0.0 @@ -211,11 +195,9 @@ def segment_audio(info_list, dest_root, min_slice_duration): transcript_buffer += info.transcript channel = 0 if info.channel.lower() == 'a' else 1 audio_buffer.append( - audio_data[floor(info.begin * sample_rate): - ceil(info.end * sample_rate), - channel] + audio_data[floor(info.begin * sample_rate) : ceil(info.end * sample_rate), channel,] ) - buffer_duration += (info.end - info.begin) + buffer_duration += info.end - info.begin if buffer_duration < min_slice_duration: transcript_buffer += ' ' @@ -229,7 +211,7 @@ def segment_audio(info_list, dest_root, min_slice_duration): np.concatenate(audio_buffer, axis=0), sample_rate, buffer_duration, - transcript_buffer + transcript_buffer, ) transcript_buffer = '' diff --git a/setup.py b/setup.py index eba2055f8396..0df8f441450b 100644 --- a/setup.py +++ b/setup.py @@ -19,12 +19,27 @@ """Setup for pip package.""" import codecs +import distutils.cmd +import distutils.log import os +import subprocess import sys +from itertools import chain import setuptools -from itertools import chain +from nemo.package_info import ( + __contact_emails__, + __contact_names__, + __description__, + __download_url__, + __homepage__, + 
__keywords__, + __license__, + __package_name__, + __repository_url__, + __version__, +) # pep8: disable=E402 @@ -43,24 +58,13 @@ def is_build_action(): if is_build_action(): os.environ['NEMO_PACKAGE_BUILDING'] = 'True' -from nemo.package_info import __contact_emails__ -from nemo.package_info import __contact_names__ -from nemo.package_info import __description__ -from nemo.package_info import __download_url__ -from nemo.package_info import __homepage__ -from nemo.package_info import __keywords__ -from nemo.package_info import __license__ -from nemo.package_info import __package_name__ -from nemo.package_info import __repository_url__ -from nemo.package_info import __version__ # pep8: enable=E402 if os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( - os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), - 'r', 'utf-8' + os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', 'utf-8', ).read() long_description_content_type = "text/x-rst" @@ -73,9 +77,10 @@ def is_build_action(): long_description = 'See ' + __homepage__ -################################################################################ -# Dependency Loading # -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # +############################################################################### +# Dependency Loading # +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # + def req_file(filename, folder="requirements"): with open(os.path.join(folder, filename)) as f: @@ -91,7 +96,6 @@ def req_file(filename, folder="requirements"): # User packages 'docker': req_file("requirements_docker.txt"), 'test': req_file("requirements_test.txt"), - # Collections Packages 'asr': req_file("requirements_asr.txt"), 'nlp': req_file("requirements_nlp.txt"), @@ -102,38 +106,112 @@ def req_file(filename, folder="requirements"): extras_require['all'] = list(chain(extras_require.values())) # TTS depends on ASR -extras_require['tts'] = list(chain([ - extras_require['tts'], - extras_require['asr'] -])) +extras_require['tts'] = list(chain([extras_require['tts'], extras_require['asr']])) tests_requirements = extras_require["test"] -################################################################################ +############################################################################### +# Code style checkers # +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # + +LINE_WIDTH = 119 +# Options should be compatible with black. 
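# isort's multi-line mode 3 ('vertical hanging indent') together with trailing
# commas and parentheses is the wrapping style that black itself produces, so
# running the two tools in either order should leave imports unchanged. The
# same settings could also live in a config file; a sketch, assuming a
# setup.cfg at the repository root:
#
#     [isort]
#     line_length = 119
#     multi_line_output = 3
#     include_trailing_comma = true
#     force_grid_wrap = 0
#     use_parentheses = true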
+ISORT_BASE = ( + 'isort', + '--multi-line=3', + '--trailing-comma', + '--force-grid-wrap=0', + '--use-parentheses', + f'--line-width={LINE_WIDTH}', + '-rc', + '.', +) +BLACK_BASE = ( + 'black', + '--skip-string-normalization', + f'--line-length={LINE_WIDTH}', + '.', +) + + +class Checker(distutils.cmd.Command): + def _call_checker(self, base_command, check=True): + command = list(base_command) + + if check: + command.extend(['--check', '--diff']) + + self.announce( + msg='Running command: %s' % str(' '.join(command)), level=distutils.log.INFO, + ) + + return_code = subprocess.call(command) + + return return_code + + def _pass(self): + self.announce(msg='\033[32mPASS\x1b[0m', level=distutils.log.INFO) + + def _fail(self): + self.announce(msg='\033[31mFAIL\x1b[0m', level=distutils.log.INFO) + + +class CheckStyleCommand(Checker): + description = 'checks overall project code style' + user_options = [] + + def initialize_options(self): + pass + + def run(self): + isort_return = self._call_checker(ISORT_BASE) + black_return = self._call_checker(BLACK_BASE) + + if isort_return == 0 and black_return == 0: + self._pass() + else: + self._fail() + exit(isort_return if isort_return != 0 else black_return) + + def finalize_options(self): + pass + + +class FixStyleCommand(Checker): + description = 'fix overall project code style in-place' + user_options = [] + + def initialize_options(self): + pass + + def run(self): + self._call_checker(ISORT_BASE, check=False) + self._call_checker(BLACK_BASE, check=False) + + def finalize_options(self): + pass + + +############################################################################### setuptools.setup( name=__package_name__, - # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html version=__version__, description=__description__, long_description=long_description, - # The project's main homepage. 
url=__repository_url__, download_url=__download_url__, - # Author details author=__contact_names__, author_email=__contact_emails__, - # maintainer Details maintainer=__contact_names__, maintainer_email=__contact_emails__, - # The licence under which the project is released license=__license__, classifiers=[ @@ -146,12 +224,10 @@ def req_file(filename, folder="requirements"): # 6 - Mature # 7 - Inactive 'Development Status :: 4 - Beta', - # Indicate who your project is intended for 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Intended Audience :: Information Technology', - # Indicate what your project relates to 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', @@ -160,17 +236,14 @@ def req_file(filename, folder="requirements"): 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Utilities', - # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: Apache Software License', - # Supported python versions 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', - # Additional Setting 'Environment :: Console', 'Natural Language :: English', @@ -180,17 +253,16 @@ def req_file(filename, folder="requirements"): install_requires=install_requires, setup_requires=['pytest-runner'], tests_require=tests_requirements, - # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # $ pip install -e ".[all]" # $ pip install nemo_toolkit[all] extras_require=extras_require, - # Add in any packaged data. include_package_data=True, zip_safe=False, - # PyPI package information. keywords=__keywords__, + # Custom commands. 
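    # With this mapping registered, the two checkers above become setup.py
    # subcommands (assuming isort and black are installed), e.g.:
    #     python setup.py check_style   # report violations as a diff; non-zero exit on failure
    #     python setup.py fix_style     # rewrite the offending files in place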
+ cmdclass={'check_style': CheckStyleCommand, 'fix_style': FixStyleCommand}, ) diff --git a/tests/common_setup.py b/tests/common_setup.py index b8f1cfa269c0..6d353ee04a6e 100644 --- a/tests/common_setup.py +++ b/tests/common_setup.py @@ -17,10 +17,10 @@ # ============================================================================= import unittest + import nemo class NeMoUnitTest(unittest.TestCase): - def setUp(self) -> None: nemo.core.neural_factory.NeuralModuleFactory.reset_default_factory() diff --git a/tests/other/jasper.py b/tests/other/jasper.py index 9afbc82e2348..2f8d67e12bb6 100644 --- a/tests/other/jasper.py +++ b/tests/other/jasper.py @@ -1,7 +1,4 @@ # Copyright (c) 2019 NVIDIA Corporation -from nemo.backends.pytorch.asr.helpers import monitor_asr_train_progress, \ - process_evaluation_batch, process_evaluation_epoch -import nemo import argparse import os import sys @@ -9,8 +6,14 @@ import toml from tensorboardX import SummaryWriter -sys.path.insert(0, os.path.abspath( - os.path.join(os.path.dirname(__file__), '../..'))) +import nemo +from nemo.backends.pytorch.asr.helpers import ( + monitor_asr_train_progress, + process_evaluation_batch, + process_evaluation_epoch, +) + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) parser = argparse.ArgumentParser(description='Jasper') parser.add_argument("--local_rank", default=None, type=int) @@ -41,15 +44,12 @@ def construct_name(name, lr, batch_size, num_gpus, num_epochs, wd): - return "{0}-lr_{1}-bs_{2}x{3}-e_{4}-wd_{5}-OPT-{6}".format(name, lr, - batch_size, - num_gpus, - num_epochs, wd, - opt_level) + return "{0}-lr_{1}-bs_{2}x{3}-e_{4}-wd_{5}-OPT-{6}".format( + name, lr, batch_size, num_gpus, num_epochs, wd, opt_level + ) -name = construct_name('Jasper10x5', lr, batch_size, num_gpus, num_epochs, - weight_decay) +name = construct_name('Jasper10x5', lr, batch_size, num_gpus, num_epochs, weight_decay) tb_writer = SummaryWriter(name) if args.local_rank is not None: @@ -60,33 +60,33 @@ def construct_name(name, lr, batch_size, num_gpus, num_epochs, wd): # instantiate Neural Factory with supported backend neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=opt_level, - placement=device) + backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=opt_level, placement=device, +) jasper_model_definition = toml.load("../../examples/nemo_asr/jasper10x5.toml") jasper_model_definition['placement'] = device labels = jasper_model_definition['labels']['labels'] train_manifest = "/mnt/D1/Data/librispeech/librivox-train-all.json" -#train_manifest = args.train_manifest +# train_manifest = args.train_manifest val_manifest1 = "/mnt/D1/Data/librispeech/librivox-dev-clean.json" # val_manifest2 = "/mnt/D1/Data/librispeech/librivox-dev-other.json" -#val_manifest1 = args.val_manifest1 +# val_manifest1 = args.val_manifest1 featurizer_config = jasper_model_definition['input'] -data_layer = neural_factory.get_module(name="AudioToTextDataLayer", - params={ - "featurizer_config": featurizer_config, - "manifest_filepath": train_manifest, - "labels": labels, - "batch_size": batch_size, - "placement": device, - "max_duration": 16.7 - }, - collection="nemo_asr") +data_layer = neural_factory.get_module( + name="AudioToTextDataLayer", + params={ + "featurizer_config": featurizer_config, + "manifest_filepath": train_manifest, + "labels": labels, + "batch_size": batch_size, + "placement": device, + "max_duration": 16.7, + }, + 
collection="nemo_asr", +) N = len(data_layer) print('-----------------') print('Have {0} examples to train on.'.format(N)) @@ -94,87 +94,73 @@ def construct_name(name, lr, batch_size, num_gpus, num_epochs, wd): step_per_epoch = int(N / (batch_size * num_gpus)) data_preprocessor = neural_factory.get_module( - name="AudioToMelSpectrogramPreprocessor", - collection="nemo_asr", - params=featurizer_config) - -data_layer_eval1 = neural_factory.get_module(name="AudioToTextDataLayer", - params={ - "featurizer_config": featurizer_config, - "manifest_filepath": val_manifest1, - "labels": labels, - "batch_size": 8, - "placement": device, - }, - collection="nemo_asr") -jasper_encoder = neural_factory.get_module(name="JasperEncoder", - params=jasper_model_definition, - collection="nemo_asr") -jasper_decoder = neural_factory.get_module(name="JasperDecoderForCTC", - params={ - "feat_in": 1024, - "num_classes": len(labels), - "placement": device - }, - collection="nemo_asr") - -ctc_loss = neural_factory.get_module(name="CTCLossNM", - params={ - "num_classes": len(labels), - "placement": device - }, - collection="nemo_asr") - -greedy_decoder = neural_factory.get_module(name="GreedyCTCDecoder", - params={"placement": device}, - collection="nemo_asr") + name="AudioToMelSpectrogramPreprocessor", collection="nemo_asr", params=featurizer_config, +) + +data_layer_eval1 = neural_factory.get_module( + name="AudioToTextDataLayer", + params={ + "featurizer_config": featurizer_config, + "manifest_filepath": val_manifest1, + "labels": labels, + "batch_size": 8, + "placement": device, + }, + collection="nemo_asr", +) +jasper_encoder = neural_factory.get_module(name="JasperEncoder", params=jasper_model_definition, collection="nemo_asr") +jasper_decoder = neural_factory.get_module( + name="JasperDecoderForCTC", + params={"feat_in": 1024, "num_classes": len(labels), "placement": device}, + collection="nemo_asr", +) + +ctc_loss = neural_factory.get_module( + name="CTCLossNM", params={"num_classes": len(labels), "placement": device}, collection="nemo_asr", +) + +greedy_decoder = neural_factory.get_module( + name="GreedyCTCDecoder", params={"placement": device}, collection="nemo_asr", +) # Train DAG audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t = data_layer() -processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, - length=a_sig_length_t) -encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, - length=p_length_t) +processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t) +encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t) log_probs_t = jasper_decoder(encoder_output=encoded_t) predictions_t = greedy_decoder(log_probs=log_probs_t) -loss_t = ctc_loss(log_probs=log_probs_t, - targets=transcript_t, - input_length=encoded_len_t, - target_length=transcript_len_t) +loss_t = ctc_loss( + log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t, +) # Eval DAG1 -audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = data_layer_eval1() -processed_signal_e1, p_length_e1 = data_preprocessor( - input_signal=audio_signal_e1, - length=a_sig_length_e1) -encoded_e1, encoded_len_e1 = jasper_encoder(audio_signal=processed_signal_e1, - length=p_length_e1) +(audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1,) = data_layer_eval1() +processed_signal_e1, p_length_e1 = data_preprocessor(input_signal=audio_signal_e1, 
length=a_sig_length_e1) +encoded_e1, encoded_len_e1 = jasper_encoder(audio_signal=processed_signal_e1, length=p_length_e1) log_probs_e1 = jasper_decoder(encoder_output=encoded_e1) predictions_e1 = greedy_decoder(log_probs=log_probs_e1) -loss_e1 = ctc_loss(log_probs=log_probs_e1, - targets=transcript_e1, - input_length=encoded_len_e1, - target_length=transcript_len_e1) +loss_e1 = ctc_loss( + log_probs=log_probs_e1, targets=transcript_e1, input_length=encoded_len_e1, target_length=transcript_len_e1, +) print('\n\n\n================================') -print("Total number of parameters: {0}".format( - jasper_decoder.num_weights + jasper_encoder.num_weights)) +print("Total number of parameters: {0}".format(jasper_decoder.num_weights + jasper_encoder.num_weights)) print('================================') # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( tensor_list2str=lambda x: str(x[0].item()), tb_writer=tb_writer, - tensor_list2str_evl=lambda x: monitor_asr_train_progress(x, labels=labels)) + tensor_list2str_evl=lambda x: monitor_asr_train_progress(x, labels=labels), +) eval_callback1 = nemo.core.EvaluatorCallback( eval_tensors=[loss_e1, predictions_e1, transcript_e1, transcript_len_e1], - user_iter_callback=lambda x, y: process_evaluation_batch( - x, y, labels=labels), - user_epochs_done_callback=lambda x: process_evaluation_epoch(x, - tag="DEV-CLEAN"), + user_iter_callback=lambda x, y: process_evaluation_batch(x, y, labels=labels), + user_epochs_done_callback=lambda x: process_evaluation_epoch(x, tag="DEV-CLEAN"), eval_step=500, - tb_writer=tb_writer) + tb_writer=tb_writer, +) def lr_policy(initial_lr, step, N): @@ -183,12 +169,14 @@ def lr_policy(initial_lr, step, N): optimizer = neural_factory.get_trainer( - params={"optimizer_kind": "novograd", - "optimization_params": {"num_epochs": num_epochs, "lr": lr, - "weight_decay": weight_decay}}) -optimizer.train(tensors_to_optimize=[loss_t], - callbacks=[train_callback, eval_callback1], - tensors_to_evaluate=[predictions_t, transcript_t, - transcript_len_t], - lr_policy=lambda lr, s, e: lr_policy(lr, s, - num_epochs * step_per_epoch)) + params={ + "optimizer_kind": "novograd", + "optimization_params": {"num_epochs": num_epochs, "lr": lr, "weight_decay": weight_decay,}, + } +) +optimizer.train( + tensors_to_optimize=[loss_t], + callbacks=[train_callback, eval_callback1], + tensors_to_evaluate=[predictions_t, transcript_t, transcript_len_t], + lr_policy=lambda lr, s, e: lr_policy(lr, s, num_epochs * step_per_epoch), +) diff --git a/tests/other/jasper_zero_dl.py b/tests/other/jasper_zero_dl.py index 51fa862b2165..37c395015daa 100644 --- a/tests/other/jasper_zero_dl.py +++ b/tests/other/jasper_zero_dl.py @@ -1,7 +1,4 @@ # Copyright (c) 2019 NVIDIA Corporation -from nemo.backends.pytorch.asr.helpers import monitor_asr_train_progress -from nemo.core.neural_types import * -import nemo import argparse import os import sys @@ -10,8 +7,11 @@ import torch from tensorboardX import SummaryWriter -sys.path.insert(0, os.path.abspath( - os.path.join(os.path.dirname(__file__), '../..'))) +import nemo +from nemo.backends.pytorch.asr.helpers import monitor_asr_train_progress +from nemo.core.neural_types import * + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) parser = argparse.ArgumentParser(description='Jasper') parser.add_argument("--local_rank", default=None, type=int) @@ -42,15 +42,12 @@ def construct_name(name, lr, batch_size, num_gpus, num_epochs, wd): - 
return "{0}-lr_{1}-bs_{2}x{3}-e_{4}-wd_{5}-OPT-{6}".format(name, lr, - batch_size, - num_gpus, - num_epochs, wd, - opt_level) + return "{0}-lr_{1}-bs_{2}x{3}-e_{4}-wd_{5}-OPT-{6}".format( + name, lr, batch_size, num_gpus, num_epochs, wd, opt_level + ) -name = construct_name('ZeroDS-Jasper10x5', lr, batch_size, num_gpus, num_epochs, - weight_decay) +name = construct_name('ZeroDS-Jasper10x5', lr, batch_size, num_gpus, num_epochs, weight_decay) tb_writer = SummaryWriter(name) @@ -62,10 +59,8 @@ def construct_name(name, lr, batch_size, num_gpus, num_epochs, wd): # instantiate Neural Factory with supported backend neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, - local_rank=args.local_rank, - optimization_level=opt_level, - placement=device) + backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=opt_level, placement=device, +) jasper_model_definition = toml.load("../../examples/nemo_asr/jasper10x5.toml") jasper_model_definition['placement'] = device @@ -76,76 +71,61 @@ def construct_name(name, lr, batch_size, num_gpus, num_epochs, wd): featurizer_config = jasper_model_definition['input'] data_preprocessor = neural_factory.get_module( - name="AudioToMelSpectrogramPreprocessor", - collection="nemo_asr", - params=featurizer_config) + name="AudioToMelSpectrogramPreprocessor", collection="nemo_asr", params=featurizer_config, +) N = 288000 time = 256 -dl = nemo.backends.pytorch.ZerosDataLayer(size=N, dtype=torch.FloatTensor, - batch_size=batch_size, - output_ports={ - "processed_signal": NeuralType( - {0: AxisType(BatchTag), - 1: AxisType(ChannelTag, dim=64), - 2: AxisType(TimeTag, dim=time)}), - - "processed_length": NeuralType( - {0: AxisType(BatchTag)}), - - "transcript": NeuralType( - {0: AxisType(BatchTag), - 1: AxisType(TimeTag, dim=time)}), - - "transcript_length": NeuralType( - {0: AxisType(BatchTag)}) - }) +dl = nemo.backends.pytorch.ZerosDataLayer( + size=N, + dtype=torch.FloatTensor, + batch_size=batch_size, + output_ports={ + "processed_signal": NeuralType( + {0: AxisType(BatchTag), 1: AxisType(ChannelTag, dim=64), 2: AxisType(TimeTag, dim=time),} + ), + "processed_length": NeuralType({0: AxisType(BatchTag)}), + "transcript": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag, dim=time)}), + "transcript_length": NeuralType({0: AxisType(BatchTag)}), + }, +) print('-----------------') print('Have {0} examples to train on.'.format(N)) print('-----------------') step_per_epoch = int(N / (batch_size * num_gpus)) -jasper_encoder = neural_factory.get_module(name="JasperEncoder", - params=jasper_model_definition, - collection="nemo_asr") -jasper_decoder = neural_factory.get_module(name="JasperDecoderForCTC", - params={ - "feat_in": 1024, - "num_classes": len(labels), - "placement": device - }, - collection="nemo_asr") - -ctc_loss = neural_factory.get_module(name="CTCLossNM", - params={ - "num_classes": len(labels), - "placement": device - }, - collection="nemo_asr") - -greedy_decoder = neural_factory.get_module(name="GreedyCTCDecoder", - params={"placement": device}, - collection="nemo_asr") +jasper_encoder = neural_factory.get_module(name="JasperEncoder", params=jasper_model_definition, collection="nemo_asr") +jasper_decoder = neural_factory.get_module( + name="JasperDecoderForCTC", + params={"feat_in": 1024, "num_classes": len(labels), "placement": device}, + collection="nemo_asr", +) + +ctc_loss = neural_factory.get_module( + name="CTCLossNM", params={"num_classes": len(labels), "placement": device}, collection="nemo_asr", 
+) + +greedy_decoder = neural_factory.get_module( + name="GreedyCTCDecoder", params={"placement": device}, collection="nemo_asr", +) # Train DAG processed_signal_t, p_length_t, transcript_t, transcript_len_t = dl() -encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, - length=p_length_t) +encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t) log_probs_t = jasper_decoder(encoder_output=encoded_t) predictions_t = greedy_decoder(log_probs=log_probs_t) -loss_t = ctc_loss(log_probs=log_probs_t, - targets=transcript_t, - input_length=encoded_len_t, - target_length=transcript_len_t) +loss_t = ctc_loss( + log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t, +) print('\n\n\n================================') -print("Total number of parameters: {0}".format( - jasper_decoder.num_weights + jasper_encoder.num_weights)) +print("Total number of parameters: {0}".format(jasper_decoder.num_weights + jasper_encoder.num_weights)) print('================================') # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( tensor_list2str=lambda x: str(x[0].item()), tb_writer=tb_writer, - tensor_list2str_evl=lambda x: monitor_asr_train_progress(x, labels=labels)) + tensor_list2str_evl=lambda x: monitor_asr_train_progress(x, labels=labels), +) def lr_policy(initial_lr, step, N): @@ -154,12 +134,14 @@ def lr_policy(initial_lr, step, N): optimizer = neural_factory.get_trainer( - params={"optimizer_kind": "novograd", - "optimization_params": {"num_epochs": num_epochs, "lr": lr, - "weight_decay": weight_decay}}) -optimizer.train(tensors_to_optimize=[loss_t], - callbacks=[train_callback], - tensors_to_evaluate=[predictions_t, transcript_t, - transcript_len_t], - lr_policy=lambda lr, s, e: lr_policy(lr, s, - num_epochs * step_per_epoch)) + params={ + "optimizer_kind": "novograd", + "optimization_params": {"num_epochs": num_epochs, "lr": lr, "weight_decay": weight_decay,}, + } +) +optimizer.train( + tensors_to_optimize=[loss_t], + callbacks=[train_callback], + tensors_to_evaluate=[predictions_t, transcript_t, transcript_len_t], + lr_policy=lambda lr, s, e: lr_policy(lr, s, num_epochs * step_per_epoch), +) diff --git a/tests/test_actions_api.py b/tests/test_actions_api.py index c0c0112a6673..3f0d5b568499 100644 --- a/tests/test_actions_api.py +++ b/tests/test_actions_api.py @@ -19,11 +19,11 @@ import os import nemo + from .common_setup import NeMoUnitTest class TestTrainers(NeMoUnitTest): - def test_checkpointing(self): path = 'optimizer.pt' optimizer = nemo.backends.pytorch.actions.PtActions() @@ -38,30 +38,25 @@ def test_checkpointing(self): def test_multi_optimizer(self): path = 'optimizer.pt' - module = nemo.backends.pytorch.common.SequenceEmbedding( - voc_size=8, hidden_size=16) + module = nemo.backends.pytorch.common.SequenceEmbedding(voc_size=8, hidden_size=16) optimizer = nemo.backends.pytorch.actions.PtActions() - optimizer.create_optimizer("sgd", module, optimizer_params={"lr": 1.}) - optimizer.create_optimizer("sgd", [module], - optimizer_params={"lr": 2.}) - optimizer.create_optimizer("novograd", [module], - optimizer_params={"lr": 3.}) - optimizer.create_optimizer("adam", [module], - optimizer_params={"lr": 4.}) - optimizer.create_optimizer("adam_w", [module], - optimizer_params={"lr": 5.}) + optimizer.create_optimizer("sgd", module, optimizer_params={"lr": 1.0}) + optimizer.create_optimizer("sgd", [module], optimizer_params={"lr": 
2.0}) + optimizer.create_optimizer("novograd", [module], optimizer_params={"lr": 3.0}) + optimizer.create_optimizer("adam", [module], optimizer_params={"lr": 4.0}) + optimizer.create_optimizer("adam_w", [module], optimizer_params={"lr": 5.0}) self.assertEqual(len(optimizer.optimizers), 5) optimizer.save_state_to(path) optimizer.step = 123 optimizer.epoch_num = 324 for i, opt in enumerate(optimizer.optimizers): for param_group in opt.param_groups: - self.assertEqual(param_group['lr'], float(i+1)) + self.assertEqual(param_group['lr'], float(i + 1)) param_group['lr'] = i optimizer.restore_state_from(path) for i, opt in enumerate(optimizer.optimizers): for param_group in opt.param_groups: - self.assertEqual(param_group['lr'], float(i+1)) + self.assertEqual(param_group['lr'], float(i + 1)) self.assertEqual(optimizer.step, 0) self.assertEqual(optimizer.epoch_num, 0) self.assertEqual(len(optimizer.optimizers), 5) diff --git a/tests/test_asr.py b/tests/test_asr.py index 3242701f5f10..7e067f718618 100644 --- a/tests/test_asr.py +++ b/tests/test_asr.py @@ -4,8 +4,15 @@ import tarfile import unittest +from ruamel.yaml import YAML + import nemo +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.parts import AudioDataset, WaveformFeaturizer, collections, parsers from nemo.core import DeviceType + +from .common_setup import NeMoUnitTest + # ! /usr/bin/python # -*- coding: utf-8 -*- @@ -24,14 +31,6 @@ # limitations under the License. # ============================================================================= -import nemo.collections.asr as nemo_asr -from nemo.collections.asr.parts import AudioDataset, WaveformFeaturizer -from nemo.collections.asr.parts import collections -from nemo.collections.asr.parts import parsers - -from ruamel.yaml import YAML - -from .common_setup import NeMoUnitTest freq = 16000 @@ -141,8 +140,7 @@ def test_transcript_normalizers(self): "ten dollars ten point nine zero one eight hundred zero zero", "eighteen billion one thousand two thousand and twenty", # Two line string below - "one ten thousand one hundred one thousand ten thousand one " - "hundred thousand one million", + "one ten thousand one hundred one thousand ten thousand one " "hundred thousand one million", "i loveeee aaa a ccnntts", "''", "it only costs one million dollars cheap right", @@ -166,25 +164,16 @@ def remove_test_json(): with open(manifest_paths, "w") as f: for s in test_strings: - f.write( - '{"audio_filepath": "", "duration": 1.0, "text": ' - f'"{s}"}}\n' - ) + f.write('{"audio_filepath": "", "duration": 1.0, "text": ' f'"{s}"}}\n') parser = parsers.make_parser(self.labels, 'en') - manifest = collections.ASRAudioText( - manifests_files=[manifest_paths], parser=parser, - ) + manifest = collections.ASRAudioText(manifests_files=[manifest_paths], parser=parser,) for i, s in enumerate(normalized_strings): self.assertTrue(manifest[i].text_tokens == parser(s)) def test_pytorch_audio_dataset(self): featurizer = WaveformFeaturizer.from_config(self.featurizer_config) - ds = AudioDataset( - manifest_filepath=self.manifest_filepath, - labels=self.labels, - featurizer=featurizer, - ) + ds = AudioDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer,) for i in range(len(ds)): if i == 5: @@ -218,29 +207,20 @@ def test_dataloader(self): def test_preprocessor_errors(self): def create_broken_preprocessor_1(): - nemo_asr.AudioToMelSpectrogramPreprocessor( - window_size=2, n_window_size=2 - ) + nemo_asr.AudioToMelSpectrogramPreprocessor(window_size=2, 
+            nemo_asr.AudioToMelSpectrogramPreprocessor(window_size=2, n_window_size=2)
 
         def create_broken_preprocessor_2():
-            nemo_asr.AudioToMelSpectrogramPreprocessor(
-                window_stride=2, n_window_stride=2
-            )
+            nemo_asr.AudioToMelSpectrogramPreprocessor(window_stride=2, n_window_stride=2)
 
         def create_broken_preprocessor_3():
             nemo_asr.AudioToMelSpectrogramPreprocessor(n_window_stride=2)
 
         def create_good_preprocessor_1():
-            nemo_asr.AudioToMelSpectrogramPreprocessor(
-                window_size=0.02, window_stride=0.01
-            )
+            nemo_asr.AudioToMelSpectrogramPreprocessor(window_size=0.02, window_stride=0.01)
 
         def create_good_preprocessor_2():
             nemo_asr.AudioToMelSpectrogramPreprocessor(
-                window_size=None,
-                window_stride=None,
-                n_window_size=256,
-                n_window_stride=32,
+                window_size=None, window_stride=None, n_window_size=256, n_window_stride=32,
             )
 
         self.assertRaises(ValueError, create_broken_preprocessor_1)
@@ -252,26 +232,18 @@ def create_good_preprocessor_2():
     def test_kaldi_dataloader(self):
         batch_size = 4
         dl = nemo_asr.KaldiFeatureDataLayer(
-            kaldi_dir='tests/data/asr/kaldi_an4/',
-            labels=self.labels,
-            batch_size=batch_size,
+            kaldi_dir='tests/data/asr/kaldi_an4/', labels=self.labels, batch_size=batch_size,
         )
         for data in dl.data_iterator:
             self.assertTrue(data[0].size(0) == batch_size)
 
         dl_test_min = nemo_asr.KaldiFeatureDataLayer(
-            kaldi_dir='tests/data/asr/kaldi_an4/',
-            labels=self.labels,
-            batch_size=batch_size,
-            min_duration=1.0,
+            kaldi_dir='tests/data/asr/kaldi_an4/', labels=self.labels, batch_size=batch_size, min_duration=1.0,
         )
         self.assertTrue(len(dl_test_min) == 18)
 
         dl_test_max = nemo_asr.KaldiFeatureDataLayer(
-            kaldi_dir='tests/data/asr/kaldi_an4/',
-            labels=self.labels,
-            batch_size=batch_size,
-            max_duration=5.0,
+            kaldi_dir='tests/data/asr/kaldi_an4/', labels=self.labels, batch_size=batch_size, max_duration=5.0,
         )
         self.assertTrue(len(dl_test_max) == 19)
@@ -296,9 +268,7 @@ def test_trim_silence(self):
             drop_last=True,
             shuffle=False,
         )
-        for norm, trim in zip(
-            normal_dl.data_iterator, trimmed_dl.data_iterator
-        ):
+        for norm, trim in zip(normal_dl.data_iterator, trimmed_dl.data_iterator):
             for point in range(batch_size):
                 self.assertTrue(norm[1][point].data >= trim[1][point].data)
@@ -320,16 +290,12 @@ def test_audio_preprocessors(self):
         except ModuleNotFoundError:
             installed_torchaudio = False
             with self.assertRaises(ModuleNotFoundError):
-                to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(
-                    n_fft=400, window=None
-                )
+                to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
             with self.assertRaises(ModuleNotFoundError):
                 to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
 
         if installed_torchaudio:
-            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(
-                n_fft=400, window=None
-            )
+            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
             to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
 
         to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)
@@ -373,54 +339,35 @@ def test_jasper_training(self):
             'normalize': 'per_feature',
             'window_stride': 0.01,
         }
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params
-        )
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor'
-            ]['features'],
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **jasper_model_definition['JasperEncoder'],
         )
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024, num_classes=len(self.labels)
-        )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
 
         # DAG
         audio_signal, a_sig_length, transcript, transcript_len = dl()
-        processed_signal, p_length = preprocessing(
-            input_signal=audio_signal, length=a_sig_length
-        )
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
 
-        encoded, encoded_len = jasper_encoder(
-            audio_signal=processed_signal, length=p_length
-        )
+        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
         # print(jasper_encoder)
         log_probs = jasper_decoder(encoder_output=encoded)
         loss = ctc_loss(
-            log_probs=log_probs,
-            targets=transcript,
-            input_length=encoded_len,
-            target_length=transcript_len,
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
         )
         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
         )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
         )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [loss],
-            callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 10, "lr": 0.0003},
+            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
         )
 
     def test_double_jasper_training(self):
@@ -444,70 +391,44 @@ def test_double_jasper_training(self):
             'normalize': 'per_feature',
             'window_stride': 0.01,
         }
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params
-        )
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         jasper_encoder1 = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor'
-            ]['features'],
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **jasper_model_definition['JasperEncoder'],
         )
         jasper_encoder2 = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor'
-            ]['features'],
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **jasper_model_definition['JasperEncoder'],
         )
         mx_max1 = nemo.backends.pytorch.common.SimpleCombiner(mode="max")
         mx_max2 = nemo.backends.pytorch.common.SimpleCombiner(mode="max")
-        jasper_decoder1 = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024, num_classes=len(self.labels)
-        )
-        jasper_decoder2 = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024, num_classes=len(self.labels)
-        )
+        jasper_decoder1 = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
+        jasper_decoder2 = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
 
         # DAG
         audio_signal, a_sig_length, transcript, transcript_len = dl()
-        processed_signal, p_length = preprocessing(
-            input_signal=audio_signal, length=a_sig_length
-        )
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
 
-        encoded1, encoded_len1 = jasper_encoder1(
-            audio_signal=processed_signal, length=p_length
-        )
-        encoded2, encoded_len2 = jasper_encoder2(
-            audio_signal=processed_signal, length=p_length
-        )
+        encoded1, encoded_len1 = jasper_encoder1(audio_signal=processed_signal, length=p_length)
+        encoded2, encoded_len2 = jasper_encoder2(audio_signal=processed_signal, length=p_length)
         log_probs1 = jasper_decoder1(encoder_output=encoded1)
         log_probs2 = jasper_decoder2(encoder_output=encoded2)
         log_probs = mx_max1(x1=log_probs1, x2=log_probs2)
         encoded_len = mx_max2(x1=encoded_len1, x2=encoded_len2)
         loss = ctc_loss(
-            log_probs=log_probs,
-            targets=transcript,
-            input_length=encoded_len,
-            target_length=transcript_len,
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )
-        callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss], print_func=lambda x: print(str(x[0].item()))
-        )
+        callback = nemo.core.SimpleLossLoggerCallback(tensors=[loss], print_func=lambda x: print(str(x[0].item())))
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
        )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [loss],
-            callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 10, "lr": 0.0003},
+            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
         )
 
     def test_quartznet_training(self):
@@ -532,53 +453,34 @@
             'normalize': 'per_feature',
             'window_stride': 0.01,
         }
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params
-        )
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=quartz_model_definition[
-                'AudioToMelSpectrogramPreprocessor'
-            ]['features'],
+            feat_in=quartz_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **quartz_model_definition['JasperEncoder'],
         )
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024, num_classes=len(self.labels)
-        )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
 
         # DAG
         audio_signal, a_sig_length, transcript, transcript_len = dl()
-        processed_signal, p_length = preprocessing(
-            input_signal=audio_signal, length=a_sig_length
-        )
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
 
-        encoded, encoded_len = jasper_encoder(
-            audio_signal=processed_signal, length=p_length
-        )
+        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
         log_probs = jasper_decoder(encoder_output=encoded)
         loss = ctc_loss(
-            log_probs=log_probs,
-            targets=transcript,
-            input_length=encoded_len,
-            target_length=transcript_len,
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
         )
         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
         )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
         )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [loss],
-            callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 10, "lr": 0.0003},
+            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
        )
 
     def test_stft_conv(self):
@@ -603,54 +505,34 @@
             'window_stride': 0.01,
             'stft_conv': True,
         }
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params
-        )
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor'
-            ]['features'],
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **jasper_model_definition['JasperEncoder'],
         )
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024, num_classes=len(self.labels)
-        )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
 
         # DAG
         audio_signal, a_sig_length, transcript, transcript_len = dl()
-        processed_signal, p_length = preprocessing(
-            input_signal=audio_signal, length=a_sig_length
-        )
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
 
-        encoded, encoded_len = jasper_encoder(
-            audio_signal=processed_signal, length=p_length
-        )
+        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
         # print(jasper_encoder)
         log_probs = jasper_decoder(encoder_output=encoded)
         loss = ctc_loss(
-            log_probs=log_probs,
-            targets=transcript,
-            input_length=encoded_len,
-            target_length=transcript_len,
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )
-        callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss], print_func=lambda x: print(str(x[0].item()))
-        )
+        callback = nemo.core.SimpleLossLoggerCallback(tensors=[loss], print_func=lambda x: print(str(x[0].item())))
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
         )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [loss],
-            callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 10, "lr": 0.0003},
+            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
        )
 
     def test_clas(self):
@@ -675,17 +557,14 @@
             'window_stride': 0.01,
             'stft_conv': True,
         }
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params
-        )
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         encoder = nemo_asr.JasperEncoder(
             jasper=cfg['encoder']['jasper'],
             activation=cfg['encoder']['activation'],
             feat_in=cfg['input']['train']['features'],
         )
         connector = nemo_asr.JasperRNNConnector(
-            in_channels=cfg['encoder']['jasper'][-1]['filters'],
-            out_channels=cfg['decoder']['hidden_size'],
+            in_channels=cfg['encoder']['jasper'][-1]['filters'], out_channels=cfg['decoder']['hidden_size'],
         )
         decoder = nemo.backends.pytorch.common.DecoderRNN(
             voc_size=len(self.labels), bos_id=0, **cfg['decoder']  # fictive
@@ -694,32 +573,21 @@
 
         # DAG
         audio_signal, a_sig_length, transcripts, transcript_len = dl()
-        processed_signal, p_length = preprocessing(
-            input_signal=audio_signal, length=a_sig_length
-        )
-        encoded, encoded_len = encoder(
-            audio_signal=processed_signal, length=p_length
-        )
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
+        encoded, encoded_len = encoder(audio_signal=processed_signal, length=p_length)
         encoded = connector(tensor=encoded)
         log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
         loss = loss(log_probs=log_probs, targets=transcripts)
 
         # Train
-        callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss], print_func=lambda x: print(str(x[0].item()))
-        )
+        callback = nemo.core.SimpleLossLoggerCallback(tensors=[loss], print_func=lambda x: print(str(x[0].item())))
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
         )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [loss],
-            callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 10, "lr": 0.0003},
+            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
         )
 
     def test_jasper_eval(self):
@@ -743,36 +611,23 @@
             'normalize': 'per_feature',
             'window_stride': 0.01,
         }
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params
-        )
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor'
-            ]['features'],
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **jasper_model_definition['JasperEncoder'],
         )
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024, num_classes=len(self.labels)
-        )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
         greedy_decoder = nemo_asr.GreedyCTCDecoder()
 
         # DAG
         audio_signal, a_sig_length, transcript, transcript_len = dl()
-        processed_signal, p_length = preprocessing(
-            input_signal=audio_signal, length=a_sig_length
-        )
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
 
-        encoded, encoded_len = jasper_encoder(
-            audio_signal=processed_signal, length=p_length
-        )
+        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
         # print(jasper_encoder)
         log_probs = jasper_decoder(encoder_output=encoded)
         loss = ctc_loss(
-            log_probs=log_probs,
-            targets=transcript,
-            input_length=encoded_len,
-            target_length=transcript_len,
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
         )
         predictions = greedy_decoder(log_probs=log_probs)
@@ -783,16 +638,12 @@
 
         eval_callback = nemo.core.EvaluatorCallback(
             eval_tensors=[loss, predictions, transcript, transcript_len],
-            user_iter_callback=lambda x, y: process_evaluation_batch(
-                x, y, labels=self.labels
-            ),
+            user_iter_callback=lambda x, y: process_evaluation_batch(x, y, labels=self.labels),
             user_epochs_done_callback=process_evaluation_epoch,
         )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
         )
         neural_factory.eval(callbacks=[eval_callback])
diff --git a/tests/test_bert.py b/tests/test_bert.py
index fe1afbff91c5..00ab74917f28 100644
--- a/tests/test_bert.py
+++ b/tests/test_bert.py
@@ -17,6 +17,7 @@
 # =============================================================================
 
 import nemo.collections.nlp as nemo_nlp
+
 from .common_setup import NeMoUnitTest
diff --git a/tests/test_deploy_export.py b/tests/test_deploy_export.py
index 0d89eb511e0a..d21311f7d805 100644
--- a/tests/test_deploy_export.py
+++ b/tests/test_deploy_export.py
@@ -22,83 +22,79 @@
 import torch
 from ruamel.yaml import YAML
 
-from .common_setup import NeMoUnitTest
-
 import nemo
 import nemo.collections.asr as nemo_asr
 import nemo.collections.nlp as nemo_nlp
 
+from .common_setup import NeMoUnitTest
+
 
 class TestDeployExport(NeMoUnitTest):
     def setUp(self) -> None:
-        self.nf = nemo.core.NeuralModuleFactory(
-            placement=nemo.core.DeviceType.GPU)
+        self.nf = nemo.core.NeuralModuleFactory(placement=nemo.core.DeviceType.GPU)
 
-    def __test_export_route(self, module, out_name, mode,
-                            input_example=None):
+    def __test_export_route(self, module, out_name, mode, input_example=None):
         out = Path(out_name)
         if out.exists():
             os.remove(out)
 
         self.nf.deployment_export(
-            module=module,
-            output=out_name,
-            input_example=input_example,
-            d_format=mode)
+            module=module, output=out_name, input_example=input_example, d_format=mode,
+        )
 
         self.assertTrue(out.exists())
         if out.exists():
             os.remove(out)
 
     def test_simple_module_export(self):
-        simplest_module = \
-            nemo.backends.pytorch.tutorials.TaylorNet(dim=4, factory=self.nf)
-        self.__test_export_route(module=simplest_module,
-                                 out_name="simple.pt",
-                                 mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
-                                 input_example=None)
+        simplest_module = nemo.backends.pytorch.tutorials.TaylorNet(dim=4, factory=self.nf)
+        self.__test_export_route(
+            module=simplest_module,
+            out_name="simple.pt",
+            mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
+            input_example=None,
+        )
 
     def test_TokenClassifier_module_export(self):
-        t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16,
-                                           use_transformer_pretrained=False)
-        self.__test_export_route(module=t_class,
-                                 out_name="t_class.pt",
-                                 mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
-                                 input_example=torch.randn(16, 16, 512).cuda())
+        t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16, use_transformer_pretrained=False)
+        self.__test_export_route(
+            module=t_class,
+            out_name="t_class.pt",
+            mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
+            input_example=torch.randn(16, 16, 512).cuda(),
+        )
 
     def test_TokenClassifier_module_onnx_export(self):
-        t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16,
-                                           use_transformer_pretrained=False)
-        self.__test_export_route(module=t_class,
-                                 out_name="t_class.onnx",
-                                 mode=nemo.core.DeploymentFormat.ONNX,
-                                 input_example=torch.randn(16, 16, 512).cuda())
+        t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16, use_transformer_pretrained=False)
+        self.__test_export_route(
+            module=t_class,
+            out_name="t_class.onnx",
+            mode=nemo.core.DeploymentFormat.ONNX,
+            input_example=torch.randn(16, 16, 512).cuda(),
+        )
 
     def test_jasper_decoder_export_ts(self):
-        j_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
-                                                 num_classes=33)
-        self.__test_export_route(module=j_decoder,
-                                 out_name="j_decoder.ts",
-                                 mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
-                                 input_example=None)
+        j_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=33)
+        self.__test_export_route(
+            module=j_decoder, out_name="j_decoder.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=None,
+        )
 
     def test_hf_bert_ts(self):
-        bert = nemo_nlp.huggingface.BERT(
-            pretrained_model_name="bert-base-uncased")
-        input_example = (torch.randint(low=0, high=16, size=(2, 16)).cuda(),
-                         torch.randint(low=0, high=1, size=(2, 16)).cuda(),
-                         torch.randint(low=0, high=1, size=(2, 16)).cuda())
-        self.__test_export_route(module=bert,
-                                 out_name="bert.ts",
-                                 mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
-                                 input_example=input_example)
+        bert = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-uncased")
+        input_example = (
+            torch.randint(low=0, high=16, size=(2, 16)).cuda(),
+            torch.randint(low=0, high=1, size=(2, 16)).cuda(),
+            torch.randint(low=0, high=1, size=(2, 16)).cuda(),
+        )
+        self.__test_export_route(
+            module=bert, out_name="bert.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=input_example,
+        )
 
     def test_hf_bert_pt(self):
-        bert = nemo_nlp.huggingface.BERT(
-            pretrained_model_name="bert-base-uncased")
-        self.__test_export_route(module=bert,
-                                 out_name="bert.pt",
-                                 mode=nemo.core.DeploymentFormat.PYTORCH)
+        bert = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-uncased")
+        self.__test_export_route(
+            module=bert, out_name="bert.pt", mode=nemo.core.DeploymentFormat.PYTORCH,
+        )
 
     def test_jasper_encoder_to_onnx(self):
         with open("tests/data/jasper_smaller.yaml") as file:
@@ -107,14 +103,13 @@ def test_jasper_encoder_to_onnx(self):
 
         jasper_encoder = nemo_asr.JasperEncoder(
             conv_mask=False,
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor']['features'],
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
             **jasper_model_definition['JasperEncoder']
         )
 
-        self.__test_export_route(module=jasper_encoder,
-                                 out_name="jasper_encoder.onnx",
-                                 mode=nemo.core.DeploymentFormat.ONNX,
-                                 input_example=(
-                                     torch.randn(16, 64, 256).cuda(),
-                                     torch.randn(256).cuda()))
+        self.__test_export_route(
+            module=jasper_encoder,
+            out_name="jasper_encoder.onnx",
+            mode=nemo.core.DeploymentFormat.ONNX,
+            input_example=(torch.randn(16, 64, 256).cuda(), torch.randn(256).cuda(),),
+        )
diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py
index 45f1834560ae..f21e9a9943a8 100644
--- a/tests/test_deprecated.py
+++ b/tests/test_deprecated.py
@@ -14,15 +14,14 @@
 
 from io import StringIO
-
 from unittest.mock import patch
 
-from .common_setup import NeMoUnitTest
 from nemo.utils.decorators.deprecated import deprecated
 
+from .common_setup import NeMoUnitTest
+
 
-class DeprecatedTestCase(NeMoUnitTest):
-
+class DeprecatedTestCase(NeMoUnitTest):
     def test_say_whee_deprecated(self):
         """ Tests whether both std and err streams return the right values
         when function is deprecated."""
@@ -37,12 +36,10 @@ def say_whee():
             say_whee()
 
         # Check std output.
-        self.assertEqual(std_out.getvalue().strip(),
-                         "Whee!")
+        self.assertEqual(std_out.getvalue().strip(), "Whee!")
 
         # Check error output.
-        self.assertEqual(std_err.getvalue().strip(),
-                         'Function ``say_whee`` is deprecated.')
+        self.assertEqual(std_err.getvalue().strip(), 'Function ``say_whee`` is deprecated.')
 
     def test_say_wow_twice_deprecated(self):
         """ Tests whether both std and err streams return the right values
@@ -58,12 +55,10 @@ def say_wow():
             say_wow()
 
         # Check std output.
-        self.assertEqual(std_out.getvalue().strip(),
-                         "Woooow!")
+        self.assertEqual(std_out.getvalue().strip(), "Woooow!")
 
         # Check error output.
-        self.assertEqual(std_err.getvalue().strip(),
-                         'Function ``say_wow`` is deprecated.')
+        self.assertEqual(std_err.getvalue().strip(), 'Function ``say_wow`` is deprecated.')
 
         # Second call.
         with patch('sys.stdout', new=StringIO()) as std_out:
@@ -71,12 +66,10 @@ def say_wow():
             say_wow()
 
         # Check std output.
-        self.assertEqual(std_out.getvalue().strip(),
-                         "Woooow!")
+        self.assertEqual(std_out.getvalue().strip(), "Woooow!")
 
         # Check error output - should be empty.
-        self.assertEqual(std_err.getvalue().strip(),
-                         '')
+        self.assertEqual(std_err.getvalue().strip(), '')
 
     def test_say_whoopie_deprecated_version(self):
         """ Tests whether both std and err streams return the right values
@@ -92,13 +85,14 @@ def say_whoopie():
             say_whoopie()
 
         # Check std output.
-        self.assertEqual(std_out.getvalue().strip(),
-                         "Whoopie!")
+        self.assertEqual(std_out.getvalue().strip(), "Whoopie!")
 
         # Check error output.
-        self.assertEqual(std_err.getvalue().strip(),
-                         'Function ``say_whoopie`` is deprecated. It is going \
-to be removed in version 0.1.')
+        self.assertEqual(
+            std_err.getvalue().strip(),
+            'Function ``say_whoopie`` is deprecated. It is going \
+to be removed in version 0.1.',
+        )
 
     def test_say_kowabunga_deprecated_explanation(self):
         """ Tests whether both std and err streams return the right values
@@ -114,10 +108,11 @@ def say_kowabunga():
             say_kowabunga()
 
         # Check std output.
-        self.assertEqual(std_out.getvalue().strip(),
-                         "Kowabunga!")
+        self.assertEqual(std_out.getvalue().strip(), "Kowabunga!")
 
         # Check error output.
-        self.assertEqual(std_err.getvalue().strip(),
-                         'Function ``say_kowabunga`` is deprecated. Please \
-use ``print_ihaa`` instead.')
+        self.assertEqual(
+            std_err.getvalue().strip(),
+            'Function ``say_kowabunga`` is deprecated. Please \
+use ``print_ihaa`` instead.',
+        )
diff --git a/tests/test_infer.py b/tests/test_infer.py
index 35ce600e22a0..32385e2f5b23 100644
--- a/tests/test_infer.py
+++ b/tests/test_infer.py
@@ -31,17 +31,11 @@ def __init__(self, **kwargs):
 
     @property
     def input_ports(self):
-        return {
-            "mod_in": NeuralType({0: AxisType(BatchTag),
-                                  1: AxisType(BaseTag, dim=1)})
-        }
+        return {"mod_in": NeuralType({0: AxisType(BatchTag), 1: AxisType(BaseTag, dim=1)})}
 
     @property
     def output_ports(self):
-        return {
-            "mod_out": NeuralType({0: AxisType(BatchTag),
-                                   1: AxisType(BaseTag, dim=1)})
-        }
+        return {"mod_out": NeuralType({0: AxisType(BatchTag), 1: AxisType(BaseTag, dim=1)})}
 
     def forward(self, mod_in):
         return mod_in + 10
@@ -53,17 +47,11 @@ def __init__(self, **kwargs):
 
     @property
     def input_ports(self):
-        return {
-            "mod_in": NeuralType({0: AxisType(BatchTag),
-                                  1: AxisType(BaseTag, dim=1)})
-        }
+        return {"mod_in": NeuralType({0: AxisType(BatchTag), 1: AxisType(BaseTag, dim=1)})}
 
     @property
     def output_ports(self):
-        return {
-            "mod_out": NeuralType({0: AxisType(BatchTag),
-                                   1: AxisType(BaseTag, dim=1)})
-        }
+        return {"mod_out": NeuralType({0: AxisType(BatchTag), 1: AxisType(BaseTag, dim=1)})}
 
     def forward(self, mod_in):
         return mod_in - 10
@@ -72,15 +60,15 @@ def forward(self, mod_in):
 class TestInfer(NeMoUnitTest):
     def test_infer_caching(self):
         neural_factory = nemo.core.neural_factory.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, create_tb_writer=False
+        )
 
         data_source = nemo.backends.pytorch.common.ZerosDataLayer(
             size=1,
             dtype=torch.FloatTensor,
             batch_size=1,
-            output_ports={
-                "dl_out": NeuralType({0: AxisType(BatchTag),
-                                      1: AxisType(BaseTag, dim=1)})})
+            output_ports={"dl_out": NeuralType({0: AxisType(BatchTag), 1: AxisType(BaseTag, dim=1)})},
+        )
 
         addten = AddsTen()
         minusten = SubtractsTen()
@@ -89,33 +77,25 @@ def test_infer_caching(self):
         twenty_tensor = addten(mod_in=ten_tensor)
         thirty_tensor = addten(mod_in=twenty_tensor)
 
-        evaluated_tensors = neural_factory.infer(
-            tensors=[twenty_tensor, thirty_tensor],
-            verbose=False,
-            cache=True
-        )
+        evaluated_tensors = neural_factory.infer(tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True)
         self.assertEqual(evaluated_tensors[0][0].squeeze().data, 20)
         self.assertEqual(evaluated_tensors[1][0].squeeze().data, 30)
 
         new_ten_tensor = minusten(mod_in=twenty_tensor)
-        evaluated_tensors = neural_factory.infer(
-            tensors=[new_ten_tensor],
-            verbose=False,
-            use_cache=True
-        )
+        evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, use_cache=True)
         self.assertEqual(evaluated_tensors[0][0].squeeze().data, 10)
 
     def test_infer_errors(self):
         neural_factory = nemo.core.neural_factory.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, create_tb_writer=False
+        )
 
         data_source = nemo.backends.pytorch.common.ZerosDataLayer(
             size=1,
             dtype=torch.FloatTensor,
             batch_size=1,
-            output_ports={
-                "dl_out": NeuralType({0: AxisType(BatchTag),
-                                      1: AxisType(BaseTag, dim=1)})})
+            output_ports={"dl_out": NeuralType({0: AxisType(BatchTag), 1: AxisType(BaseTag, dim=1)})},
+        )
 
         addten = AddsTen()
         minusten = SubtractsTen()
@@ -124,42 +104,24 @@ def test_infer_errors(self):
         twenty_tensor = addten(mod_in=ten_tensor)
         thirty_tensor = addten(mod_in=twenty_tensor)
 
-        with self.assertRaisesRegex(ValueError,
-                                    "use_cache was set, but cache was empty"):
+        with self.assertRaisesRegex(ValueError, "use_cache was set, but cache was empty"):
             evaluated_tensors = neural_factory.infer(
-                tensors=[twenty_tensor, thirty_tensor],
-                verbose=False,
-                use_cache=True
+                tensors=[twenty_tensor, thirty_tensor], verbose=False, use_cache=True,
             )
 
         new_ten_tensor = minusten(mod_in=twenty_tensor)
-        evaluated_tensors = neural_factory.infer(
-            tensors=[new_ten_tensor],
-            verbose=False,
-            cache=True
-        )
+        evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, cache=True)
 
-        with self.assertRaisesRegex(ValueError,
-                                    "cache was set but was not empty"):
+        with self.assertRaisesRegex(ValueError, "cache was set but was not empty"):
             evaluated_tensors = neural_factory.infer(
-                tensors=[twenty_tensor, thirty_tensor],
-                verbose=False,
-                cache=True
+                tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True,
             )
 
         neural_factory.clear_cache()
-        evaluated_tensors = neural_factory.infer(
-            tensors=[new_ten_tensor],
-            verbose=False,
-            cache=True
-        )
+        evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, cache=True)
 
-        with self.assertRaisesRegex(ValueError,
-                                    "cache and use_cache were both set."):
+        with self.assertRaisesRegex(ValueError, "cache and use_cache were both set."):
             evaluated_tensors = neural_factory.infer(
-                tensors=[twenty_tensor, thirty_tensor],
-                verbose=False,
-                cache=True,
-                use_cache=True
+                tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, use_cache=True,
             )
         self.assertEqual(evaluated_tensors[0][0].squeeze().data, 10)
diff --git a/tests/test_neural_factory.py b/tests/test_neural_factory.py
index 766165d832b9..3c06fcd20f29 100644
--- a/tests/test_neural_factory.py
+++ b/tests/test_neural_factory.py
@@ -17,39 +17,33 @@
 # =============================================================================
 
 import nemo
+
 from .common_setup import NeMoUnitTest
 
 
 class TestNeuralFactory(NeMoUnitTest):
-
     def test_creation(self):
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
-        instance = neural_factory.get_module(
-            name="TaylorNet", collection="toys",
-            params={"dim": 4})
-        self.assertTrue(isinstance(
-            instance, nemo.backends.pytorch.tutorials.TaylorNet))
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
+        instance = neural_factory.get_module(name="TaylorNet", collection="toys", params={"dim": 4})
+        self.assertTrue(isinstance(instance, nemo.backends.pytorch.tutorials.TaylorNet))
 
     def test_simple_example(self):
         neural_factory = nemo.core.neural_factory.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            local_rank=None,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
 
         dl = neural_factory.get_module(
-            name="RealFunctionDataLayer", collection="toys",
-            params={"n": 10000, "batch_size": 128})
-        fx = neural_factory.get_module(name="TaylorNet", collection="toys",
-                                       params={"dim": 4})
-        loss = neural_factory.get_module(name="MSELoss", collection="toys",
-                                         params={})
+            name="RealFunctionDataLayer", collection="toys", params={"n": 10000, "batch_size": 128},
+        )
+        fx = neural_factory.get_module(name="TaylorNet", collection="toys", params={"dim": 4})
+        loss = neural_factory.get_module(name="MSELoss", collection="toys", params={})
 
         x, y = dl()
         y_pred = fx(x=x)
         loss_tensor = loss(predictions=y_pred, target=y)
 
         optimizer = neural_factory.get_trainer()
-        optimizer.train([loss_tensor], optimizer="sgd",
-                        optimization_params={"lr": 1e-3,
-                                             "num_epochs": 1})
+        optimizer.train(
+            [loss_tensor], optimizer="sgd", optimization_params={"lr": 1e-3, "num_epochs": 1},
+        )
diff --git a/tests/test_neural_modules.py b/tests/test_neural_modules.py
index 01c2542a7a34..2e10390573fe 100644
--- a/tests/test_neural_modules.py
+++ b/tests/test_neural_modules.py
@@ -17,6 +17,7 @@
 # =============================================================================
 
 import unittest
+
 import nemo
 from nemo.backends.pytorch.nm import TrainableNM
@@ -102,15 +103,16 @@ def test_constructor_TaylorNet(self):
 
     def test_call_TaylorNet(self):
         x_tg = nemo.core.neural_modules.NmTensor(
-            producer=None, producer_args=None,
+            producer=None,
+            producer_args=None,
             name=None,
             ntype=nemo.core.neural_types.NeuralType(
                 {
-                    0: nemo.core.neural_types.AxisType(
-                        nemo.core.neural_types.BatchTag),
-                    1: nemo.core.neural_types.AxisType(
-                        nemo.core.neural_types.ChannelTag)
-                }))
+                    0: nemo.core.neural_types.AxisType(nemo.core.neural_types.BatchTag),
+                    1: nemo.core.neural_types.AxisType(nemo.core.neural_types.ChannelTag),
+                }
+            ),
+        )
 
         tn = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         # note that real port's name: x was used
@@ -119,8 +121,7 @@ def test_call_TaylorNet(self):
         self.assertEqual(y_pred.producer_args.get("x"), x_tg)
 
     def test_simple_chain(self):
-        data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(
-            n=10000, batch_size=1)
+        data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=10000, batch_size=1)
         trainable_module = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         loss = nemo.backends.pytorch.tutorials.MSELoss()
         x, y = data_source()
@@ -129,8 +130,7 @@ def test_simple_chain(self):
 
         # check producers' bookkeeping
         self.assertEqual(loss_tensor.producer, loss)
-        self.assertEqual(loss_tensor.producer_args,
-                         {"predictions": y_pred, "target": y})
+        self.assertEqual(loss_tensor.producer_args, {"predictions": y_pred, "target": y})
         self.assertEqual(y_pred.producer, trainable_module)
         self.assertEqual(y_pred.producer_args, {"x": x})
         self.assertEqual(y.producer, data_source)
diff --git a/tests/test_neural_types.py b/tests/test_neural_types.py
index 28d4521279a2..52674201f4e4 100644
--- a/tests/test_neural_types.py
+++ b/tests/test_neural_types.py
@@ -19,11 +19,11 @@
 import tarfile
 import unittest
 
-from nemo.core import *
-import nemo.collections.asr as nemo_asr
-
 from ruamel.yaml import YAML
 
+import nemo.collections.asr as nemo_asr
+from nemo.core import *
+
 from .common_setup import NeMoUnitTest
@@ -44,153 +44,112 @@ def setUp(self) -> None:
             print("ASR data found in: {0}".format(data_folder + "asr"))
 
     def test_same(self):
-        btc = NeuralType(axis2type={0: AxisType(BatchTag),
-                                    1: AxisType(TimeTag),
-                                    2: AxisType(ChannelTag)})
-        btc2 = NeuralType(axis2type={0: AxisType(BatchTag),
-                                     1: AxisType(TimeTag),
-                                     2: AxisType(ChannelTag)})
+        btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})
+        btc2 = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})
         self.assertEqual(btc2.compare(btc), NeuralTypeComparisonResult.SAME)
 
     def test_transpose_same(self):
-        btc = NeuralType(axis2type={0: AxisType(BatchTag),
-                                    1: AxisType(TimeTag),
-                                    2: AxisType(ChannelTag)})
-        tbc = NeuralType(axis2type={1: AxisType(BatchTag),
-                                    0: AxisType(TimeTag),
-                                    2: AxisType(ChannelTag)})
-
-        self.assertEqual(btc.compare(tbc),
-                         NeuralTypeComparisonResult.TRANSPOSE_SAME)
-        self.assertEqual(tbc.compare(btc),
-                         NeuralTypeComparisonResult.TRANSPOSE_SAME)
+        btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})
+        tbc = NeuralType(axis2type={1: AxisType(BatchTag), 0: AxisType(TimeTag), 2: AxisType(ChannelTag),})
+
+        self.assertEqual(btc.compare(tbc), NeuralTypeComparisonResult.TRANSPOSE_SAME)
+        self.assertEqual(tbc.compare(btc), NeuralTypeComparisonResult.TRANSPOSE_SAME)
 
     def test_dim_incompatible(self):
-        nchw1 = NeuralType(axis2type={0: AxisType(BatchTag),
-                                      1: AxisType(ChannelTag),
-                                      2: AxisType(HeightTag, 224),
-                                      3: AxisType(WidthTag, 224)})
-        nchw2 = NeuralType(axis2type={0: AxisType(BatchTag),
-                                      1: AxisType(ChannelTag),
-                                      2: AxisType(HeightTag, 256),
-                                      3: AxisType(WidthTag, 256)})
-        self.assertEqual(nchw1.compare(nchw2),
-                         NeuralTypeComparisonResult.DIM_INCOMPATIBLE)
+        nchw1 = NeuralType(
+            axis2type={
+                0: AxisType(BatchTag),
+                1: AxisType(ChannelTag),
+                2: AxisType(HeightTag, 224),
+                3: AxisType(WidthTag, 224),
+            }
+        )
+        nchw2 = NeuralType(
+            axis2type={
+                0: AxisType(BatchTag),
+                1: AxisType(ChannelTag),
+                2: AxisType(HeightTag, 256),
+                3: AxisType(WidthTag, 256),
+            }
+        )
+        self.assertEqual(nchw1.compare(nchw2), NeuralTypeComparisonResult.DIM_INCOMPATIBLE)
 
     def test_rank_incompatible(self):
-        btc = NeuralType(axis2type={0: AxisType(BatchTag),
-                                    1: AxisType(TimeTag),
-                                    2: AxisType(ChannelTag)})
-        nchw = NeuralType(axis2type={0: AxisType(BatchTag),
-                                     1: AxisType(ChannelTag),
-                                     2: AxisType(HeightTag),
-                                     3: AxisType(WidthTag)})
-        self.assertEqual(nchw.compare(
-            btc), NeuralTypeComparisonResult.INCOMPATIBLE)
+        btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})
+        nchw = NeuralType(
+            axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),}
+        )
+        self.assertEqual(nchw.compare(btc), NeuralTypeComparisonResult.INCOMPATIBLE)
 
     def test_axis_type(self):
         ax1 = AxisType(BatchTag)
         ax2 = AxisType(TimeTag)
         ax3 = AxisType(ProcessedTimeTag)
-        self.assertEqual(ax1.compare_to(ax2),
-                         NeuralTypeComparisonResult.INCOMPATIBLE)
-        self.assertEqual(ax3.compare_to(ax2),
-                         NeuralTypeComparisonResult.LESS)
-        self.assertEqual(ax2.compare_to(ax3),
-                         NeuralTypeComparisonResult.GREATER)
-        self.assertEqual(ax2.compare_to(AxisType(TimeTag)),
-                         NeuralTypeComparisonResult.SAME)
+        self.assertEqual(ax1.compare_to(ax2), NeuralTypeComparisonResult.INCOMPATIBLE)
+        self.assertEqual(ax3.compare_to(ax2), NeuralTypeComparisonResult.LESS)
+        self.assertEqual(ax2.compare_to(ax3), NeuralTypeComparisonResult.GREATER)
+        self.assertEqual(ax2.compare_to(AxisType(TimeTag)), NeuralTypeComparisonResult.SAME)
 
     def test_semantic_incompatible(self):
-        nchw = NeuralType(axis2type={0: AxisType(BatchTag),
-                                     1: AxisType(ChannelTag),
-                                     2: AxisType(HeightTag),
-                                     3: AxisType(WidthTag)})
-        badd = NeuralType(axis2type={0: AxisType(BatchTag),
-                                     1: AxisType(ChannelTag),
-                                     2: AxisType(ChannelTag),
-                                     3: AxisType(WidthTag)})
-        self.assertEqual(nchw.compare(
-            badd), NeuralTypeComparisonResult.INCOMPATIBLE)
-        self.assertEqual(badd.compare(
-            nchw), NeuralTypeComparisonResult.INCOMPATIBLE)
+        nchw = NeuralType(
+            axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),}
+        )
+        badd = NeuralType(
+            axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag), 3: AxisType(WidthTag),}
+        )
+        self.assertEqual(nchw.compare(badd), NeuralTypeComparisonResult.INCOMPATIBLE)
+        self.assertEqual(badd.compare(nchw), NeuralTypeComparisonResult.INCOMPATIBLE)
 
     def test_root(self):
         root = NeuralType({})
         non_tensor = NeuralType(None)
-        btc = NeuralType(axis2type={0: AxisType(BatchTag),
-                                    1: AxisType(TimeTag),
-                                    2: AxisType(ChannelTag)})
-        nchw = NeuralType(axis2type={0: AxisType(BatchTag),
-                                     1: AxisType(ChannelTag),
-                                     2: AxisType(HeightTag),
-                                     3: AxisType(WidthTag)})
-        self.assertEqual(root.compare(btc),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(root.compare(nchw),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(root.compare(non_tensor),
-                         NeuralTypeComparisonResult.SAME)
-
-        self.assertEqual(non_tensor.compare(root),
-                         NeuralTypeComparisonResult.INCOMPATIBLE)
-        self.assertEqual(btc.compare(root),
-                         NeuralTypeComparisonResult.INCOMPATIBLE)
-        self.assertEqual(nchw.compare(root),
-                         NeuralTypeComparisonResult.INCOMPATIBLE)
+        btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})
+        nchw = NeuralType(
+            axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),}
+        )
+        self.assertEqual(root.compare(btc), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(root.compare(nchw), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(root.compare(non_tensor), NeuralTypeComparisonResult.SAME)
+
+        self.assertEqual(non_tensor.compare(root), NeuralTypeComparisonResult.INCOMPATIBLE)
+        self.assertEqual(btc.compare(root), NeuralTypeComparisonResult.INCOMPATIBLE)
+        self.assertEqual(nchw.compare(root), NeuralTypeComparisonResult.INCOMPATIBLE)
 
     def test_combiner_type_infer(self):
         combiner = nemo.backends.pytorch.common.SimpleCombiner(mode="add")
-        x_tg = nemo.core.NmTensor(producer=None, producer_args=None,
-                                  name=None,
-                                  ntype=NeuralType(
-                                      {
-                                          0: AxisType(BatchTag),
-                                      }))
-        y_tg = nemo.core.NmTensor(producer=None, producer_args=None,
-                                  name=None,
-                                  ntype=NeuralType(
-                                      {
-                                          0: AxisType(BatchTag),
-                                      }))
+        x_tg = nemo.core.NmTensor(
+            producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag),}),
+        )
+        y_tg = nemo.core.NmTensor(
+            producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag),}),
+        )
 
         res = combiner(x1=y_tg, x2=x_tg)
-        self.assertEqual(res.compare(x_tg),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(res.compare(y_tg),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(x_tg.compare(res),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(y_tg.compare(res),
-                         NeuralTypeComparisonResult.SAME)
+        self.assertEqual(res.compare(x_tg), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(res.compare(y_tg), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(x_tg.compare(res), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(y_tg.compare(res), NeuralTypeComparisonResult.SAME)
 
         combiner1 = nemo.backends.pytorch.common.SimpleCombiner(mode="add")
-        x_tg1 = NmTensor(producer=None, producer_args=None,
-                         name=None,
-                         ntype=NeuralType(
-                             {
-                                 0: AxisType(BatchTag),
-                                 1: AxisType(ChannelTag)
-                             }))
-        y_tg1 = NmTensor(producer=None, producer_args=None,
-                         name=None,
-                         ntype=NeuralType(
-                             {
-                                 0: AxisType(BatchTag),
-                                 1: AxisType(ChannelTag)
-                             }))
+        x_tg1 = NmTensor(
+            producer=None,
+            producer_args=None,
+            name=None,
+            ntype=NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+        )
+        y_tg1 = NmTensor(
+            producer=None,
+            producer_args=None,
+            name=None,
+            ntype=NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}),
+        )
         res1 = combiner1(x1=y_tg1, x2=x_tg1)
-        self.assertEqual(res1.compare(x_tg1),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(res1.compare(y_tg1),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(x_tg1.compare(res1),
-                         NeuralTypeComparisonResult.SAME)
-        self.assertEqual(y_tg1.compare(res1),
-                         NeuralTypeComparisonResult.SAME)
+        self.assertEqual(res1.compare(x_tg1), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(res1.compare(y_tg1), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(x_tg1.compare(res1), NeuralTypeComparisonResult.SAME)
+        self.assertEqual(y_tg1.compare(res1), NeuralTypeComparisonResult.SAME)
 
     def test_optional_input_no_input(self):
-        data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(
-            n=100, batch_size=128)
+        data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=100, batch_size=128)
         trainable_module = nemo.backends.pytorch.tutorials.TaylorNetO(dim=4)
         loss = nemo.backends.pytorch.tutorials.MSELoss()
         x, y = data_source()
@@ -199,13 +158,11 @@ def test_optional_input_no_input(self):
 
         optimizer = nemo.backends.pytorch.actions.PtActions()
         optimizer.train(
-            tensors_to_optimize=[loss_tensor],
-            optimizer="sgd",
-            optimization_params={"lr": 0.0003, "num_epochs": 1})
+            tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1},
+        )
 
     def test_optional_input_no_with_input(self):
-        data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(
-            n=100, batch_size=128)
+        data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=100, batch_size=128)
         trainable_module = nemo.backends.pytorch.tutorials.TaylorNetO(dim=4)
         loss = nemo.backends.pytorch.tutorials.MSELoss()
         x, y = data_source()
@@ -213,34 +170,29 @@ def test_optional_input_no_with_input(self):
         loss_tensor = loss(predictions=y_pred, target=y)
         optimizer = nemo.backends.pytorch.actions.PtActions()
         optimizer.train(
-            tensors_to_optimize=[loss_tensor],
-            optimizer="sgd",
-            optimization_params={"lr": 0.0003, "num_epochs": 1})
+            tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1},
+        )
 
     def test_optional_input_no_with_wrong_input(self):
-
         def wrong_fn():
-            data_source = \
-                nemo.backends.pytorch.tutorials.RealFunctionDataLayer(
-                    n=100, batch_size=128)
-            trainable_module = nemo.backends.pytorch.tutorials.TaylorNetO(
-                dim=4)
+            data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=100, batch_size=128)
+            trainable_module = nemo.backends.pytorch.tutorials.TaylorNetO(dim=4)
            loss = nemo.backends.pytorch.tutorials.MSELoss()
            x, y = data_source()
-            wrong_optional = NmTensor(producer=None, producer_args=None,
-                                      name=None,
-                                      ntype=NeuralType(
-                                          {
-                                              0: AxisType(ChannelTag),
-                                              1: AxisType(BatchTag)
-                                          }))
+            wrong_optional = NmTensor(
+                producer=None,
+                producer_args=None,
+                name=None,
+                ntype=NeuralType({0: AxisType(ChannelTag), 1: AxisType(BatchTag)}),
+            )
             y_pred = trainable_module(x=x, o=wrong_optional)
             loss_tensor = loss(predictions=y_pred, target=y)
 
             optimizer = nemo.backends.pytorch.actions.PtActions()
             optimizer.train(
                 tensors_to_optimize=[loss_tensor],
                 optimizer="sgd",
-                optimization_params={"lr": 0.0003, "num_epochs": 1})
+                optimization_params={"lr": 0.0003, "num_epochs": 1},
+            )
 
         self.assertRaises(NeuralPortNmTensorMismatchError, wrong_fn)
@@ -251,35 +203,32 @@ def test_simple_dags(self):
             labels = jasper_model_definition['labels']
 
         data_layer = nemo_asr.AudioToTextDataLayer(
-            manifest_filepath=self.manifest_filepath,
-            labels=labels, batch_size=4)
+            manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4,
+        )
         data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
+            **jasper_model_definition['AudioToMelSpectrogramPreprocessor']
+        )
         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor']['features'],
-            **jasper_model_definition['JasperEncoder'])
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
-                                                      num_classes=len(labels))
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
+            **jasper_model_definition['JasperEncoder']
+        )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
         greedy_decoder = nemo_asr.GreedyCTCDecoder()
 
         # DAG definition
-        audio_signal, audio_signal_len, transcript, transcript_len = \
-            data_layer()
-        processed_signal, processed_signal_len = data_preprocessor(
-            input_signal=audio_signal,
-            length=audio_signal_len)
+        (audio_signal, audio_signal_len, transcript, transcript_len,) = data_layer()
+        processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)
 
         spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
         aug_signal = spec_augment(input_spec=processed_signal)
 
-        encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
-                                              length=processed_signal_len)
+        encoded, encoded_len = jasper_encoder(audio_signal=aug_signal, length=processed_signal_len)
         log_probs = jasper_decoder(encoder_output=encoded)
         predictions = greedy_decoder(log_probs=log_probs)
-        loss = ctc_loss(log_probs=log_probs, targets=transcript,
-                        input_length=encoded_len, target_length=transcript_len)
+        loss = ctc_loss(
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
+        )
 
         def wrong():
             with open("tests/data/jasper_smaller.yaml") as file:
@@ -287,30 +236,26 @@ def wrong():
                 labels = jasper_config['labels']
 
             data_layer = nemo_asr.AudioToTextDataLayer(
-                manifest_filepath=self.manifest_filepath,
-                labels=labels, batch_size=4)
+                manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4,
+            )
             data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
**jasper_config['AudioToMelSpectrogramPreprocessor']) + **jasper_config['AudioToMelSpectrogramPreprocessor'] + ) jasper_encoder = nemo_asr.JasperEncoder( - feat_in=jasper_config[ - 'AudioToMelSpectrogramPreprocessor']['features'], + feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']['features'], **jasper_config['JasperEncoder'] ) - jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, - num_classes=len( - labels)) + jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels)) # DAG definition - audio_signal, audio_signal_len, transcript, transcript_len = \ - data_layer() + (audio_signal, audio_signal_len, transcript, transcript_len,) = data_layer() processed_signal, processed_signal_len = data_preprocessor( - input_signal=audio_signal, - length=audio_signal_len) + input_signal=audio_signal, length=audio_signal_len + ) spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5) aug_signal = spec_augment(input_spec=processed_signal) - encoded, encoded_len = jasper_encoder(audio_signal=aug_signal, - length=processed_signal_len) + encoded, encoded_len = jasper_encoder(audio_signal=aug_signal, length=processed_signal_len) log_probs = jasper_decoder(encoder_output=processed_signal) self.assertRaises(NeuralPortNmTensorMismatchError, wrong) diff --git a/tests/test_policies.py b/tests/test_policies.py index 470a36d7f200..eb4e31110018 100644 --- a/tests/test_policies.py +++ b/tests/test_policies.py @@ -16,8 +16,7 @@ # limitations under the License. # ============================================================================= -from nemo.utils.lr_policies import SquareAnnealing, CosineAnnealing, \ - WarmupAnnealing +from nemo.utils.lr_policies import CosineAnnealing, SquareAnnealing, WarmupAnnealing from .common_setup import NeMoUnitTest diff --git a/tests/test_pytorch_trainers.py b/tests/test_pytorch_trainers.py index dd938ccfa189..a9bb8bc27edc 100644 --- a/tests/test_pytorch_trainers.py +++ b/tests/test_pytorch_trainers.py @@ -17,17 +17,16 @@ # ============================================================================= import unittest + import nemo from .common_setup import NeMoUnitTest class TestPytorchTrainers(NeMoUnitTest): - def test_simple_train(self): print("Simplest train test") - data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer( - n=10000, batch_size=128) + data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=10000, batch_size=128) trainable_module = nemo.backends.pytorch.tutorials.TaylorNet(dim=4) loss = nemo.backends.pytorch.tutorials.MSELoss() x, y = data_source() @@ -36,17 +35,12 @@ def test_simple_train(self): optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], - optimizer="sgd", - optimization_params={"lr": 0.0003, "num_epochs": 1} + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, ) def test_simple_train_named_output(self): print('Simplest train test with using named output.') - data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer( - n=10000, - batch_size=128, - ) + data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=10000, batch_size=128,) trainable_module = nemo.backends.pytorch.tutorials.TaylorNet(dim=4) loss = nemo.backends.pytorch.tutorials.MSELoss() @@ -61,15 +55,12 @@ def test_simple_train_named_output(self): optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], - optimizer="sgd", - 
optimization_params={"lr": 0.0003, "num_epochs": 1} + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, ) def test_simple_chained_train(self): print("Chained train test") - data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer( - n=10000, batch_size=32) + data_source = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(n=10000, batch_size=32) trainable_module1 = nemo.backends.pytorch.tutorials.TaylorNet(dim=4) trainable_module2 = nemo.backends.pytorch.tutorials.TaylorNet(dim=2) trainable_module3 = nemo.backends.pytorch.tutorials.TaylorNet(dim=2) @@ -82,7 +73,5 @@ def test_simple_chained_train(self): optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], - optimizer="sgd", - optimization_params={"lr": 0.0003, "num_epochs": 1} + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, ) diff --git a/tests/test_spc_tokenizer.py b/tests/test_spc_tokenizer.py index 01cbd54c5b99..61ebb9a8bb39 100644 --- a/tests/test_spc_tokenizer.py +++ b/tests/test_spc_tokenizer.py @@ -22,15 +22,13 @@ class TestSPCTokenizer(NeMoUnitTest): - def test_add_special_tokens(self): tokenizer = SentencePieceTokenizer("./tests/data/m_common.model") special_tokens = ["[CLS]", "[MASK]", "[SEP]"] tokenizer.add_special_tokens(special_tokens) - self.assertTrue(tokenizer.vocab_size == tokenizer.original_vocab_size - + len(special_tokens)) + self.assertTrue(tokenizer.vocab_size == tokenizer.original_vocab_size + len(special_tokens)) def test_text_to_tokens(self): tokenizer = SentencePieceTokenizer("./tests/data/m_common.model") diff --git a/tests/test_squad.py b/tests/test_squad.py index 76188de2a723..19043b199e1f 100644 --- a/tests/test_squad.py +++ b/tests/test_squad.py @@ -22,50 +22,43 @@ import sys import nemo -from nemo.utils.lr_policies import get_lr_policy - import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.squad import \ - eval_iter_callback, eval_epochs_done_callback +from download_squad import SquadDownloader +from nemo.collections.nlp.utils.callbacks.squad import eval_epochs_done_callback, eval_iter_callback +from nemo.utils.lr_policies import get_lr_policy from .common_setup import NeMoUnitTest # pep8: disable=E402 -sys.path.insert(0, os.path.abspath( - os.path.join(os.path.dirname(__file__), '../examples/nlp/scripts'))) +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../examples/nlp/scripts')), +) + -from download_squad import SquadDownloader # pep8: enable=E402 -class TestSquad(NeMoUnitTest): +class TestSquad(NeMoUnitTest): @classmethod def setUpClass(cls) -> None: super().setUpClass() - data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), - 'data/nlp')) + data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/nlp')) if not os.path.exists(data_folder): print(f"mkdir {data_folder}") os.mkdir(data_folder) squad_folder = data_folder + '/squad' if not os.path.exists(squad_folder): - download_script_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), - '../examples/nlp/scripts')) + download_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../examples/nlp/scripts')) sys.path.insert(0, download_script_path) print("Extracting Squad data to: {0}".format(squad_folder)) squad_dl = SquadDownloader(data_folder) squad_dl.download() - squad_v1_dev_file = \ - os.path.join(squad_folder, 'v1.1/dev-v1.1.json') - 
squad_v1_train_file = \ - os.path.join(squad_folder, 'v1.1/train-v1.1.json') - squad_v2_dev_file = \ - os.path.join(squad_folder, 'v2.0/dev-v2.0.json') - squad_v2_train_file = \ - os.path.join(squad_folder, 'v2.0/train-v2.0.json') + squad_v1_dev_file = os.path.join(squad_folder, 'v1.1/dev-v1.1.json') + squad_v1_train_file = os.path.join(squad_folder, 'v1.1/train-v1.1.json') + squad_v2_dev_file = os.path.join(squad_folder, 'v2.0/dev-v2.0.json') + squad_v2_train_file = os.path.join(squad_folder, 'v2.0/train-v2.0.json') with open(squad_v1_dev_file, "r", encoding="utf-8") as json_file: data = json.load(json_file) data["data"] = [data["data"][0]] @@ -85,13 +78,10 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: super().tearDownClass() - squad_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), - 'data/nlp/squad')) + squad_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/nlp/squad')) if os.path.exists(squad_folder): shutil.rmtree(squad_folder) - download_script_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), - '../examples/nlp/scripts')) + download_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../examples/nlp/scripts')) if download_script_path in sys.path: sys.path.remove(download_script_path) @@ -99,8 +89,7 @@ def test_squad_v1(self): version_2_with_negative = False pretrained_bert_model = 'bert-base-uncased' batch_size = 3 - data_dir = os.path.abspath( - os.path.join(os.path.dirname(__file__), 'data/nlp/squad/v1.1')) + data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/nlp/squad/v1.1')) max_query_length = 64 max_seq_length = 384 doc_stride = 128 @@ -115,63 +104,58 @@ def test_squad_v1(self): tokenizer = nemo_nlp.NemoBertTokenizer(pretrained_bert_model) neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, local_rank=None, - create_tb_writer=False) - model = nemo_nlp.huggingface.BERT( - pretrained_model_name=pretrained_bert_model) + backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False, + ) + model = nemo_nlp.huggingface.BERT(pretrained_model_name=pretrained_bert_model) hidden_size = model.local_parameters["hidden_size"] - qa_head = nemo_nlp.TokenClassifier( - hidden_size=hidden_size, - num_classes=2, - num_layers=1, - log_softmax=False) + qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False,) squad_loss = nemo_nlp.QuestionAnsweringLoss() data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( - mode='train', - version_2_with_negative=version_2_with_negative, - batch_size=batch_size, - tokenizer=tokenizer, - data_dir=data_dir, - max_query_length=max_query_length, - max_seq_length=max_seq_length, - doc_stride=doc_stride) - - input_ids, input_type_ids, input_mask, \ - start_positions, end_positions, _ = data_layer() - - hidden_states = model( - input_ids=input_ids, - token_type_ids=input_type_ids, - attention_mask=input_mask) + mode='train', + version_2_with_negative=version_2_with_negative, + batch_size=batch_size, + tokenizer=tokenizer, + data_dir=data_dir, + max_query_length=max_query_length, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + ) + + (input_ids, input_type_ids, input_mask, start_positions, end_positions, _,) = data_layer() + + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) qa_output = qa_head(hidden_states=hidden_states) - loss, _, _ = squad_loss( - logits=qa_output, - 
-            start_positions=start_positions,
-            end_positions=end_positions)
+        loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions,)

         data_layer_eval = nemo_nlp.BertQuestionAnsweringDataLayer(
-            mode='dev',
-            version_2_with_negative=version_2_with_negative,
-            batch_size=batch_size,
-            tokenizer=tokenizer,
-            data_dir=data_dir,
-            max_query_length=max_query_length,
-            max_seq_length=max_seq_length,
-            doc_stride=doc_stride)
-        input_ids_eval, input_type_ids_eval, input_mask_eval, \
-            start_positions_eval, end_positions_eval, unique_ids_eval \
-            = data_layer_eval()
-
-        hidden_states_eval = model(input_ids=input_ids_eval,
-                                   token_type_ids=input_type_ids_eval,
-                                   attention_mask=input_mask_eval)
+            mode='dev',
+            version_2_with_negative=version_2_with_negative,
+            batch_size=batch_size,
+            tokenizer=tokenizer,
+            data_dir=data_dir,
+            max_query_length=max_query_length,
+            max_seq_length=max_seq_length,
+            doc_stride=doc_stride,
+        )
+        (
+            input_ids_eval,
+            input_type_ids_eval,
+            input_mask_eval,
+            start_positions_eval,
+            end_positions_eval,
+            unique_ids_eval,
+        ) = data_layer_eval()
+
+        hidden_states_eval = model(
+            input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval,
+        )

         qa_output_eval = qa_head(hidden_states=hidden_states_eval)
         _, start_logits_eval, end_logits_eval = squad_loss(
-            logits=qa_output_eval, start_positions=start_positions_eval,
-            end_positions=end_positions_eval)
+            logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval,
+        )
         eval_output = [start_logits_eval, end_logits_eval, unique_ids_eval]

         callback_train = nemo.core.SimpleLossLoggerCallback(
@@ -179,39 +163,40 @@ def test_squad_v1(self):
             print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())),
             get_tb_values=lambda x: [["loss", x[0]]],
             step_freq=10,
-            tb_writer=neural_factory.tb_writer)
+            tb_writer=neural_factory.tb_writer,
+        )

         callbacks_eval = nemo.core.EvaluatorCallback(
             eval_tensors=eval_output,
             user_iter_callback=lambda x, y: eval_iter_callback(x, y),
-            user_epochs_done_callback=lambda x:
-            eval_epochs_done_callback(
-                x, eval_data_layer=data_layer_eval,
-                do_lower_case=do_lower_case,
-                n_best_size=n_best_size,
-                max_answer_length=max_answer_length,
-                version_2_with_negative=version_2_with_negative,
-                null_score_diff_threshold=null_score_diff_threshold),
-            tb_writer=neural_factory.tb_writer,
-            eval_step=eval_step_freq)
-
-        lr_policy_fn = get_lr_policy('WarmupAnnealing',
-                                     total_steps=max_steps,
-                                     warmup_ratio=lr_warmup_proportion)
-
-        neural_factory.train(tensors_to_optimize=[loss],
-                             callbacks=[callback_train, callbacks_eval],
-                             lr_policy=lr_policy_fn,
-                             optimizer='adam_w',
-                             optimization_params={"max_steps": max_steps,
-                                                  "lr": lr})
+            user_epochs_done_callback=lambda x: eval_epochs_done_callback(
+                x,
+                eval_data_layer=data_layer_eval,
+                do_lower_case=do_lower_case,
+                n_best_size=n_best_size,
+                max_answer_length=max_answer_length,
+                version_2_with_negative=version_2_with_negative,
+                null_score_diff_threshold=null_score_diff_threshold,
+            ),
+            tb_writer=neural_factory.tb_writer,
+            eval_step=eval_step_freq,
+        )
+
+        lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion,)
+
+        neural_factory.train(
+            tensors_to_optimize=[loss],
+            callbacks=[callback_train, callbacks_eval],
+            lr_policy=lr_policy_fn,
+            optimizer='adam_w',
+            optimization_params={"max_steps": max_steps, "lr": lr},
+        )

     def test_squad_v2(self):
         version_2_with_negative = True
         pretrained_bert_model = 'bert-base-uncased'
         batch_size = 3
-        data_dir = os.path.abspath(
-            os.path.join(os.path.dirname(__file__), 'data/nlp/squad/v2.0'))
+        data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/nlp/squad/v2.0'))
         max_query_length = 64
         max_seq_length = 384
         doc_stride = 128
@@ -226,63 +211,58 @@ def test_squad_v2(self):
         tokenizer = nemo_nlp.NemoBertTokenizer(pretrained_bert_model)
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
-        model = nemo_nlp.huggingface.BERT(
-            pretrained_model_name=pretrained_bert_model)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
+        model = nemo_nlp.huggingface.BERT(pretrained_model_name=pretrained_bert_model)
         hidden_size = model.local_parameters["hidden_size"]
-        qa_head = nemo_nlp.TokenClassifier(
-            hidden_size=hidden_size,
-            num_classes=2,
-            num_layers=1,
-            log_softmax=False)
+        qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False,)
         squad_loss = nemo_nlp.QuestionAnsweringLoss()

         data_layer = nemo_nlp.BertQuestionAnsweringDataLayer(
-            mode='train',
-            version_2_with_negative=version_2_with_negative,
-            batch_size=batch_size,
-            tokenizer=tokenizer,
-            data_dir=data_dir,
-            max_query_length=max_query_length,
-            max_seq_length=max_seq_length,
-            doc_stride=doc_stride)
-
-        input_ids, input_type_ids, input_mask, \
-            start_positions, end_positions, _ = data_layer()
-
-        hidden_states = model(
-            input_ids=input_ids,
-            token_type_ids=input_type_ids,
-            attention_mask=input_mask)
+            mode='train',
+            version_2_with_negative=version_2_with_negative,
+            batch_size=batch_size,
+            tokenizer=tokenizer,
+            data_dir=data_dir,
+            max_query_length=max_query_length,
+            max_seq_length=max_seq_length,
+            doc_stride=doc_stride,
+        )
+
+        (input_ids, input_type_ids, input_mask, start_positions, end_positions, _,) = data_layer()
+
+        hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,)

         qa_output = qa_head(hidden_states=hidden_states)
-        loss, _, _ = squad_loss(
-            logits=qa_output,
-            start_positions=start_positions,
-            end_positions=end_positions)
+        loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions,)

         data_layer_eval = nemo_nlp.BertQuestionAnsweringDataLayer(
-            mode='dev',
-            version_2_with_negative=version_2_with_negative,
-            batch_size=batch_size,
-            tokenizer=tokenizer,
-            data_dir=data_dir,
-            max_query_length=max_query_length,
-            max_seq_length=max_seq_length,
-            doc_stride=doc_stride)
-        input_ids_eval, input_type_ids_eval, input_mask_eval, \
-            start_positions_eval, end_positions_eval, unique_ids_eval \
-            = data_layer_eval()
-
-        hidden_states_eval = model(input_ids=input_ids_eval,
-                                   token_type_ids=input_type_ids_eval,
-                                   attention_mask=input_mask_eval)
+            mode='dev',
+            version_2_with_negative=version_2_with_negative,
+            batch_size=batch_size,
+            tokenizer=tokenizer,
+            data_dir=data_dir,
+            max_query_length=max_query_length,
+            max_seq_length=max_seq_length,
+            doc_stride=doc_stride,
+        )
+        (
+            input_ids_eval,
+            input_type_ids_eval,
+            input_mask_eval,
+            start_positions_eval,
+            end_positions_eval,
+            unique_ids_eval,
+        ) = data_layer_eval()
+
+        hidden_states_eval = model(
+            input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval,
+        )

         qa_output_eval = qa_head(hidden_states=hidden_states_eval)
         _, start_logits_eval, end_logits_eval = squad_loss(
-            logits=qa_output_eval, start_positions=start_positions_eval,
-            end_positions=end_positions_eval)
+            logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval,
+        )
         eval_output = [start_logits_eval, end_logits_eval, unique_ids_eval]

         callback_train = nemo.core.SimpleLossLoggerCallback(
@@ -290,29 +270,31 @@ def test_squad_v2(self):
             print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())),
             get_tb_values=lambda x: [["loss", x[0]]],
             step_freq=10,
-            tb_writer=neural_factory.tb_writer)
+            tb_writer=neural_factory.tb_writer,
+        )

         callbacks_eval = nemo.core.EvaluatorCallback(
             eval_tensors=eval_output,
             user_iter_callback=lambda x, y: eval_iter_callback(x, y),
-            user_epochs_done_callback=lambda x:
-            eval_epochs_done_callback(
-                x, eval_data_layer=data_layer_eval,
-                do_lower_case=do_lower_case,
-                n_best_size=n_best_size,
-                max_answer_length=max_answer_length,
-                version_2_with_negative=version_2_with_negative,
-                null_score_diff_threshold=null_score_diff_threshold),
-            tb_writer=neural_factory.tb_writer,
-            eval_step=eval_step_freq)
-
-        lr_policy_fn = get_lr_policy('WarmupAnnealing',
-                                     total_steps=max_steps,
-                                     warmup_ratio=lr_warmup_proportion)
-
-        neural_factory.train(tensors_to_optimize=[loss],
-                             callbacks=[callback_train, callbacks_eval],
-                             lr_policy=lr_policy_fn,
-                             optimizer='adam_w',
-                             optimization_params={"max_steps": max_steps,
-                                                  "lr": lr})
+            user_epochs_done_callback=lambda x: eval_epochs_done_callback(
+                x,
+                eval_data_layer=data_layer_eval,
+                do_lower_case=do_lower_case,
+                n_best_size=n_best_size,
+                max_answer_length=max_answer_length,
+                version_2_with_negative=version_2_with_negative,
+                null_score_diff_threshold=null_score_diff_threshold,
+            ),
+            tb_writer=neural_factory.tb_writer,
+            eval_step=eval_step_freq,
+        )
+
+        lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion,)
+
+        neural_factory.train(
+            tensors_to_optimize=[loss],
+            callbacks=[callback_train, callbacks_eval],
+            lr_policy=lr_policy_fn,
+            optimizer='adam_w',
+            optimization_params={"max_steps": max_steps, "lr": lr},
+        )
diff --git a/tests/test_tts.py b/tests/test_tts.py
index e12d746c33b6..bc874d7313a5 100644
--- a/tests/test_tts.py
+++ b/tests/test_tts.py
@@ -27,9 +27,36 @@

 class TestTTSPytorch(NeMoUnitTest):
-    labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h",
-              "i", "j", "k", "l", "m", "n", "o", "p", "q",
-              "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
+    labels = [
+        " ",
+        "a",
+        "b",
+        "c",
+        "d",
+        "e",
+        "f",
+        "g",
+        "h",
+        "i",
+        "j",
+        "k",
+        "l",
+        "m",
+        "n",
+        "o",
+        "p",
+        "q",
+        "r",
+        "s",
+        "t",
+        "u",
+        "v",
+        "w",
+        "x",
+        "y",
+        "z",
+        "'",
+    ]
     manifest_filepath = "tests/data/asr/an4_train.json"

     def setUp(self) -> None:
@@ -46,9 +73,7 @@ def setUp(self) -> None:

     def test_tacotron2_training(self):
         data_layer = nemo_asr.AudioToTextDataLayer(
-            manifest_filepath=self.manifest_filepath,
-            labels=self.labels,
-            batch_size=4
+            manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4,
         )
         preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
             window_size=None,
@@ -58,13 +83,11 @@ def test_tacotron2_training(self):
             normalize=None,
             preemph=None,
             dither=0,
-            mag_power=1.,
-            pad_value=-11.52)
+            mag_power=1.0,
+            pad_value=-11.52,
+        )
         text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
-        t2_enc = nemo_tts.Tacotron2Encoder(
-            encoder_n_convolutions=2,
-            encoder_kernel_size=5,
-            encoder_embedding_dim=256)
+        t2_enc = nemo_tts.Tacotron2Encoder(encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256,)
         t2_dec = nemo_tts.Tacotron2Decoder(
             n_mel_channels=64,
             n_frames_per_step=1,
@@ -78,33 +101,25 @@ def test_tacotron2_training(self):
             attention_rnn_dim=512,
             attention_dim=64,
             attention_location_n_filters=16,
-            attention_location_kernel_size=15)
+            attention_location_kernel_size=15,
+        )
         t2_postnet = nemo_tts.Tacotron2Postnet(
-            n_mel_channels=64,
-            postnet_embedding_dim=256,
-            postnet_kernel_size=5,
-            postnet_n_convolutions=3)
+            n_mel_channels=64, postnet_embedding_dim=256, postnet_kernel_size=5, postnet_n_convolutions=3,
+        )
         t2_loss = nemo_tts.Tacotron2Loss()
         makegatetarget = nemo_tts.MakeGate()

         # DAG
         audio, audio_len, transcript, transcript_len = data_layer()
-        spec_target, spec_target_len = preprocessing(
-            input_signal=audio,
-            length=audio_len)
+        spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)

         transcript_embedded = text_embedding(char_phone=transcript)
-        transcript_encoded = t2_enc(
-            char_phone_embeddings=transcript_embedded,
-            embedding_length=transcript_len)
+        transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,)
         mel_decoder, gate, _ = t2_dec(
-            char_phone_encoded=transcript_encoded,
-            encoded_length=transcript_len,
-            mel_target=spec_target)
+            char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target,
+        )
         mel_postnet = t2_postnet(mel_input=mel_decoder)
-        gate_target = makegatetarget(
-            mel_target=spec_target,
-            target_len=spec_target_len)
+        gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
         loss_t = t2_loss(
             mel_out=mel_decoder,
             mel_out_postnet=mel_postnet,
@@ -112,25 +127,23 @@ def test_tacotron2_training(self):
             mel_target=spec_target,
             gate_target=gate_target,
             target_len=spec_target_len,
-            seq_len=audio_len)
+            seq_len=audio_len,
+        )

         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss_t],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'))
+            tensors=[loss_t], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+        )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
         optimizer = neural_factory.get_trainer()
-        optimizer.train([loss_t], callbacks=[callback], optimizer="sgd",
-                        optimization_params={"num_epochs": 10, "lr": 0.0003})
+        optimizer.train(
+            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
+        )

     def test_waveglow_training(self):
-        data_layer = nemo_tts.AudioDataLayer(
-            manifest_filepath=self.manifest_filepath,
-            n_segments=4000,
-            batch_size=4
-        )
+        data_layer = nemo_tts.AudioDataLayer(manifest_filepath=self.manifest_filepath, n_segments=4000, batch_size=4,)
         preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
             window_size=None,
             window_stride=None,
@@ -139,8 +152,9 @@ def test_waveglow_training(self):
             normalize=None,
             preemph=None,
             dither=0,
-            mag_power=1.,
-            pad_value=-11.52)
+            mag_power=1.0,
+            pad_value=-11.52,
+        )
         waveglow = nemo_tts.WaveGlowNM(
             n_mel_channels=64,
             n_flows=6,
@@ -149,29 +163,25 @@ def test_waveglow_training(self):
             n_early_size=2,
             n_wn_layers=4,
             n_wn_channels=256,
-            wn_kernel_size=3)
+            wn_kernel_size=3,
+        )
         waveglow_loss = nemo_tts.WaveGlowLoss()

         # DAG
         audio, audio_len, = data_layer()
-        spec_target, _ = preprocessing(
-            input_signal=audio,
-            length=audio_len)
+        spec_target, _ = preprocessing(input_signal=audio, length=audio_len)

-        z, log_s_list, log_det_W_list = waveglow(
-            mel_spectrogram=spec_target, audio=audio)
-        loss_t = waveglow_loss(
-            z=z,
-            log_s_list=log_s_list,
-            log_det_W_list=log_det_W_list)
+        z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
+        loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss_t],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'))
+            tensors=[loss_t], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+        )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
         optimizer = neural_factory.get_trainer()
-        optimizer.train([loss_t], callbacks=[callback], optimizer="sgd",
-                        optimization_params={"num_epochs": 10, "lr": 0.0003})
+        optimizer.train(
+            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 10, "lr": 0.0003},
+        )
diff --git a/tests/test_tutorials_pytorch.py b/tests/test_tutorials_pytorch.py
index 410bda567921..1d8eea0ff719 100644
--- a/tests/test_tutorials_pytorch.py
+++ b/tests/test_tutorials_pytorch.py
@@ -17,13 +17,13 @@
 # =============================================================================

 import unittest
+
 from nemo.backends.pytorch.tutorials.chatbot.data import loadPrepareData

 from .common_setup import NeMoUnitTest


 class TestPytorchChatBotTutorial(NeMoUnitTest):
-
     def test_simple_train(self):
         datafile = "tests/data/dialog_sample.txt"
         print(datafile)
diff --git a/tests/test_weight_share.py b/tests/test_weight_share.py
index 10a61cd2e71a..59f2e07bd0f9 100644
--- a/tests/test_weight_share.py
+++ b/tests/test_weight_share.py
@@ -20,111 +20,127 @@

 import numpy as np
 import torch
+from ruamel.yaml import YAML

 import nemo
+import nemo.collections.asr as nemo_asr
 from nemo.core import WeightShareTransform
 from nemo.core.neural_types import *

-import nemo.collections.asr as nemo_asr
-
-from ruamel.yaml import YAML
-
 from .common_setup import NeMoUnitTest


 class TestWeightSharing(NeMoUnitTest):
-    labels = ["'", "a", "b", "c", "d", "e", "f", "g", "h",
-              "i", "j", "k", "l", "m", "n", "o", "p", "q",
-              "r", "s", "t", "u", "v", "w", "x", "y", "z", " "]
+    labels = [
+        "'",
+        "a",
+        "b",
+        "c",
+        "d",
+        "e",
+        "f",
+        "g",
+        "h",
+        "i",
+        "j",
+        "k",
+        "l",
+        "m",
+        "n",
+        "o",
+        "p",
+        "q",
+        "r",
+        "s",
+        "t",
+        "u",
+        "v",
+        "w",
+        "x",
+        "y",
+        "z",
+        " ",
+    ]
     manifest_filepath = "tests/data/asr/an4_train.json"
-    featurizer_config = {'window': 'hann',
-                         'dither': 1e-05,
-                         'normalize': 'per_feature',
-                         'frame_splicing': 1,
-                         'int_values': False,
-                         'window_stride': 0.01,
-                         'sample_rate': 16000,
-                         'features': 64,
-                         'n_fft': 512,
-                         'window_size': 0.02}
+    featurizer_config = {
+        'window': 'hann',
+        'dither': 1e-05,
+        'normalize': 'per_feature',
+        'frame_splicing': 1,
+        'int_values': False,
+        'window_stride': 0.01,
+        'sample_rate': 16000,
+        'features': 64,
+        'n_fft': 512,
+        'window_size': 0.02,
+    }
     yaml = YAML(typ="safe")

     def __check_if_weights_are_equal(self, w1: Dict, w2: Dict):
-        all_same = (set(w1.keys()) == set(w2.keys()))
+        all_same = set(w1.keys()) == set(w2.keys())
         if not all_same:
             return False
         else:
             for key in w1.keys():
                 all_same = all_same and np.array_equal(
-                    w1[key][0].cpu().detach().numpy(),
-                    w2[key][0].cpu().detach().numpy())
+                    w1[key][0].cpu().detach().numpy(), w2[key][0].cpu().detach().numpy(),
+                )
             return all_same

     def test_TaylorNet_get_weights(self):
         tn1 = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         tn2 = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         # because of randomness, actual weights should be different
-        self.assertFalse(self.__check_if_weights_are_equal(tn1.get_weights(),
-                                                           tn2.get_weights()))
+        self.assertFalse(self.__check_if_weights_are_equal(tn1.get_weights(), tn2.get_weights()))
         tn3 = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         tn3.set_weights(tn1.get_weights())
         # check than weights are the same
-        self.assertTrue(self.__check_if_weights_are_equal(tn1.get_weights(),
-                                                          tn3.get_weights()))
+        self.assertTrue(self.__check_if_weights_are_equal(tn1.get_weights(), tn3.get_weights()))
         # change weights on one module - another module should not change
         tn1.fc1.bias.data = torch.tensor([0.1])
-        self.assertFalse(self.__check_if_weights_are_equal(tn1.get_weights(),
-                                                           tn3.get_weights()))
+        self.assertFalse(self.__check_if_weights_are_equal(tn1.get_weights(), tn3.get_weights()))

     def test_TaylorNet_tie_weights(self):
         tn1 = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         tn2 = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         # because of randomness, actual weights should be different
-        self.assertFalse(self.__check_if_weights_are_equal(tn1.get_weights(),
-                                                           tn2.get_weights()))
+        self.assertFalse(self.__check_if_weights_are_equal(tn1.get_weights(), tn2.get_weights()))
         tn2.tie_weights_with(tn1, list(tn1.get_weights().keys()))
         # change weights on one module - another module should change too
         tn1.fc1.bias.data = torch.tensor([0.1])
-        self.assertTrue(self.__check_if_weights_are_equal(tn1.get_weights(),
-                                                          tn2.get_weights()))
+        self.assertTrue(self.__check_if_weights_are_equal(tn1.get_weights(), tn2.get_weights()))

     def test_tie_weights2(self):
         voc_size = 3
         dim = 2
-        embd = nemo.backends.pytorch.common.SequenceEmbedding(
-            voc_size=voc_size, hidden_size=dim)
-        proj = nemo.backends.pytorch.common.SequenceProjection(
-            from_dim=dim, to_dim=voc_size)
+        embd = nemo.backends.pytorch.common.SequenceEmbedding(voc_size=voc_size, hidden_size=dim)
+        proj = nemo.backends.pytorch.common.SequenceProjection(from_dim=dim, to_dim=voc_size)
         embd.tie_weights_with(
             proj,
             weight_names=["embedding.weight"],
-            name2name_and_transform={"embedding.weight": (
-                "projection.weight", WeightShareTransform.SAME)})
+            name2name_and_transform={"embedding.weight": ("projection.weight", WeightShareTransform.SAME,)},
+        )
         self.assertTrue(
-            np.array_equal(embd.embedding.weight.detach().numpy(),
-                           proj.projection.weight.detach().numpy()))
+            np.array_equal(embd.embedding.weight.detach().numpy(), proj.projection.weight.detach().numpy(),)
+        )
         was = embd.embedding.weight.detach().numpy()
-        embd.embedding.weight.data = torch.tensor(
-            np.random.randint(0, 10, (3, 2))*1.0)
+        embd.embedding.weight.data = torch.tensor(np.random.randint(0, 10, (3, 2)) * 1.0)
         after = embd.embedding.weight.detach().numpy()
         self.assertTrue(
-            np.array_equal(embd.embedding.weight.detach().numpy(),
-                           proj.projection.weight.detach().numpy()))
+            np.array_equal(embd.embedding.weight.detach().numpy(), proj.projection.weight.detach().numpy(),)
+        )
         self.assertFalse(np.array_equal(was, after))

     def test_set_weights(self):
         voc_size = 3
         dim = 2
-        embd = nemo.backends.pytorch.common.SequenceEmbedding(
-            voc_size=voc_size, hidden_size=dim)
-        weights = torch.tensor(np.random.randint(0, 10, (3, 2))*1.0)
+        embd = nemo.backends.pytorch.common.SequenceEmbedding(voc_size=voc_size, hidden_size=dim)
+        weights = torch.tensor(np.random.randint(0, 10, (3, 2)) * 1.0)
         name2weights = {"embedding.weight": (weights, True)}
         embd.set_weights(name2weight=name2weights)
-        self.assertTrue(np.array_equal(embd.embedding.weight.detach().numpy(),
-                                       weights.detach().numpy()))
+        self.assertTrue(np.array_equal(embd.embedding.weight.detach().numpy(), weights.detach().numpy(),))
         weights = torch.tensor(np.random.randint(0, 10, (3, 2)) * 1.0)
-        self.assertFalse(np.array_equal(embd.embedding.weight.detach().numpy(),
-                                        weights.detach().numpy()))
+        self.assertFalse(np.array_equal(embd.embedding.weight.detach().numpy(), weights.detach().numpy(),))

     def test_freeze_unfreeze_TrainableNM(self):
         with open("tests/data/jasper_smaller.yaml") as file:
@@ -133,82 +149,81 @@ def test_freeze_unfreeze_TrainableNM(self):
             featurizer_config=self.featurizer_config,
             manifest_filepath=self.manifest_filepath,
             labels=self.labels,
-            batch_size=4
+            batch_size=4,
         )
         pre_process_params = {
-            'int_values': False, 'frame_splicing': 1, 'features': 64,
-            'window_size': 0.02, 'n_fft': 512, 'dither': 1e-05,
-            'window': 'hann', 'sample_rate': 16000,
-            'normalize': 'per_feature', 'window_stride': 0.01}
-        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
-            **pre_process_params)
+            'int_values': False,
+            'frame_splicing': 1,
+            'features': 64,
+            'window_size': 0.02,
+            'n_fft': 512,
+            'dither': 1e-05,
+            'window': 'hann',
+            'sample_rate': 16000,
+            'normalize': 'per_feature',
+            'window_stride': 0.01,
+        }
+        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor']['features'],
-            **jasper_model_definition['JasperEncoder'])
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024,
-            num_classes=len(self.labels)
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
+            **jasper_model_definition['JasperEncoder'],
         )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
         jasper_encoder.freeze()
         jasper_encoder.unfreeze(set(['encoder.4.conv.1.weight']))
         jasper_decoder.unfreeze()
         # DAG
         audio_signal, a_sig_length, transcript, transcript_len = dl()
-        processed_signal, p_length = preprocessing(input_signal=audio_signal,
-                                                   length=a_sig_length)
+        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

-        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
-                                              length=p_length)
+        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
         # print(jasper_encoder)
         log_probs = jasper_decoder(encoder_output=encoded)
-        loss = ctc_loss(log_probs=log_probs,
-                        targets=transcript,
-                        input_length=encoded_len,
-                        target_length=transcript_len)
+        loss = ctc_loss(
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
+        )

         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'))
+            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+        )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [loss], callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 2, "lr": 0.0003})
+            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 2, "lr": 0.0003},
+        )

     def test_freeze_unfreeze_Wrapper(self):
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch,
-            placement=nemo.core.DeviceType.GPU,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, placement=nemo.core.DeviceType.GPU, create_tb_writer=False,
+        )

         dl_train = nemo.backends.pytorch.ZerosDataLayer(
             size=40,
-            dtype=[torch.FloatTensor,
-                   torch.LongTensor],
+            dtype=[torch.FloatTensor, torch.LongTensor],
             batch_size=4,
             output_ports={
-                "image": NeuralType({0: AxisType(BatchTag),
-                                     1: AxisType(ChannelTag, 3),
-                                     2: AxisType(HeightTag, 224),
-                                     3: AxisType(WidthTag, 224)}),
-                "label": NeuralType({0: AxisType(BatchTag)})
-            })
+                "image": NeuralType(
+                    {
+                        0: AxisType(BatchTag),
+                        1: AxisType(ChannelTag, 3),
+                        2: AxisType(HeightTag, 224),
+                        3: AxisType(WidthTag, 224),
+                    }
+                ),
+                "label": NeuralType({0: AxisType(BatchTag)}),
+            },
+        )

         # NOTICE: pretrain=True argument
-        resnet = neural_factory.get_module(name="resnet18",
-                                           params={"num_classes": 2},
-                                           collection="torchvision",
-                                           pretrained=True)
+        resnet = neural_factory.get_module(
+            name="resnet18", params={"num_classes": 2}, collection="torchvision", pretrained=True,
+        )

-        L_train = neural_factory.get_module(
-            name="CrossEntropyLoss", collection="toys",
-            params={})
+        L_train = neural_factory.get_module(name="CrossEntropyLoss", collection="toys", params={})

         # NOTICE: Freeze all Neural Module's weights
         resnet.freeze()
@@ -220,14 +235,13 @@ def test_freeze_unfreeze_Wrapper(self):
         train_loss = L_train(predictions=outputs, labels=labels)

         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[train_loss],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'))
+            tensors=[train_loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+        )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
         optimizer = neural_factory.get_trainer()
         optimizer.train(
-            [train_loss], callbacks=[callback],
-            optimizer="sgd",
-            optimization_params={"num_epochs": 2, "lr": 0.0003})
+            [train_loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 2, "lr": 0.0003},
+        )
diff --git a/tests/test_zeroDS.py b/tests/test_zeroDS.py
index 1958fc480f7e..c03aa1ab937c 100644
--- a/tests/test_zeroDS.py
+++ b/tests/test_zeroDS.py
@@ -18,21 +18,48 @@

 import os
 import tarfile
+
 import torch
 from ruamel.yaml import YAML

 import nemo
-from nemo.core.neural_types import *
-
 import nemo.collections.asr as nemo_asr
+from nemo.core.neural_types import *

 from .common_setup import NeMoUnitTest


 class TestZeroDL(NeMoUnitTest):
-    labels = ["'", "a", "b", "c", "d", "e", "f", "g", "h",
-              "i", "j", "k", "l", "m", "n", "o", "p", "q",
-              "r", "s", "t", "u", "v", "w", "x", "y", "z", " "]
+    labels = [
+        "'",
+        "a",
+        "b",
+        "c",
+        "d",
+        "e",
+        "f",
+        "g",
+        "h",
+        "i",
+        "j",
+        "k",
+        "l",
+        "m",
+        "n",
+        "o",
+        "p",
+        "q",
+        "r",
+        "s",
+        "t",
+        "u",
+        "v",
+        "w",
+        "x",
+        "y",
+        "z",
+        " ",
+    ]
     manifest_filepath = "tests/data/asr/an4_train.json"
     yaml = YAML(typ="safe")

@@ -51,31 +78,29 @@ def setUp(self) -> None:
     def test_simple_train(self):
         print("Simplest train test with ZeroDL")
         neural_factory = nemo.core.neural_factory.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, create_tb_writer=False
+        )
         trainable_module = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
         data_source = nemo.backends.pytorch.common.ZerosDataLayer(
             size=10000,
             dtype=torch.FloatTensor,
             batch_size=128,
             output_ports={
-                "x": NeuralType({
-                    0: AxisType(BatchTag),
-                    1: AxisType(ChannelTag, dim=1)}),
-                "y": NeuralType({
-                    0: AxisType(BatchTag),
-                    1: AxisType(ChannelTag, dim=1)})})
+                "x": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag, dim=1)}),
+                "y": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag, dim=1)}),
+            },
+        )
         loss = nemo.backends.pytorch.tutorials.MSELoss()
         x, y = data_source()
         y_pred = trainable_module(x=x)
         loss_tensor = loss(predictions=y_pred, target=y)

         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss_tensor],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'))
+            tensors=[loss_tensor], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+        )
         neural_factory.train(
-            [loss_tensor], callbacks=[callback],
-            optimization_params={"num_epochs": 3, "lr": 0.0003},
-            optimizer="sgd")
+            [loss_tensor], callbacks=[callback], optimization_params={"num_epochs": 3, "lr": 0.0003}, optimizer="sgd",
+        )

     def test_asr_with_zero_ds(self):
         print("Testing ASR NMs with ZeroDS and without pre-processing")
@@ -83,49 +108,46 @@ def test_asr_with_zero_ds(self):
             jasper_model_definition = self.yaml.load(file)

         dl = nemo.backends.pytorch.common.ZerosDataLayer(
-            size=100, dtype=torch.FloatTensor,
+            size=100,
+            dtype=torch.FloatTensor,
             batch_size=4,
             output_ports={
                 "processed_signal": NeuralType(
-                    {0: AxisType(BatchTag),
-                     1: AxisType(SpectrogramSignalTag, dim=64),
-                     2: AxisType(ProcessedTimeTag, dim=64)}),
-                "processed_length": NeuralType(
-                    {0: AxisType(BatchTag)}),
-                "transcript": NeuralType({0: AxisType(BatchTag),
-                                          1: AxisType(TimeTag, dim=64)}),
-                "transcript_length": NeuralType({0: AxisType(BatchTag)})
-            })
+                    {
+                        0: AxisType(BatchTag),
+                        1: AxisType(SpectrogramSignalTag, dim=64),
+                        2: AxisType(ProcessedTimeTag, dim=64),
+                    }
+                ),
+                "processed_length": NeuralType({0: AxisType(BatchTag)}),
+                "transcript": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag, dim=64)}),
+                "transcript_length": NeuralType({0: AxisType(BatchTag)}),
+            },
+        )

         jasper_encoder = nemo_asr.JasperEncoder(
-            feat_in=jasper_model_definition[
-                'AudioToMelSpectrogramPreprocessor']['features'],
-            **jasper_model_definition["JasperEncoder"])
-        jasper_decoder = nemo_asr.JasperDecoderForCTC(
-            feat_in=1024,
-            num_classes=len(self.labels)
+            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
+            **jasper_model_definition["JasperEncoder"],
         )
+        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
         ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

         # DAG
         processed_signal, p_length, transcript, transcript_len = dl()

-        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
-                                              length=p_length)
+        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
         # print(jasper_encoder)
         log_probs = jasper_decoder(encoder_output=encoded)
-        loss = ctc_loss(log_probs=log_probs,
-                        targets=transcript,
-                        input_length=encoded_len,
-                        target_length=transcript_len)
+        loss = ctc_loss(
+            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
+        )

         callback = nemo.core.SimpleLossLoggerCallback(
-            tensors=[loss],
-            print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'))
+            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
+        )
         # Instantiate an optimizer to perform `train` action
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None,
-            create_tb_writer=False)
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+        )
         neural_factory.train(
-            [loss], callbacks=[callback],
-            optimization_params={"num_epochs": 2, "lr": 0.0003},
-            optimizer="sgd")
+            [loss], callbacks=[callback], optimization_params={"num_epochs": 2, "lr": 0.0003}, optimizer="sgd",
+        )