From b16d356221b1fd74a3007f0fcbeeff295ce988f4 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 6 May 2020 13:54:47 -0700 Subject: [PATCH 01/40] Rebase off of master; add new working prototype of loss callback Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 298 ++++++++++++++++++++ nemo/backends/pytorch/actions.py | 28 +- nemo/core/callbacks.py | 39 +++ nemo/core/neural_factory.py | 70 ++++- nemo/core/neural_types/__init__.py | 1 + nemo/core/neural_types/neural_type.py | 9 +- nemo/core/neural_types/nmtensor_registry.py | 87 ++++++ nemo/utils/app_state.py | 23 +- 8 files changed, 525 insertions(+), 30 deletions(-) create mode 100755 examples/asr/jasper_an4_debug.py create mode 100755 nemo/core/neural_types/nmtensor_registry.py diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py new file mode 100755 index 000000000000..e19ea0117f62 --- /dev/null +++ b/examples/asr/jasper_an4_debug.py @@ -0,0 +1,298 @@ +# Copyright (c) 2019 NVIDIA Corporation +import argparse +import math +import os +from functools import partial + +from ruamel.yaml import YAML + +import nemo +import nemo.collections.asr as nemo_asr +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.helpers import ( + monitor_asr_train_progress, + post_process_predictions, + post_process_transcripts, + process_evaluation_batch, + process_evaluation_epoch, + word_error_rate, +) +from nemo.utils.lr_policies import CosineAnnealing + +logging = nemo.logging + + +def create_dags(model_config_file, vocab, args, nf): + + # Create a data_layer for training. + data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( + model_config_file, + "AudioToTextDataLayer_train", + overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, + ) + + num_samples = len(data_layer) + steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) + total_steps = steps_per_epoch * args.num_epochs + logging.info("Train samples=", num_samples, "num_steps=", total_steps) + + # # Create a data_layer for evaluation. + # data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( + # model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, + # ) + + # num_samples = len(data_layer_eval) + # logging.info(f"Eval samples={num_samples}") + + # Instantiate data processor. + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( + model_config_file, "AudioToMelSpectrogramPreprocessor" + ) + + # Instantiate JASPER encoder-decoder modules. + jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") + jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( + model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} + ) + + # Instantiate losses. + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) + greedy_decoder = nemo_asr.GreedyCTCDecoder() + + # Create a training graph. + audio, audio_len, transcript, transcript_len = data_layer() + processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) + encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) + log_probs = jasper_decoder(encoder_output=encoded) + predictions = greedy_decoder(log_probs=log_probs) + loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + + # # Create an evaluation graph. 
+ # audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() + # processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) + # encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) + # log_probs_e = jasper_decoder(encoder_output=encoded_e) + # predictions_e = greedy_decoder(log_probs=log_probs_e) + # loss_e = ctc_loss( + # log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + # ) + logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) + + # Callbacks to print info to console and Tensorboard. + # train_callback = nemo.core.SimpleLossLoggerCallback( + # tensors=[loss, predictions, transcript, transcript_len], + # print_func=partial(monitor_asr_train_progress, labels=vocab), + # get_tb_values=lambda x: [["loss", x[0]]], + # tb_writer=nf.tb_writer, + # ) + + # loss.rename("test") + # train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["test"]) + + train_callback = nemo.core.SimpleLossLogger() + + # checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) + + # eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e] + # eval_callback = nemo.core.EvaluatorCallback( + # eval_tensors=eval_tensors, + # user_iter_callback=partial(process_evaluation_batch, labels=vocab), + # user_epochs_done_callback=process_evaluation_epoch, + # eval_step=args.eval_freq, + # tb_writer=nf.tb_writer, + # eval_at_start=not args.do_not_eval_at_start, + # ) + # callbacks = [train_callback, checkpointer_callback, eval_callback] + callbacks = [train_callback] + + # Return entities required by the actual training. + return ( + loss, + # eval_tensors, + callbacks, + total_steps, + # log_probs_e, + # encoded_len_e, + ) + + +def main(): + parser = argparse.ArgumentParser( + parents=[nm_argparse.NemoArgParser()], description='AN4 ASR', conflict_handler='resolve', + ) + + # Overwrite default args + parser.add_argument("--train_dataset", type=str, help="training dataset path") + parser.add_argument("--eval_datasets", type=str, help="validation dataset path") + + # Create new args + # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str) + parser.add_argument("--batch_size", default=48, type=int, help="size of the training batch") + parser.add_argument("--lm", default=None, type=str) + parser.add_argument("--test_after_training", action='store_true') + parser.add_argument("--momentum", type=float) + parser.add_argument("--beta1", default=0.95, type=float) + parser.add_argument("--beta2", default=0.25, type=float) + parser.add_argument("--do_not_eval_at_start", action='store_true') + parser.set_defaults( + model_config="./configs/jasper_an4.yaml", + train_dataset="~/TestData/an4_dataset/an4_train.json", + eval_datasets="~/TestData/an4_dataset/an4_val.json", + work_dir="./tmp", + optimizer="novograd", + num_epochs=50, + lr=0.02, + weight_decay=0.005, + checkpoint_save_freq=1000, + eval_freq=100, + amp_opt_level="O1", + ) + + args = parser.parse_args() + betas = (args.beta1, args.beta2) + + wer_thr = 0.20 + beam_wer_thr = 0.15 + + nf = nemo.core.NeuralModuleFactory( + local_rank=args.local_rank, + files_to_copy=[__file__], + optimization_level=args.amp_opt_level, + random_seed=0, + log_dir=args.work_dir, + create_tb_writer=True, + cudnn_benchmark=args.cudnn_benchmark, + ) + tb_writer = nf.tb_writer + checkpoint_dir = nf.checkpoint_dir + + # Load model definition + yaml = 
YAML(typ="safe") + with open(args.model_config) as f: + jasper_params = yaml.load(f) + # Get vocabulary. + vocab = jasper_params['labels'] + + # (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags( + # args.model_config, vocab, args, nf + # ) + + loss, callbacks, total_steps = create_dags(args.model_config, vocab, args, nf) + + nf.train( + tensors_to_optimize=[loss], + callbacks=callbacks, + optimizer=args.optimizer, + lr_policy=CosineAnnealing(total_steps=total_steps, min_lr=args.lr / 100), + optimization_params={ + "num_epochs": args.num_epochs, + "max_steps": args.max_steps, + "lr": args.lr, + "momentum": args.momentum, + "betas": betas, + "weight_decay": args.weight_decay, + "grad_norm_clip": None, + }, + batches_per_step=args.iter_per_step, + amp_max_loss_scale=256.0, + # synced_batchnorm=(nf.global_rank is not None), + ) + + # if args.test_after_training: + # logging.info("Testing greedy and beam search with LM WER.") + # # Create BeamSearch NM + # if nf.world_size > 1 or args.lm is None: + # logging.warning("Skipping beam search WER as it does not work if doing distributed training.") + # else: + # beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( + # vocab=vocab, beam_width=64, alpha=2.0, beta=1.5, lm_path=args.lm, num_cpus=max(os.cpu_count(), 1), + # ) + # beam_predictions = beam_search_with_lm(log_probs=log_probs_e, log_probs_length=encoded_len_e) + # eval_tensors.append(beam_predictions) + + # evaluated_tensors = nf.infer(eval_tensors) + # if nf.global_rank in [0, None]: + # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) + # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) + # wer = word_error_rate(hypotheses=greedy_hypotheses, references=references) + # logging.info("Greedy WER: {:.2f}%".format(wer * 100)) + # if wer > wer_thr: + # nf.sync_all_processes(False) + # raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :" f"than {wer_thr * 100:.2f}%") + # nf.sync_all_processes() + + # if nf.world_size == 1 and args.lm is not None: + # beam_hypotheses = [] + # # Over mini-batch + # for i in evaluated_tensors[-1]: + # # Over samples + # for j in i: + # beam_hypotheses.append(j[0][1]) + + # beam_wer = word_error_rate(hypotheses=beam_hypotheses, references=references) + # logging.info("Beam WER {:.2f}%".format(beam_wer * 100)) + # assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > than {:.2f}%".format( + # beam_wer * 100, beam_wer_thr * 100 + # ) + # assert beam_wer <= wer, "Final eval beam WER > than the greedy WER." 
+ + # # Reload model weights and train for extra 10 epochs + # checkpointer_callback = nemo.core.CheckpointCallback( + # folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, + # ) + + # # Distributed Data Parallel changes the underlying class so we need + # # to reinstantiate Encoder and Decoder + # args.num_epochs += 10 + # previous_step_count = total_steps + # loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf) + + # nf.reset_trainer() + # nf.train( + # tensors_to_optimize=[loss], + # callbacks=callbacks, + # optimizer=args.optimizer, + # lr_policy=CosineAnnealing(warmup_steps=previous_step_count, total_steps=total_steps), + # optimization_params={ + # "num_epochs": args.num_epochs, + # "lr": args.lr / 100, + # "momentum": args.momentum, + # "betas": betas, + # "weight_decay": args.weight_decay, + # "grad_norm_clip": None, + # }, + # reset=True, + # amp_max_loss_scale=256.0, + # # synced_batchnorm=(nf.global_rank is not None), + # ) + + # evaluated_tensors = nf.infer(eval_tensors) + # if nf.global_rank in [0, None]: + # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) + # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) + # wer_new = word_error_rate(hypotheses=greedy_hypotheses, references=references) + # logging.info("New greedy WER: {:.2f}%".format(wer_new * 100)) + # if wer_new > wer * 1.1: + # nf.sync_all_processes(False) + # raise ValueError( + # f"Fine tuning: new WER {wer_new * 100:.2f}% > than the " f"previous WER {wer * 100:.2f}%" + # ) + # nf.sync_all_processes() + + # # Open the log file and ensure that epochs is strictly increasing + # if nf._exp_manager.log_file: + # epochs = [] + # with open(nf._exp_manager.log_file, "r") as log_file: + # line = log_file.readline() + # while line: + # index = line.find("Starting epoch") + # if index != -1: + # epochs.append(int(line[index + len("Starting epoch") :])) + # line = log_file.readline() + # for i, e in enumerate(epochs): + # if i != e: + # raise ValueError("Epochs from logfile was not understood") + + +if __name__ == "__main__": + main() diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 172b2131990c..7663beea9293 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -20,8 +20,8 @@ from nemo.backends.pytorch.nm import DataLayerNM, TrainableNM from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor -from nemo.core.callbacks import ActionCallback, EvaluatorCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization +from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback +from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState from nemo.core.neural_types import * from nemo.utils.helpers import get_checkpoint_from_dir @@ -450,10 +450,10 @@ def __nm_graph_forward_pass( if nm_tensor is None: continue t_name = nm_tensor.unique_name - if t_name not in registered_tensors: + if t_name not in registered_tensors or registered_tensors[t_name] is None: registered_tensors[t_name] = t_tensor else: - raise ValueError("A NMTensor was produced twice in " f"the same DAG. {t_name}") + raise ValueError(f"A NMTensor was produced twice in the same DAG. 
{t_name}") @staticmethod def pad_tensor(t: torch.Tensor, target_size: torch.Size): @@ -1110,6 +1110,7 @@ def train( gradient_predivide=False, amp_max_loss_scale=2.0 ** 24, ): + self._training_state = TrainingState() # Analyse the arguments passed to train. if tensors_to_optimize is not None and training_graph is not None: raise ValueError("Cannot pass both `tensors_to_optimize` and `training_graph` to the train() function") @@ -1204,7 +1205,7 @@ def train( # callbacks setup if callbacks is not None: for callback in callbacks: - if not isinstance(callback, ActionCallback): + if not isinstance(callback, ActionCallback) and not isinstance(callback, NeMoCallback): raise ValueError("A callback was received that was not a child of ActionCallback") elif isinstance(callback, SimpleLossLoggerCallback): if logging_callchain: @@ -1407,20 +1408,20 @@ def train( else: tensors.append(d) - registered_tensors = { - t.unique_name: d for t, d in zip(curr_call_chain[0][2].values(), tensors) if t is not None - } + for t, d in zip(curr_call_chain[0][2].values(), tensors): + if t is not None: + self.training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) self.__nm_graph_forward_pass( - call_chain=curr_call_chain, registered_tensors=registered_tensors, + call_chain=curr_call_chain, registered_tensors=self.training_state.tensor_dict, ) curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: if ( - torch.isnan(registered_tensors[tensor.unique_name]).any() - or torch.isinf(registered_tensors[tensor.unique_name]).any() + torch.isnan(self.training_state.tensor_dict[tensor.unique_name]).any() + or torch.isinf(self.training_state.tensor_dict[tensor.unique_name]).any() ): if ( (stop_on_nan_loss) @@ -1436,7 +1437,7 @@ def train( ) else: logging.warning('Loss is NaN or inf, continuing training') - final_loss += registered_tensors[tensor.unique_name] + final_loss += self.training_state.tensor_dict[tensor.unique_name] if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: @@ -1479,10 +1480,11 @@ def train( batch_counter = 0 # Register iteration end with callbacks self._update_callbacks( - callbacks=callbacks, registered_tensors=registered_tensors, + callbacks=callbacks, registered_tensors=self.training_state.tensor_dict, final_loss=final_loss ) self._perform_on_iteration_end(callbacks=callbacks) self.step += 1 + self.training_state.clear_dict() # End of epoch for loop # Register epochs end with callbacks self._perform_on_epoch_end(callbacks=callbacks) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index e465bf5bf95a..1161cef57ee2 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -37,6 +37,45 @@ logging = nemo.logging +class NeMoCallback(ABC): + def on_action_start(self, state): + pass + + def on_action_end(self, state): + pass + + def on_epoch_start(self, state): + pass + + def on_epoch_end(self, state): + pass + + def on_iteration_start(self, state): + pass + + def on_iteration_end(self, state): + pass + + +class SimpleLossLogger(NeMoCallback): + def __init__(self, step_freq=100, tensors_to_log=["loss"]): + # Step_freq: how often logs are printed + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): + # #tensors_to_log: List of keys into state that will be logged + + def 
on_iteration_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + # except KeyError: + # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " + # f"Current state tensors include {state['tensors'].tensor_list()}") + + class ActionCallback(ABC): """Abstract interface for callbacks. """ diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 4402ded7b927..37dac0e678d8 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -36,6 +36,7 @@ from ..utils import ExpManager from .callbacks import ActionCallback, EvaluatorCallback from .neural_types import * +from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated logging = nemo.logging @@ -84,6 +85,26 @@ class DeviceType(Enum): AllGpu = 3 +class TrainingState: + def __init__(self): + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + + def tensor_list(self): + return self.tensor_dict.keys() + + def clear_dict(self): + for name in self.tensor_dict: + self.tensor_dict[name] = None + + def set_tensor(self, tensor, value): + self.tensor_dict[tensor.unique_name] = value + + def get_tensor(self, name): + unique_name = AppState().tensor_names[name] + return self.tensor_dict[unique_name] + + class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" @@ -93,6 +114,15 @@ def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxpr self._optim_level = optimization_level self.step = None self.epoch_num = None + self._training_state = TrainingState() + + @property + def state(self): + return {"step": self.step, "tensors": self.training_state} + + @property + def training_state(self): + return self._training_state @property def local_rank(self): @@ -201,45 +231,67 @@ def _perform_on_iteration_start(self, callbacks): # to be a list of ActionCallback objects if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_iteration_start() + if isinstance(callback, ActionCallback): + callback.on_iteration_start() + else: + callback.on_iteration_start(self.state) def _perform_on_iteration_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_iteration_end() + if isinstance(callback, ActionCallback): + callback.on_iteration_end() + else: + callback.on_iteration_end(self.state) def _perform_on_action_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_action_start() + if isinstance(callback, ActionCallback): + callback.on_action_start() + else: + callback.on_action_start(self.state) def _perform_on_action_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_action_end() + if isinstance(callback, ActionCallback): + callback.on_action_end() + else: + callback.on_action_end(self.state) def _perform_on_epoch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_epoch_start() + if isinstance(callback, ActionCallback): + callback.on_epoch_start() + else: + 
callback.on_epoch_start(self.state) def _perform_on_epoch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.on_epoch_end() + if isinstance(callback, ActionCallback): + callback.on_epoch_end() + else: + callback.on_epoch_end(self.state) def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback.action = self + if isinstance(callback, ActionCallback): + callback.action = self def _update_callbacks( - self, callbacks=None, registered_tensors=None, + self, callbacks=None, registered_tensors=None, final_loss=None, ): # if self.local_rank is None or self.local_rank == 0: if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - callback._registered_tensors = registered_tensors + if isinstance(callback, ActionCallback): + callback._registered_tensors = registered_tensors + else: # For now, we can use the old callback function. In the future we should improve this + self.training_state.tensor_dict["loss"] = final_loss def _str_to_opt_level(opt_str: str) -> Optimization: diff --git a/nemo/core/neural_types/__init__.py b/nemo/core/neural_types/__init__.py index 1fb5bf349076..0ae947d90137 100644 --- a/nemo/core/neural_types/__init__.py +++ b/nemo/core/neural_types/__init__.py @@ -19,3 +19,4 @@ from nemo.core.neural_types.comparison import * from nemo.core.neural_types.elements import * from nemo.core.neural_types.neural_type import * +from nemo.core.neural_types.nmtensor_registry import NmTensorNameRegistry diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d503f8b78cf1..d78d0dc9923c 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -49,9 +49,9 @@ class NeuralType(object): def __str__(self): if self.axes is not None: - return f"axes: {self.axes}; " f" elements_type: {self.elements_type.__class__.__name__}" + return f"axes: {self.axes}; elements_type: {self.elements_type.__class__.__name__}" else: - return f"axes: None; " f" elements_type: {self.elements_type.__class__.__name__}" + return f"axes: None; elements_type: {self.elements_type.__class__.__name__}" def __init__(self, axes: Optional[Tuple] = None, elements_type: ElementType = VoidType(), optional=False): if not isinstance(elements_type, ElementType): @@ -223,6 +223,7 @@ def __init__(self, producer, producer_args, output_port_name, ntype=None): self._step_number = AppState().active_graph.step_number # List of tuples (step number, module name, input port name) self._consumers = [] + AppState().tensor_names.register(self) @property def producer(self): @@ -323,6 +324,10 @@ def unique_name(self): raise ValueError("This NmTensor does not have a unique name") return f"{self._output_port_name}~~~{self._producer_name}~~~{self._uuid}" + def rename(self, new_name): + """TODO + """ + AppState().tensor_names.rename_NmTensor(self, new_name) class NeuralTypeError(Exception): """Base class for neural type related exceptions.""" diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py new file mode 100755 index 000000000000..c439d4949c9d --- /dev/null +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -0,0 +1,87 @@ +# ============================================================================= +# Copyright (c) 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + + +class NmTensorNameRegistry: + def __init__(self): + """ + Constructor. Initializes the manager. Sets active graph to None. + + TODO: Should probably be a property of a graph + """ + # Create the nmtensor_naming_dict + # which contains a mapping of str to NMTensor.unique_name + self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' + self._nmtensor_uniname_set = set(["loss"]) + + # def summary(self): + # """ Prints a nice summary. """ + # desc = "" + # for graph in self: + # desc = desc + "`{}`: {}\n".format(graph.name, graph) + # return desc + + @property + def unique_names(self): + return self._nmtensor_uniname_set + + # def register(self, tensor: NmTensor): + def register(self, tensor): + """TODO + """ + + # Check if object is already in a set. + if tensor.unique_name in self._nmtensor_uniname_set: + pass + + # Finally, add object to the set. + self._nmtensor_uniname_set.add(tensor.unique_name) + + # def rename_NmTensor(self, tensor: NmTensor, new_name: str): + def rename_NmTensor(self, tensor, new_name: str): + """ TODO + """ + # Find old name if exists + old_name = tensor.unique_name + for custom_name, unique_name in self._nmtensor_naming_dict.items(): + if unique_name == tensor.unique_name: + old_name = custom_name + + if old_name != tensor.unique_name: + del self._nmtensor_naming_dict[old_name] + + if new_name in self._nmtensor_naming_dict: + raise KeyError(f"{new_name} already exists in current graph. Please use a unique name") + self._nmtensor_naming_dict[new_name] = tensor.unique_name + + def __getitem__(self, key): + """ + Object getter function. + + Args: + key: Object name. + + Returns: + Object associated with the key. + """ + # Search for an object with a given name. + if key in self._nmtensor_naming_dict: + key = self._nmtensor_naming_dict[key] + + if key in self._nmtensor_uniname_set: + return key + + raise KeyError("A NmTensor with name `{}` don't exists!".format(key)) diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index d77daa133adf..6183526b87fe 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -47,6 +47,17 @@ def __init__(self, device=None): self._module_registry = ObjectRegistry("module") # Create graph manager (registry with some additional functionality). self._neural_graph_manager = NeuralGraphManager() + # Create NmTensor registry + self._nmtensor_name_registry = nemo.core.neural_types.NmTensorNameRegistry() + + @property + def tensor_names(self): + """ Property returning the existing modules. + + Returns: + Existing modules (a set object). + """ + return self._nmtensor_name_registry @property def modules(self): @@ -68,14 +79,14 @@ def graphs(self): return self._neural_graph_manager def register_module(self, module, name: str) -> str: - """ - Registers a module using the provided name. + """ + Registers a module using the provided name. If name is none - generates a new unique name. 
- + Args: module: A Neural Module object to be registered. name: A "proposition" of module name. - + Returns: A unique name (proposition or newly generated name). """ @@ -85,11 +96,11 @@ def register_graph(self, graph, name: str) -> str: """ Registers a new graph using the provided name. If name is none - generates a new unique name. - + Args: graph: A Neural Graph object to be registered. name: A "proposition" of graph name. - + Returns: A unique name (proposition or newly generated name). """ From 8024454fb9cd0afbd8af50163bd950f6421fd27d Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 6 May 2020 17:22:59 -0700 Subject: [PATCH 02/40] first working hack of computing uncomputed tensors Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 6 +- nemo/backends/pytorch/actions.py | 292 ++++++-------------- nemo/core/callbacks.py | 15 + nemo/core/neural_factory.py | 128 ++++++++- nemo/core/neural_types/nmtensor_registry.py | 11 +- nemo/utils/neural_graph/object_registry.py | 12 +- 6 files changed, 246 insertions(+), 218 deletions(-) diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py index e19ea0117f62..f06055baec8a 100755 --- a/examples/asr/jasper_an4_debug.py +++ b/examples/asr/jasper_an4_debug.py @@ -65,7 +65,7 @@ def create_dags(model_config_file, vocab, args, nf): encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len) # # Create an evaluation graph. # audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() @@ -89,7 +89,9 @@ def create_dags(model_config_file, vocab, args, nf): # loss.rename("test") # train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["test"]) - train_callback = nemo.core.SimpleLossLogger() + # train_callback = nemo.core.SimpleLossLogger() + predictions.rename("test") + train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["loss", "test"]) # checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7663beea9293..e737b08997c2 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,8 +21,10 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState +from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves from nemo.core.neural_types import * +from nemo.utils.app_state import AppState +from nemo.utils.decorators import deprecated from nemo.utils.helpers import get_checkpoint_from_dir # these imports will happen on as-needed basis @@ -87,137 +89,38 @@ def __init__( local_rank=local_rank, global_rank=global_rank, optimization_level=optimization_level, ) - # will be [unique_instance_id -> (NMModule, PTModule)] - self.module_reference_table = {} self.step = 0 self.epoch_num = 0 self.optimizers = [] self.tb_writer = tb_writer - 
self._modules = set() self.cache = None self.amp_initialized = False - - @property - def modules(self): - return self._modules + self.ddp_initialized = False + self.ddp_module_dict = {} def __get_top_sorted_modules_and_dataloader(self, hook): + """ TODO """ - Constructs DAG leading to hook and creates its topological order. - It also populates self.module_reference_table. - Args: - hook: an NmTensor or a list of NmTensors representing leaf nodes - in DAG - - Returns: - list of modules with their call arguments and outputs, and dataset - """ - - def create_node(producer, producer_args): - if producer_args is None: - return tuple((producer, ())) - else: - return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) - - def is_in_degree_zero(node, processed_nodes): - """A node has in degree of zero""" - if node[1] == (): - return True - for portname, nmtensor in node[1]: - nd = create_node(nmtensor.producer, nmtensor.producer_args) - if nd not in processed_nodes: - return False - return True - - hooks = hook if isinstance(hook, list) else [hook] - - # ensures that no tensors are processed twice - processed_nmtensors = set() - - indices_to_remove = [] - # Check for duplicates in hook - for i, nmtensor in enumerate(hook): - if nmtensor in processed_nmtensors: - indices_to_remove.append(i) - else: - processed_nmtensors.add(nmtensor) - - for i in reversed(indices_to_remove): - hook.pop(i) - - _top_sorted_modules = [] - all_nodes = {} - - # extract all nodes to all_nodes set - hooks_lst = list(hooks) - while len(hooks_lst) > 0: - # take nmtensor from the end of the list - nmtensor = hooks_lst.pop() - - node = create_node(nmtensor.producer, nmtensor.producer_args) - # Store nmtensor as an output of its producer - # first make sure all keys are present per output port - # and nm is inside all_nodes - if node not in all_nodes: - all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} - # second, populate output port with current nmtensor - # where applicable - all_nodes[node][nmtensor.name] = nmtensor - processed_nmtensors.add(nmtensor) - if nmtensor.producer_args is not None and nmtensor.producer_args != {}: - for _, new_nmtensor in nmtensor.producer_args.items(): - if new_nmtensor not in processed_nmtensors: - # put in the start of list - hooks_lst.insert(0, new_nmtensor) - - all_node_with_output = [] - # Iterate over all_nodes to create new nodes that include its output - # now all nodes have (module, input tensors, output tensors) - for node in all_nodes: - all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) - - processed_nodes = [] - while len(all_node_with_output) > 0: - for node in all_node_with_output.copy(): - # if node's in_degree is zero it can be added to - # _top_sorted_modules - # this will also reduce in_degree of its children - if is_in_degree_zero(node, processed_nodes): - _top_sorted_modules.append(node) - processed_nodes.append((node[0], node[1])) - all_node_with_output.remove(node) - - # Create top_sorted_modules aka callchain - top_sorted_modules = [] - for i, m in enumerate(_top_sorted_modules): - top_sorted_modules.append((m[0], dict(m[1]), m[2])) - # Ensure that there is only one dataset in callchain - if i > 0 and isinstance(m[0], DataLayerNM): - raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") + top_sorted_modules = topological_sort_from_leaves(hook) if not isinstance(top_sorted_modules[0][0], DataLayerNM): raise ValueError("The first module in your DAG was not a DataLayer NeuralModule.") 
tdataset = top_sorted_modules[0][0].dataset - # populate self.module_reference_table for m in top_sorted_modules: if m[0].factory is None and self._local_rank is not None: raise ValueError( - "Neural module {0} was created without " - "NeuralModuleFactory, but you are trying to" - "run in distributed mode. Please instantiate" - "NeuralModuleFactory first and pass its " - "instance as `factory` parameter to all your" - "Neural Module objects." - "".format(str(m[0])) + "Neural module {0} was created without NeuralModuleFactory, but you are trying to run in " + "distributed mode. Please instantiate NeuralModuleFactory first and pass its instance as " + "`factory` parameter to all your Neural Module objects.".format(str(m[0])) ) - key = m[0].unique_instance_id - if key not in self.module_reference_table: - if isinstance(m[0], TrainableNeuralModuleWrapper): - self.module_reference_table[key] = (m[0], m[0]._pt_module) - else: - self.module_reference_table[key] = (m[0], m[0]) + # key = m[0].unique_instance_id + # if key not in self.module_reference_table: + # if isinstance(m[0], TrainableNeuralModuleWrapper): + # self.module_reference_table[key] = (m[0], m[0]._pt_module) + # else: + # self.module_reference_table[key] = (m[0], m[0]) return top_sorted_modules, tdataset @@ -372,10 +275,10 @@ def __initialize_amp( if optim_level == Optimization.mxprO0: return optimizer - if len(self.modules) < 1: + if len(AppState().modules) < 1: raise ValueError("There were no modules to initialize") pt_modules = [] - for module in self.modules: + for module in AppState().modules: if isinstance(module, nn.Module): pt_modules.append(module) elif isinstance(module, TrainableNeuralModuleWrapper): @@ -391,6 +294,9 @@ def __initialize_amp( self.amp_initialized = True return optimizer + def nm_graph_forward_pass(self, callchain, registered_tensors): + self.__nm_graph_forward_pass(callchain, registered_tensors) + def __nm_graph_forward_pass( self, call_chain, registered_tensors, mode=OperationMode.training, use_cache=False, ): @@ -409,8 +315,9 @@ def __nm_graph_forward_pass( continue call_args = call_chain[ind][1] # module = call_chain[ind][0] + # pmodule = self.module_reference_table[m_id][1] m_id = call_chain[ind][0].unique_instance_id - pmodule = self.module_reference_table[m_id][1] + pmodule = self.ddp_module_dict[m_id] if self.ddp_initialized else call_chain[ind][0] # if self._local_rank is not None: # if isinstance(pmodule, DDP): @@ -436,10 +343,11 @@ def __nm_graph_forward_pass( key = nmtensor.unique_name call_set[tensor_name] = registered_tensors[key] # actual PyTorch module call with signature - if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): - new_tensors = pmodule(**call_set) - else: - new_tensors = pmodule(force_pt=True, **call_set) + # if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): + # new_tensors = pmodule(**call_set) + # else: + # new_tensors = pmodule(force_pt=True, **call_set) + new_tensors = pmodule(force_pt=True, **call_set) if not isinstance(new_tensors, List): if not isinstance(new_tensors, tuple): @@ -925,31 +833,6 @@ def _check_tuples(list_of_tuples): return False return True - def _get_all_modules(self, training_loop, callbacks, logging_callchain=None): - """Gets all neural modules that will be used by train() and eval() via - EvaluatorCallbacks. 
Saves all modules to self.modules - """ - # If there is a SimpleLossLoggerCallback, create an logger_callchain - # with all callchains from training_loop and - # SimpleLossLoggerCallback.tensors - if logging_callchain: - for module in logging_callchain: - self.modules.add(module[0]) - - # Else grab all callchains from training_loop - else: - for step in training_loop: - for module in step[2]: - self.modules.add(module[0]) - - # Lastly, grab all eval modules - if callbacks is not None: - for callback in callbacks: - if isinstance(callback, EvaluatorCallback): - (callchain, _,) = self.__get_top_sorted_modules_and_dataloader(hook=callback.eval_tensors) - for module in callchain: - self.modules.add(module[0]) - @staticmethod def __module_export(module, output, d_format: DeploymentFormat, input_example=None, output_example=None): # Check if output already exists @@ -1217,8 +1100,6 @@ def train( all_tensors = all_tensors + step[1] (logging_callchain, _,) = self.__get_top_sorted_modules_and_dataloader(hook=all_tensors) - self._get_all_modules(training_loop, callbacks, logging_callchain) - # Intialize Amp if needed if self._optim_level in AmpOptimizations: # Store mapping of self.optimizers to optimizer in callchain @@ -1270,67 +1151,72 @@ def train( else: train_sampler = None - for train_iter in training_loop: - call_chain = train_iter[2] - for i in range(1, len(call_chain) - 1): - key = call_chain[i][0].unique_instance_id - pmodule = self.module_reference_table[key][1] - num_trainable_weights = self.module_reference_table[key][1].num_weights - if ( - not isinstance(pmodule, DDP) - and isinstance(pmodule, torch.nn.Module) - and num_trainable_weights > 0 - ): - # gpf = 1 - # if gradient_predivide: - # gpf = dist.get_world_size() - # pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Old Apex Method - - # Per pytorch docs, convert sync bn prior to DDP - if synced_batchnorm: - world_size = dist.get_world_size() - sync_batchnorm_group = None - if synced_batchnorm_groupsize > 0: - if world_size % synced_batchnorm_groupsize != 0: - raise ValueError( - f"Synchronized batch norm group size ({synced_batchnorm_groupsize}) must be 0" - f" or divide total number of GPUs ({world_size})." - ) - # Find ranks of other nodes in the same batchnorm group - rank = torch.distributed.get_rank() - group = rank // synced_batchnorm_groupsize - group_rank_ids = range( - group * synced_batchnorm_groupsize, (group + 1) * synced_batchnorm_groupsize + # for train_iter in training_loop: + # call_chain = train_iter[2] + # for i in range(1, len(call_chain) - 1): + # key = call_chain[i][0].unique_instance_id + # pmodule = self.module_reference_table[key][1] + # num_trainable_weights = self.module_reference_table[key][1].num_weights + self.ddp_initialized = True + for module in AppState().modules: + key = module.unique_instance_id + num_trainable_weights = module.num_weights + if ( + not isinstance(module, DDP) + and isinstance(module, torch.nn.Module) + and num_trainable_weights > 0 + ): + # gpf = 1 + # if gradient_predivide: + # gpf = dist.get_world_size() + # pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Old Apex Method + + # Per pytorch docs, convert sync bn prior to DDP + if synced_batchnorm: + world_size = dist.get_world_size() + sync_batchnorm_group = None + if synced_batchnorm_groupsize > 0: + if world_size % synced_batchnorm_groupsize != 0: + raise ValueError( + f"Synchronized batch norm group size ({synced_batchnorm_groupsize}) must be 0" + f" or divide total number of GPUs ({world_size})." 
) - sync_batchnorm_group = torch.distributed.new_group(group_rank_ids) - - pmodule = nn.SyncBatchNorm.convert_sync_batchnorm( - pmodule, process_group=sync_batchnorm_group + # Find ranks of other nodes in the same batchnorm group + rank = torch.distributed.get_rank() + group = rank // synced_batchnorm_groupsize + group_rank_ids = range( + group * synced_batchnorm_groupsize, (group + 1) * synced_batchnorm_groupsize ) + sync_batchnorm_group = torch.distributed.new_group(group_rank_ids) - # By default, disable broadcast_buffers. This disables batch norm synchronization on forward - # pass - pmodule = DDP( - pmodule, device_ids=[self.local_rank], broadcast_buffers=False, find_unused_parameters=True + module = nn.SyncBatchNorm.convert_sync_batchnorm( + module, process_group=sync_batchnorm_group ) - # # Convert batchnorm modules to synced if applicable - # if synced_batchnorm and isinstance(pmodule, torch.nn.Module): - # world_size = dist.get_world_size() - # if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: - # raise ValueError( - # f"Synchronized batch norm group size" - # f" ({synced_batchnorm_groupsize}) must be 0" - # f" or divide total number of GPUs" - # f" ({world_size})." - # ) - # process_group = create_syncbn_process_group(synced_batchnorm_groupsize) - # pmodule = convert_syncbn(pmodule, process_group=process_group) - - self.module_reference_table[key] = ( - self.module_reference_table[key][0], - pmodule, + # By default, disable broadcast_buffers. This disables batch norm synchronization on forward + # pass + module = DDP( + module, device_ids=[self.local_rank], broadcast_buffers=False, find_unused_parameters=True ) + self.ddp_module_dict[key] = module + + # # Convert batchnorm modules to synced if applicable + # if synced_batchnorm and isinstance(pmodule, torch.nn.Module): + # world_size = dist.get_world_size() + # if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: + # raise ValueError( + # f"Synchronized batch norm group size" + # f" ({synced_batchnorm_groupsize}) must be 0" + # f" or divide total number of GPUs" + # f" ({world_size})." 
+ # ) + # process_group = create_syncbn_process_group(synced_batchnorm_groupsize) + # pmodule = convert_syncbn(pmodule, process_group=process_group) + + # self.module_reference_table[key] = ( + # self.module_reference_table[key][0], + # pmodule, + # ) # single GPU/CPU training else: if t_dataset is not None: @@ -1566,7 +1452,7 @@ def get_DDP_modules(self, call_chain): modules = [] for ind in range(1, len(call_chain)): m_id = call_chain[ind][0].unique_instance_id - module = self.module_reference_table[m_id][1] + module = self.ddp_module_dict[m_id] if isinstance(module, DDP): modules.append(module) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 1161cef57ee2..17dbf890f76c 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -38,6 +38,18 @@ class NeMoCallback(ABC): + def __init__(self): + self._action = None + + @property + def action(self): + """TODO remove""" + return self._action + + @action.setter + def action(self, action_obj): + self._action = action_obj + def on_action_start(self, state): pass @@ -59,6 +71,7 @@ def on_iteration_end(self, state): class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): + super().__init__() # Step_freq: how often logs are printed self.step_freq = step_freq self.tensors_to_log = tensors_to_log @@ -70,6 +83,8 @@ def on_iteration_end(self, state): if state["step"] % self.step_freq == 0: for tensor_key in self.tensors_to_log: tensor = state["tensors"].get_tensor(tensor_key) + if tensor is None: + tensor = state["tensors"].get_and_compute_tensor(tensor_key, self.action) logging.info("%s: %s", tensor_key, tensor) # except KeyError: # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 37dac0e678d8..3d9faf867e08 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -25,6 +25,7 @@ 'DeploymentFormat', ] +import copy import random from abc import ABC, abstractmethod from enum import Enum @@ -41,6 +42,111 @@ logging = nemo.logging +# def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): +def topological_sort_from_leaves(leaf_nmtensors, cached_training_state = None): + from nemo.backends.pytorch.nm import DataLayerNM + def create_node(producer, producer_args): + if producer_args is None: + return tuple((producer, ())) + else: + return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) + + def is_in_degree_zero(node, processed_nodes, cached_training_state): + """A node has in degree of zero""" + if node[1] == (): + return True + for portname, nmtensor in node[1]: + nd = create_node(nmtensor.producer, nmtensor.producer_args) + if nd not in processed_nodes: + if cached_training_state and cached_training_state.check_tensor_cached(nmtensor.unique_name): + continue + return False + return True + + hooks = leaf_nmtensors if isinstance(leaf_nmtensors, list) else [leaf_nmtensors] + + # ensures that no tensors are processed twice + processed_nmtensors = set() + + indices_to_remove = [] + # Check for duplicates in hook + for i, nmtensor in enumerate(hooks): + if nmtensor in processed_nmtensors: + indices_to_remove.append(i) + else: + processed_nmtensors.add(nmtensor) + + for i in reversed(indices_to_remove): + hooks.pop(i) + + _top_sorted_modules = [] + all_nodes = {} + + # extract all nodes to all_nodes set + hooks_lst = list(hooks) + while len(hooks_lst) > 0: + # take nmtensor from the end of the 
list + nmtensor = hooks_lst.pop() + producer_args = nmtensor.producer_args + + node = create_node(nmtensor.producer, producer_args) + # Store nmtensor as an output of its producer + # first make sure all keys are present per output port + # and nm is inside all_nodes + if node not in all_nodes: + all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} + # second, populate output port with current nmtensor + # where applicable + all_nodes[node][nmtensor.name] = nmtensor + processed_nmtensors.add(nmtensor) + + new_tensors = set() + if producer_args is not None and producer_args != {}: + for _, new_nmtensor in producer_args.items(): + if new_nmtensor not in processed_nmtensors: + new_tensors.add(new_nmtensor) + + # TODO + if cached_training_state: + for name, input_nmtensor in producer_args.items(): + if cached_training_state.check_tensor_cached(input_nmtensor.unique_name): + new_tensors.remove(input_nmtensor) + + for new_nmtensor in new_tensors: + # put in the start of list + hooks_lst.insert(0, new_nmtensor) + + all_node_with_output = [] + # Iterate over all_nodes to create new nodes that include its output + # now all nodes have (module, input tensors, output tensors) + for node in all_nodes: + all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) + + processed_nodes = [] + while len(all_node_with_output) > 0: + for node in all_node_with_output.copy(): + # if node's in_degree is zero it can be added to + # _top_sorted_modules + # this will also reduce in_degree of its children + if is_in_degree_zero(node, processed_nodes, cached_training_state): + _top_sorted_modules.append(node) + processed_nodes.append((node[0], node[1])) + all_node_with_output.remove(node) + + # Create top_sorted_modules aka callchain + top_sorted_modules = [] + for i, m in enumerate(_top_sorted_modules): + top_sorted_modules.append((m[0], dict(m[1]), m[2])) + # Ensure that there is only one dataset in callchain + if i > 0 and isinstance(m[0], DataLayerNM): + raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") + + #TODO + if cached_training_state and isinstance(m[0], DataLayerNM): + raise ValueError("Could not compute tensor from current cached training state.") + + return top_sorted_modules + class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" @@ -100,10 +206,28 @@ def clear_dict(self): def set_tensor(self, tensor, value): self.tensor_dict[tensor.unique_name] = value + def check_tensor_cached(self, unique_name): + if self.tensor_dict[unique_name] is None: + return False + return True + def get_tensor(self, name): unique_name = AppState().tensor_names[name] return self.tensor_dict[unique_name] + def get_and_compute_tensor(self, name, action): + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + # print(callchain) + callchain.insert(0, ()) + action.nm_graph_forward_pass(callchain, self.tensor_dict) + # print(self.tensor_dict[unique_name]) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" @@ -279,8 +403,8 @@ def _perform_on_epoch_end(self, callbacks): def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: 
- if isinstance(callback, ActionCallback): - callback.action = self + # if isinstance(callback, ActionCallback): + callback.action = self def _update_callbacks( self, callbacks=None, registered_tensors=None, final_loss=None, diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py index c439d4949c9d..5055319c2cef 100755 --- a/nemo/core/neural_types/nmtensor_registry.py +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -25,7 +25,8 @@ def __init__(self): # Create the nmtensor_naming_dict # which contains a mapping of str to NMTensor.unique_name self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' - self._nmtensor_uniname_set = set(["loss"]) + # self._nmtensor_uniname_set = set(["loss"]) + self._nmtensor_uniname_dict = {"loss": None} # def summary(self): # """ Prints a nice summary. """ @@ -36,7 +37,7 @@ def __init__(self): @property def unique_names(self): - return self._nmtensor_uniname_set + return self._nmtensor_uniname_dict.keys() # def register(self, tensor: NmTensor): def register(self, tensor): @@ -44,11 +45,11 @@ def register(self, tensor): """ # Check if object is already in a set. - if tensor.unique_name in self._nmtensor_uniname_set: + if tensor.unique_name in self._nmtensor_uniname_dict: pass # Finally, add object to the set. - self._nmtensor_uniname_set.add(tensor.unique_name) + self._nmtensor_uniname_dict[tensor.unique_name] = tensor # def rename_NmTensor(self, tensor: NmTensor, new_name: str): def rename_NmTensor(self, tensor, new_name: str): @@ -81,7 +82,7 @@ def __getitem__(self, key): if key in self._nmtensor_naming_dict: key = self._nmtensor_naming_dict[key] - if key in self._nmtensor_uniname_set: + if key in self._nmtensor_uniname_dict: return key raise KeyError("A NmTensor with name `{}` don't exists!".format(key)) diff --git a/nemo/utils/neural_graph/object_registry.py b/nemo/utils/neural_graph/object_registry.py index 8e861e529944..8a6a1207e2ef 100644 --- a/nemo/utils/neural_graph/object_registry.py +++ b/nemo/utils/neural_graph/object_registry.py @@ -24,7 +24,7 @@ class ObjectRegistry(WeakSet): """ def __init__(self, base_type_name): - """ + """ Stores base type name. """ super().__init__() @@ -32,13 +32,13 @@ def __init__(self, base_type_name): def register(self, new_obj, name: str) -> str: """ - Registers a new object using the provided name. + Registers a new object using the provided name. If name is none - generates new unique name. - + Args: new_obj: An object to be registered. name: A "proposition" for the object name. - + Returns: A unique name (proposition or newly generated name). """ @@ -66,7 +66,7 @@ def register(self, new_obj, name: str) -> str: return unique_name def has(self, name: str) -> bool: - """ + """ Check if registry stores object with a given name. Args: @@ -125,7 +125,7 @@ def __eq__(self, other): """ Checks if two registers have the same content. - Args: + Args: other: The second registry object. 
""" if not isinstance(other, WeakSet): From 879fcfc1a1eabf41461b23e1070259a146a68879 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 6 May 2020 17:23:48 -0700 Subject: [PATCH 03/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 10 ++-------- nemo/core/neural_factory.py | 5 +++-- nemo/core/neural_types/neural_type.py | 1 + 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index e737b08997c2..e06511f9d130 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1161,11 +1161,7 @@ def train( for module in AppState().modules: key = module.unique_instance_id num_trainable_weights = module.num_weights - if ( - not isinstance(module, DDP) - and isinstance(module, torch.nn.Module) - and num_trainable_weights > 0 - ): + if not isinstance(module, DDP) and isinstance(module, torch.nn.Module) and num_trainable_weights > 0: # gpf = 1 # if gradient_predivide: # gpf = dist.get_world_size() @@ -1189,9 +1185,7 @@ def train( ) sync_batchnorm_group = torch.distributed.new_group(group_rank_ids) - module = nn.SyncBatchNorm.convert_sync_batchnorm( - module, process_group=sync_batchnorm_group - ) + module = nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group=sync_batchnorm_group) # By default, disable broadcast_buffers. This disables batch norm synchronization on forward # pass diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 3d9faf867e08..13ec0f110808 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -43,8 +43,9 @@ logging = nemo.logging # def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): -def topological_sort_from_leaves(leaf_nmtensors, cached_training_state = None): +def topological_sort_from_leaves(leaf_nmtensors, cached_training_state=None): from nemo.backends.pytorch.nm import DataLayerNM + def create_node(producer, producer_args): if producer_args is None: return tuple((producer, ())) @@ -141,7 +142,7 @@ def is_in_degree_zero(node, processed_nodes, cached_training_state): if i > 0 and isinstance(m[0], DataLayerNM): raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") - #TODO + # TODO if cached_training_state and isinstance(m[0], DataLayerNM): raise ValueError("Could not compute tensor from current cached training state.") diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d78d0dc9923c..d3da8a80fdf5 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -329,6 +329,7 @@ def rename(self, new_name): """ AppState().tensor_names.rename_NmTensor(self, new_name) + class NeuralTypeError(Exception): """Base class for neural type related exceptions.""" From ddbf472c56818537d3d3af9be4ab032372a0296c Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 13:31:55 -0700 Subject: [PATCH 04/40] add a reference to Actions into TrainingState, remove deprecated function Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 2 +- nemo/core/callbacks.py | 14 +---- nemo/core/neural_factory.py | 99 ++++---------------------------- 3 files changed, 12 insertions(+), 103 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index e06511f9d130..90ea4d34a490 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -993,7 +993,7 @@ def train( gradient_predivide=False, amp_max_loss_scale=2.0 ** 
24, ): - self._training_state = TrainingState() + self._training_state = TrainingState(self) # Analyse the arguments passed to train. if tensors_to_optimize is not None and training_graph is not None: raise ValueError("Cannot pass both `tensors_to_optimize` and `training_graph` to the train() function") diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 17dbf890f76c..13a01c38390e 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -38,18 +38,6 @@ class NeMoCallback(ABC): - def __init__(self): - self._action = None - - @property - def action(self): - """TODO remove""" - return self._action - - @action.setter - def action(self, action_obj): - self._action = action_obj - def on_action_start(self, state): pass @@ -84,7 +72,7 @@ def on_iteration_end(self, state): for tensor_key in self.tensors_to_log: tensor = state["tensors"].get_tensor(tensor_key) if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key, self.action) + tensor = state["tensors"].get_and_compute_tensor(tensor_key) logging.info("%s: %s", tensor_key, tensor) # except KeyError: # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 13ec0f110808..fbfc87ec4abf 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -34,9 +34,9 @@ import numpy as np import nemo -from ..utils import ExpManager -from .callbacks import ActionCallback, EvaluatorCallback -from .neural_types import * +from nemo.utils import ExpManager +from nemo.core.callbacks import ActionCallback, EvaluatorCallback +from nemo.core.neural_types import NmTensor from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -193,9 +193,10 @@ class DeviceType(Enum): class TrainingState: - def __init__(self): + def __init__(self, action): tensor_naming_registery = AppState().tensor_names self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action def tensor_list(self): return self.tensor_dict.keys() @@ -216,7 +217,7 @@ def get_tensor(self, name): unique_name = AppState().tensor_names[name] return self.tensor_dict[unique_name] - def get_and_compute_tensor(self, name, action): + def get_and_compute_tensor(self, name): unique_name = AppState().tensor_names[name] tensor_value = self.tensor_dict[unique_name] if tensor_value is None: @@ -224,7 +225,7 @@ def get_and_compute_tensor(self, name, action): callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) # print(callchain) callchain.insert(0, ()) - action.nm_graph_forward_pass(callchain, self.tensor_dict) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) # print(self.tensor_dict[unique_name]) tensor_value = self.tensor_dict[unique_name] return tensor_value @@ -239,7 +240,7 @@ def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxpr self._optim_level = optimization_level self.step = None self.epoch_num = None - self._training_state = TrainingState() + self._training_state = None @property def state(self): @@ -404,8 +405,8 @@ def _perform_on_epoch_end(self, callbacks): def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: - # if isinstance(callback, ActionCallback): - callback.action = self + if isinstance(callback, ActionCallback): + callback.action = self def _update_callbacks( self, callbacks=None, 
registered_tensors=None, final_loss=None, @@ -617,86 +618,6 @@ def __name_import(name): mod = getattr(mod, comp) return mod - @deprecated(version=0.11) - def __get_pytorch_module(self, name, collection, params, pretrained): - # TK: "factory" is not passed as parameter anymore. - # params["factory"] = self - - if collection == "toys" or collection == "tutorials" or collection == "other": - constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.tutorials." + name) - elif collection == "nemo_nlp": - constructor = NeuralModuleFactory.__name_import("nemo_nlp." + name) - if name == "BERT" and pretrained is True: - params["pretrained"] = True - elif collection == "nemo_asr": - constructor = NeuralModuleFactory.__name_import("nemo_asr." + name) - elif collection == "nemo_lpr": - constructor = NeuralModuleFactory.__name_import("nemo_lpr." + name) - elif collection == 'common': - constructor = NeuralModuleFactory.__name_import('nemo.backends.pytorch.common.' + name) - elif collection == "torchvision": - import torchvision.models as tv_models - import nemo.backends.pytorch.module_wrapper as mw - import torch.nn as nn - - if name == "ImageFolderDataLayer": - constructor = NeuralModuleFactory.__name_import("nemo.backends.pytorch.torchvision.data." + name) - instance = constructor(**params) - return instance - else: - _nm_name = name.lower() - if _nm_name == "resnet18": - input_ports = { - "x": NeuralType( - { - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag), - 2: AxisType(HeightTag, 224), - 3: AxisType(WidthTag, 224), - } - ) - } - output_ports = {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} - - pt_model = tv_models.resnet18(pretrained=pretrained) - num_classes = params.get("num_classes", None) - if num_classes is not None: - pt_model.fc = nn.Linear(512, params["num_classes"]) - return mw.TrainableNeuralModuleWrapper( - pt_nn_module=pt_model, input_ports_dict=input_ports, output_ports_dict=output_ports, - ) - elif _nm_name == "resnet50": - input_ports = { - "x": NeuralType( - { - 0: AxisType(BatchTag), - 1: AxisType(ChannelTag), - 2: AxisType(HeightTag, 224), - 3: AxisType(WidthTag, 224), - } - ) - } - output_ports = {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} - - pt_model = tv_models.resnet50(pretrained=pretrained) - num_classes = params.get("num_classes", None) - if num_classes is not None: - pt_model.fc = nn.Linear(2048, params["num_classes"]) - return mw.TrainableNeuralModuleWrapper( - pt_nn_module=pt_model, input_ports_dict=input_ports, output_ports_dict=output_ports, - ) - else: - collection_path = "nemo.collections." + collection + "." + name - constructor = NeuralModuleFactory.__name_import(collection_path) - if name == "BERT" and pretrained is True: - params["pretrained"] = True - - # TK: "placement" is not passed as parameter anymore. 
- # if "placement" not in params: - # params["placement"] = self._placement - instance = constructor(**params) - return instance - @deprecated(version=0.11) def get_module(self, name, collection, params, pretrained=False): """ From 912d83d409001c5c57d95019a3ab563729e1f74b Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 14:13:34 -0700 Subject: [PATCH 05/40] add decorators; add all events Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 6 +++ nemo/backends/pytorch/actions.py | 4 +- nemo/core/callbacks.py | 91 +++++++++++++++++++++++++++++--- nemo/core/neural_factory.py | 56 +++++++++++++++++--- 4 files changed, 139 insertions(+), 18 deletions(-) diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py index f06055baec8a..bd0f2ec99b48 100755 --- a/examples/asr/jasper_an4_debug.py +++ b/examples/asr/jasper_an4_debug.py @@ -107,6 +107,12 @@ def create_dags(model_config_file, vocab, args, nf): # callbacks = [train_callback, checkpointer_callback, eval_callback] callbacks = [train_callback] + @nemo.core.callbacks.on_step_start + def my_own_func(state): + print(state) + + callbacks.append(my_own_func) + # Return entities required by the actual training. return ( loss, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 90ea4d34a490..dec18f0e9ea9 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1258,7 +1258,7 @@ def train( curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks - self._perform_on_iteration_start(callbacks=callbacks) + self._perform_on_step_start(callbacks=callbacks) # set learning rate policy if lr_policy is not None: @@ -1362,7 +1362,7 @@ def train( self._update_callbacks( callbacks=callbacks, registered_tensors=self.training_state.tensor_dict, final_loss=final_loss ) - self._perform_on_iteration_end(callbacks=callbacks) + self._perform_on_step_end(callbacks=callbacks) self.step += 1 self.training_state.clear_dict() # End of epoch for loop diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 13a01c38390e..9ce148507b02 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -38,28 +38,33 @@ class NeMoCallback(ABC): - def on_action_start(self, state): + def on_train_start(self, state): pass - def on_action_end(self, state): + def on_epoch_start(self, state): pass - def on_epoch_start(self, state): + def on_batch_start(self, state): pass - def on_epoch_end(self, state): + def on_step_start(self, state): pass - def on_iteration_start(self, state): + def on_step_end(self, state): pass - def on_iteration_end(self, state): + def on_batch_end(self, state): + pass + + def on_epoch_end(self, state): + pass + + def on_train_end(self, state): pass class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): - super().__init__() # Step_freq: how often logs are printed self.step_freq = step_freq self.tensors_to_log = tensors_to_log @@ -67,7 +72,7 @@ def __init__(self, step_freq=100, tensors_to_log=["loss"]): # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): # #tensors_to_log: List of keys into state that will be logged - def on_iteration_end(self, state): + def on_step_end(self, state): if state["step"] % self.step_freq == 0: for tensor_key in self.tensors_to_log: tensor = state["tensors"].get_tensor(tensor_key) @@ -78,6 +83,76 @@ def on_iteration_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but 
the tensor was not found in the state_dict. " # f"Current state tensors include {state['tensors'].tensor_list()}") +def on_train_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_train_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_epoch_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_epoch_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_batch_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_batch_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_step_start(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_step_start(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_step_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_step_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_batch_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_batch_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_epoch_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_epoch_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) + + +def on_train_end(func): + class NeMoCallbackWrapper(NeMoCallback): + def __init__(self, my_func): + self._func = my_func + def on_train_end(self, state): + self._func(state) + return NeMoCallbackWrapper(func) class ActionCallback(ABC): """Abstract interface for callbacks. 
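
A minimal usage sketch of the decorator helpers defined above, mirroring the jasper_an4_debug.py example from this patch series. The function and class names below are illustrative only and are not part of the diff; the import paths, the decorator, the NeMoCallback base class, and the "step"/"tensors" keys of `state` are the ones introduced in these patches.

    import nemo

    @nemo.core.callbacks.on_step_start
    def print_step(state):
        # `state` is the dict the trainer passes to every callback event;
        # "step" and "tensors" are among its keys in this patch series.
        if state["step"] % 100 == 0:
            print("starting step", state["step"])

    class LossPrinter(nemo.core.callbacks.NeMoCallback):
        # Class-based equivalent: override only the events of interest.
        def on_step_end(self, state):
            # get_tensor returns None if the tensor has not been computed
            # this step (SimpleLossLogger makes the same check).
            loss = state["tensors"].get_tensor("loss")
            if loss is not None:
                print("loss:", loss)

    # Both objects are NeMoCallback instances, so either can be appended to
    # the callbacks list handed to train(), as the example script does.
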
diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index fbfc87ec4abf..3416c5fc8032 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -35,7 +35,7 @@ import nemo from nemo.utils import ExpManager -from nemo.core.callbacks import ActionCallback, EvaluatorCallback +from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -352,55 +352,95 @@ def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): """ pass - def _perform_on_iteration_start(self, callbacks): + def _perform_on_step_start(self, callbacks): # TODO: Most of these checks can be relaxed since we enforce callbacks # to be a list of ActionCallback objects if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_iteration_start() + elif isinstance(callback, NeMoCallback): + callback.on_step_start(self.state) else: - callback.on_iteration_start(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") - def _perform_on_iteration_end(self, callbacks): + def _perform_on_step_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_iteration_end() + elif isinstance(callback, NeMoCallback): + callback.on_step_end(self.state) else: - callback.on_iteration_end(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_action_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_action_start() + elif isinstance(callback, NeMoCallback): + callback.on_train_start(self.state) else: - callback.on_action_start(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_action_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_action_end() + elif isinstance(callback, NeMoCallback): + callback.on_train_end(self.state) else: - callback.on_action_end(self.state) + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_epoch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_epoch_start() - else: + elif isinstance(callback, NeMoCallback): callback.on_epoch_start(self.state) + else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _perform_on_epoch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: for callback in callbacks: if isinstance(callback, ActionCallback): callback.on_epoch_end() + elif isinstance(callback, NeMoCallback): + callback.on_epoch_end(self.state) + else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") + + def _perform_on_batch_start(self, 
callbacks): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): + callback.on_epoch_start(self.state) else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") + + def _perform_on_batch_end(self, callbacks): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): callback.on_epoch_end(self.state) + else: + raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " + "understood") def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: From 2e4eb18cf9542a2d3e9b5f29243381a84da1605d Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 14:15:17 -0700 Subject: [PATCH 06/40] style Signed-off-by: Jason --- nemo/core/callbacks.py | 18 ++++++++++++++++ nemo/core/neural_factory.py | 42 ++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 9ce148507b02..5780818352df 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -83,12 +83,15 @@ def on_step_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " # f"Current state tensors include {state['tensors'].tensor_list()}") + def on_train_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_train_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -96,8 +99,10 @@ def on_epoch_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_epoch_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -105,8 +110,10 @@ def on_batch_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_batch_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -114,8 +121,10 @@ def on_step_start(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_step_start(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -123,8 +132,10 @@ def on_step_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_step_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -132,8 +143,10 @@ def on_batch_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_batch_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -141,8 +154,10 @@ def on_epoch_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_epoch_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) @@ -150,10 +165,13 @@ def on_train_end(func): class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func + def on_train_end(self, state): self._func(state) + return NeMoCallbackWrapper(func) + class ActionCallback(ABC): """Abstract interface for callbacks. 
""" diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 3416c5fc8032..b9d6662c543e 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -34,9 +34,9 @@ import numpy as np import nemo -from nemo.utils import ExpManager from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor +from nemo.utils import ExpManager from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -362,8 +362,9 @@ def _perform_on_step_start(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_step_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_step_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -373,8 +374,9 @@ def _perform_on_step_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_step_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_action_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -384,8 +386,9 @@ def _perform_on_action_start(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_train_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_action_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -395,8 +398,9 @@ def _perform_on_action_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_train_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_epoch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -406,8 +410,9 @@ def _perform_on_epoch_start(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_epoch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -417,8 +422,9 @@ def _perform_on_epoch_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_batch_start(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -428,8 +434,9 @@ def _perform_on_batch_start(self, 
callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_start(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _perform_on_batch_end(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: @@ -439,8 +446,9 @@ def _perform_on_batch_end(self, callbacks): elif isinstance(callback, NeMoCallback): callback.on_epoch_end(self.state) else: - raise ValueError("Callback was not a child of ActionCallback nor NeMoCallback and was not " - "understood") + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + ) def _init_callbacks(self, callbacks): if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: From 35d6b7d4306559cf026ba676422211826cafc858 Mon Sep 17 00:00:00 2001 From: Jason Date: Tue, 12 May 2020 14:15:47 -0700 Subject: [PATCH 07/40] more style Signed-off-by: Jason --- nemo/core/neural_factory.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index b9d6662c543e..79c357582f57 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -363,7 +363,7 @@ def _perform_on_step_start(self, callbacks): callback.on_step_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_step_end(self, callbacks): @@ -375,7 +375,7 @@ def _perform_on_step_end(self, callbacks): callback.on_step_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_action_start(self, callbacks): @@ -387,7 +387,7 @@ def _perform_on_action_start(self, callbacks): callback.on_train_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_action_end(self, callbacks): @@ -399,7 +399,7 @@ def _perform_on_action_end(self, callbacks): callback.on_train_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_epoch_start(self, callbacks): @@ -411,7 +411,7 @@ def _perform_on_epoch_start(self, callbacks): callback.on_epoch_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_epoch_end(self, callbacks): @@ -423,7 +423,7 @@ def _perform_on_epoch_end(self, callbacks): callback.on_epoch_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_batch_start(self, callbacks): @@ -435,7 +435,7 @@ def _perform_on_batch_start(self, callbacks): 
callback.on_epoch_start(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _perform_on_batch_end(self, callbacks): @@ -447,7 +447,7 @@ def _perform_on_batch_end(self, callbacks): callback.on_epoch_end(self.state) else: raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not " "understood" + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" ) def _init_callbacks(self, callbacks): From 4f6e1f71691ec751969dff6ce50b14b2208dee28 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 15 May 2020 17:02:41 -0700 Subject: [PATCH 08/40] initial refactor Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 155 +++++++++++++++ nemo/core/callbacks.py | 101 ++++++---- nemo/core/neural_factory.py | 314 +++++++++++++------------------ 3 files changed, 356 insertions(+), 214 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index dec18f0e9ea9..ee9f9cd6ce13 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -50,6 +50,45 @@ } +class TrainingState: + def __init__(self, action): + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action + + def tensor_list(self): + return self.tensor_dict.keys() + + def clear_dict(self): + for name in self.tensor_dict: + self.tensor_dict[name] = None + + def set_tensor(self, tensor, value): + self.tensor_dict[tensor.unique_name] = value + + def check_tensor_cached(self, unique_name): + if self.tensor_dict[unique_name] is None: + return False + return True + + def get_tensor(self, name): + unique_name = AppState().tensor_names[name] + return self.tensor_dict[unique_name] + + def get_and_compute_tensor(self, name): + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + # print(callchain) + callchain.insert(0, ()) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) + # print(self.tensor_dict[unique_name]) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + + class PtActions(Actions): def __init__( self, local_rank=None, global_rank=None, tb_writer=None, optimization_level=Optimization.mxprO0, @@ -993,6 +1032,122 @@ def train( gradient_predivide=False, amp_max_loss_scale=2.0 ** 24, ): + def _perform_on_step_start(callbacks, state): + # TODO: Most of these checks can be relaxed since we enforce callbacks + # to be a list of ActionCallback objects + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_iteration_start() + elif isinstance(callback, NeMoCallback): + callback.on_step_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_step_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_iteration_end() + elif isinstance(callback, NeMoCallback): + callback.on_step_end(state) + 
else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_action_start(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_action_start() + elif isinstance(callback, NeMoCallback): + callback.on_train_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_action_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_action_end() + elif isinstance(callback, NeMoCallback): + callback.on_train_end(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_epoch_start(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_epoch_start() + elif isinstance(callback, NeMoCallback): + callback.on_epoch_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_epoch_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.on_epoch_end() + elif isinstance(callback, NeMoCallback): + callback.on_epoch_end(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_batch_start(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): + callback.on_epoch_start(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _perform_on_batch_end(callbacks, state): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + continue + elif isinstance(callback, NeMoCallback): + callback.on_epoch_end(state) + else: + raise ValueError( + "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + ) + + def _init_callbacks(callbacks, action): + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback.action = action + + def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): + # if self.local_rank is None or self.local_rank == 0: + if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + for callback in callbacks: + if isinstance(callback, ActionCallback): + callback._registered_tensors = registered_tensors + else: # For now, we can use the old callback function. 
In the future we should improve this + registered_tensors["loss"] = final_loss + + def get_state(self): + return {"step": self.step, "tensors": self._training_state, "epoch_num":self.epoch_num, "optimizer": self.optimizers} + self._training_state = TrainingState(self) # Analyse the arguments passed to train. if tensors_to_optimize is not None and training_graph is not None: diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 5780818352df..ecdb38fa30dc 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -26,6 +26,7 @@ import nemo from nemo.utils import get_checkpoint_from_dir +from nemo.utils.app_state import AppState try: import wandb @@ -63,6 +64,46 @@ def on_train_end(self, state): pass +class TensorboardLogger(NeMoCallback): + def __init__(self, step_freq=100, tensors_to_log=["loss"]): + # Step_freq: how often logs are printed + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): + # #tensors_to_log: List of keys into state that will be logged + + def on_step_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + if tensor is None: + tensor = state["tensors"].get_and_compute_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + # except KeyError: + # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " + # f"Current state tensors include {state['tensors'].tensor_list()}") + +class WandBLogger(NeMoCallback): + def __init__(self, step_freq=100, tensors_to_log=["loss"]): + # Step_freq: how often logs are printed + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): + # #tensors_to_log: List of keys into state that will be logged + + def on_step_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + if tensor is None: + tensor = state["tensors"].get_and_compute_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + # except KeyError: + # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. 
" + # f"Current state tensors include {state['tensors'].tensor_list()}") + class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): # Step_freq: how often logs are printed @@ -374,7 +415,7 @@ def on_iteration_end(self): logging.info(f"Step time: {run_time} seconds") -class CheckpointCallback(ActionCallback): +class CheckpointCallback(NeMoCallback): """ For callback documentation: please see https://nvidia.github.io/NeMo/tutorials/callbacks.html @@ -400,14 +441,14 @@ def __init__( # If True, run will fail if we cannot load module weights self._force_load = force_load - def __save_to(self, path): - if self.global_rank is not None and self.global_rank != 0: + def __save_to(self, path, state): + if state.global_rank is not None and state.global_rank != 0: return if not os.path.isdir(path): logging.info(f"Creating {path} folder") os.makedirs(path, exist_ok=True) unique_mod_names = set() - for module in self.action.modules: + for module in AppState().modules: if module.num_weights > 0: if str(module) in unique_mod_names: raise NotImplementedError( @@ -416,19 +457,19 @@ def __save_to(self, path): ) unique_mod_names.add(str(module)) if self._step_freq > -1: - filename = f"{module}-STEP-{self.step}.pt" + filename = f"{module}-STEP-{state.step}.pt" else: - filename = f"{module}-EPOCH-{self.epoch_num}.pt" + filename = f"{module}-EPOCH-{state.epoch_num}.pt" module.save_to(os.path.join(path, filename)) if self._step_freq > -1: - filename = f"trainer-STEP-{self.step}.pt" - self.action.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{self.step}.pt') + filename = f"trainer-STEP-{state.step}.pt" + state.save_state_to(f'{path}/{filename}') + self._saved_ckpts.append(f'-{state.step}.pt') else: - filename = f"trainer-EPOCH-{self.epoch_num}.pt" - self.action.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{self.epoch_num}.pt') + filename = f"trainer-EPOCH-{state.epoch_num}.pt" + state.save_state_to(f'{path}/{filename}') + self._saved_ckpts.append(f'-{state.epoch_num}.pt') if len(self._saved_ckpts) > self._ckpt2keep: for end in self._saved_ckpts[: -self._ckpt2keep]: @@ -437,7 +478,7 @@ def __save_to(self, path): self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] logging.info(f'Saved checkpoint: {path}/{filename}') - def __restore_from(self, path): + def __restore_from(self, path, state): if not os.path.isdir(path): if self._force_load: raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.") @@ -446,7 +487,7 @@ def __restore_from(self, path): logging.info(f"Found checkpoint folder {path}. 
Will attempt to restore checkpoints from it.") modules_to_restore = [] modules_to_restore_name = [] - for module in self.action.modules: + for module in AppState().modules: if module.num_weights > 0: modules_to_restore.append(module) modules_to_restore_name.append(str(module)) @@ -454,7 +495,7 @@ def __restore_from(self, path): module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) for mod, checkpoint in zip(modules_to_restore, module_checkpoints): - mod.restore_from(checkpoint, self.local_rank) + mod.restore_from(checkpoint, state.local_rank) except (BaseException, ValueError) as e: if self._force_load: raise ValueError( @@ -469,8 +510,8 @@ def __restore_from(self, path): try: trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) - for tr, checkpoint in zip([self.action], trainer_checkpoints): - tr.restore_state_from(checkpoint) + state.restore_state_from(checkpoint) + # for tr, checkpoint in zip([self.action], trainer_checkpoints): except (BaseException, ValueError) as e: logging.warning(e) logging.warning( @@ -479,10 +520,10 @@ def __restore_from(self, path): ) return - def on_action_start(self): + def on_train_start(self, state): num_parameters = 0 unique_mod_names = set() - for module in self.action.modules: + for module in AppState().modules: if module.num_weights > 0: if str(module) in unique_mod_names: raise NotImplementedError( @@ -491,29 +532,25 @@ def on_action_start(self): ) unique_mod_names.add(str(module)) num_parameters += module.num_weights - logging.info(f"Found {len(unique_mod_names)} modules with " f"weights:") + logging.info(f"Found {len(unique_mod_names)} modules with weights:") for name in unique_mod_names: logging.info(f"{name}") logging.info(f"Total model parameters: {num_parameters}") self.__restore_from(path=self._load_from_folder) - def on_iteration_end(self): - step = self.step + def on_step_end(self, state): + step = state["step"] if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: self.__save_to(path=self._folder) - def on_action_end(self): + def on_train_end(self, state): if self._step_freq > 0 or self._epoch_freq > 0: self.__save_to(path=self._folder) - def on_epoch_start(self): - self._last_epoch_start = time.time() - - def on_epoch_end(self): - if self._epoch_freq > 0: - if self.global_rank is None or self.global_rank == 0: - if (self.epoch_num + 1) % self._epoch_freq == 0: - self.__save_to(path=self._folder) + def on_epoch_end(self, state): + epoch = state["epoch"] + if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: + self.__save_to(path=self._folder) class EvaluatorCallback(ActionCallback): @@ -712,7 +749,7 @@ def on_iteration_start(self): m.unfreeze() -class WandbCallback(ActionCallback): +class OldWandbCallback(ActionCallback): """ Log metrics to [Weights & Biases](https://docs.wandb.com/) """ diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 79c357582f57..f80df8c39b23 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -192,45 +192,6 @@ class DeviceType(Enum): AllGpu = 3 -class TrainingState: - def __init__(self, action): - tensor_naming_registery = AppState().tensor_names - self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) - self._action = action - - def tensor_list(self): - return self.tensor_dict.keys() - - def clear_dict(self): - for name in self.tensor_dict: - self.tensor_dict[name] = None - - def set_tensor(self, tensor, value): - self.tensor_dict[tensor.unique_name] = value - - def 
check_tensor_cached(self, unique_name): - if self.tensor_dict[unique_name] is None: - return False - return True - - def get_tensor(self, name): - unique_name = AppState().tensor_names[name] - return self.tensor_dict[unique_name] - - def get_and_compute_tensor(self, name): - unique_name = AppState().tensor_names[name] - tensor_value = self.tensor_dict[unique_name] - if tensor_value is None: - nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] - callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) - # print(callchain) - callchain.insert(0, ()) - self._action.nm_graph_forward_pass(callchain, self.tensor_dict) - # print(self.tensor_dict[unique_name]) - tensor_value = self.tensor_dict[unique_name] - return tensor_value - - class Actions(ABC): """Basic actions allowed on graphs of Neural Modules""" @@ -238,17 +199,6 @@ def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxpr self._local_rank = local_rank self._global_rank = global_rank self._optim_level = optimization_level - self.step = None - self.epoch_num = None - self._training_state = None - - @property - def state(self): - return {"step": self.step, "tensors": self.training_state} - - @property - def training_state(self): - return self._training_state @property def local_rank(self): @@ -312,29 +262,29 @@ def infer(self, tensors: List[NmTensor]): """ pass - @abstractmethod - def save_state_to(self, path: str): - """ - Saves current state such as step, epoch and optimizer parameters - Args: - path: + # @abstractmethod + # def save_state_to(self, path: str): + # """ + # Saves current state such as step, epoch and optimizer parameters + # Args: + # path: - Returns: + # Returns: - """ - pass + # """ + # pass - @abstractmethod - def restore_state_from(self, path: str): - """ - Restores state such as step, epoch and optimizer parameters - Args: - path: + # @abstractmethod + # def restore_state_from(self, path: str): + # """ + # Restores state such as step, epoch and optimizer parameters + # Args: + # path: - Returns: + # Returns: - """ - pass + # """ + # pass @abstractmethod def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): @@ -352,120 +302,120 @@ def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): """ pass - def _perform_on_step_start(self, callbacks): - # TODO: Most of these checks can be relaxed since we enforce callbacks - # to be a list of ActionCallback objects - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_iteration_start() - elif isinstance(callback, NeMoCallback): - callback.on_step_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_step_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_iteration_end() - elif isinstance(callback, NeMoCallback): - callback.on_step_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_action_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_action_start() - elif 
isinstance(callback, NeMoCallback): - callback.on_train_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_action_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_action_end() - elif isinstance(callback, NeMoCallback): - callback.on_train_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_epoch_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_epoch_start() - elif isinstance(callback, NeMoCallback): - callback.on_epoch_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_epoch_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.on_epoch_end() - elif isinstance(callback, NeMoCallback): - callback.on_epoch_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_batch_start(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - continue - elif isinstance(callback, NeMoCallback): - callback.on_epoch_start(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _perform_on_batch_end(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - continue - elif isinstance(callback, NeMoCallback): - callback.on_epoch_end(self.state) - else: - raise ValueError( - "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - ) - - def _init_callbacks(self, callbacks): - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback.action = self - - def _update_callbacks( - self, callbacks=None, registered_tensors=None, final_loss=None, - ): - # if self.local_rank is None or self.local_rank == 0: - if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - for callback in callbacks: - if isinstance(callback, ActionCallback): - callback._registered_tensors = registered_tensors - else: # For now, we can use the old callback function. 
In the future we should improve this - self.training_state.tensor_dict["loss"] = final_loss + # def _perform_on_step_start(self, callbacks): + # # TODO: Most of these checks can be relaxed since we enforce callbacks + # # to be a list of ActionCallback objects + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_iteration_start() + # elif isinstance(callback, NeMoCallback): + # callback.on_step_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_step_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_iteration_end() + # elif isinstance(callback, NeMoCallback): + # callback.on_step_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_action_start(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_action_start() + # elif isinstance(callback, NeMoCallback): + # callback.on_train_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_action_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_action_end() + # elif isinstance(callback, NeMoCallback): + # callback.on_train_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_epoch_start(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_epoch_start() + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_epoch_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.on_epoch_end() + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_batch_start(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # continue + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_start(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _perform_on_batch_end(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if 
isinstance(callback, ActionCallback): + # continue + # elif isinstance(callback, NeMoCallback): + # callback.on_epoch_end(self.state) + # else: + # raise ValueError( + # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" + # ) + + # def _init_callbacks(self, callbacks): + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback.action = self + + # def _update_callbacks( + # self, callbacks=None, registered_tensors=None, final_loss=None, + # ): + # # if self.local_rank is None or self.local_rank == 0: + # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: + # for callback in callbacks: + # if isinstance(callback, ActionCallback): + # callback._registered_tensors = registered_tensors + # else: # For now, we can use the old callback function. In the future we should improve this + # self.training_state.tensor_dict["loss"] = final_loss def _str_to_opt_level(opt_str: str) -> Optimization: From 3c7b89e216f137a0812c4df5f03d539f8960632b Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 15 May 2020 17:18:25 -0700 Subject: [PATCH 09/40] adding checkpoint callback Signed-off-by: Jason --- examples/asr/jasper_an4_debug.py | 7 ++-- nemo/backends/pytorch/actions.py | 72 +++++++++++++++++++++++--------- nemo/core/callbacks.py | 30 ++++++------- 3 files changed, 72 insertions(+), 37 deletions(-) diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py index bd0f2ec99b48..761c674ad3be 100755 --- a/examples/asr/jasper_an4_debug.py +++ b/examples/asr/jasper_an4_debug.py @@ -93,7 +93,7 @@ def create_dags(model_config_file, vocab, args, nf): predictions.rename("test") train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["loss", "test"]) - # checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) + checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) # eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e] # eval_callback = nemo.core.EvaluatorCallback( @@ -105,11 +105,12 @@ def create_dags(model_config_file, vocab, args, nf): # eval_at_start=not args.do_not_eval_at_start, # ) # callbacks = [train_callback, checkpointer_callback, eval_callback] - callbacks = [train_callback] + callbacks = [train_callback, checkpointer_callback] @nemo.core.callbacks.on_step_start def my_own_func(state): - print(state) + if state["step"] % 100 == 0: + print(state) callbacks.append(my_own_func) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index ee9f9cd6ce13..f3b249417350 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,7 +21,7 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves +from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves from nemo.core.neural_types import * from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -1145,8 +1145,40 @@ def _update_callbacks(callbacks=None, registered_tensors=None, 
final_loss=None): else: # For now, we can use the old callback function. In the future we should improve this registered_tensors["loss"] = final_loss - def get_state(self): - return {"step": self.step, "tensors": self._training_state, "epoch_num":self.epoch_num, "optimizer": self.optimizers} + def get_state(action): + class StateWrapper(dict): + def restore_state_from(self, path): + if os.path.isfile(path): + # map_location could be cuda: but cpu seems to be more + # general since we are also saving step and epoch_num + # load_state_dict should move the variables to the relevant device + checkpoint = torch.load(path, map_location="cpu") + self.step = checkpoint["step"] + self.epoch_num = checkpoint["epoch_num"] + if checkpoint["optimizer_state"]: + for opt, opt_chkpt in zip(self["optimizers"], checkpoint["optimizer_state"]): + opt.load_state_dict(opt_chkpt) + else: + raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) + + def save_state_to(self, path): + state = { + "step": self["step"], + "epoch_num": self["epoch"], + "optimizer_state": [opt.state_dict() for opt in self["optimizers"]], + } + torch.save(state, path) + + return StateWrapper( + { + "step": action.step, + "tensors": action._training_state, + "epoch": action.epoch_num, + "local_rank": action.local_rank, + "global_rank": action.global_rank, + "optimizers": action.optimizers, + } + ) self._training_state = TrainingState(self) # Analyse the arguments passed to train. @@ -1181,9 +1213,9 @@ def get_state(self): if tensors_to_optimize is None: # This is Evaluation Mode - self._init_callbacks(callbacks) + _init_callbacks(callbacks, self) # Do action start callbacks - self._perform_on_action_end(callbacks=callbacks) + _perform_on_action_end(callbacks, get_state(self)) return # Check if tensors_to_optimize is just a list of NmTensors elif tensors_to_optimize is not None and ( @@ -1385,9 +1417,9 @@ def get_state(self): train_dataloader = dataNM.data_iterator train_sampler = None - self._init_callbacks(callbacks) + _init_callbacks(callbacks, self) # Do action start callbacks - self._perform_on_action_start(callbacks=callbacks) + _perform_on_action_start(callbacks, get_state(self)) nan_or_inf = False @@ -1400,7 +1432,7 @@ def get_state(self): break # Register epochs start with callbacks - self._perform_on_epoch_start(callbacks=callbacks) + _perform_on_epoch_start(callbacks, get_state(self)) # iteration over batches in epoch batch_counter = 0 @@ -1413,7 +1445,7 @@ def get_state(self): curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks - self._perform_on_step_start(callbacks=callbacks) + _perform_on_step_start(callbacks, get_state(self)) # set learning rate policy if lr_policy is not None: @@ -1445,18 +1477,18 @@ def get_state(self): for t, d in zip(curr_call_chain[0][2].values(), tensors): if t is not None: - self.training_state.set_tensor(t, d) + self._training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) self.__nm_graph_forward_pass( - call_chain=curr_call_chain, registered_tensors=self.training_state.tensor_dict, + call_chain=curr_call_chain, registered_tensors=self._training_state.tensor_dict, ) curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: if ( - torch.isnan(self.training_state.tensor_dict[tensor.unique_name]).any() - or torch.isinf(self.training_state.tensor_dict[tensor.unique_name]).any() + 
torch.isnan(self._training_state.tensor_dict[tensor.unique_name]).any() + or torch.isinf(self._training_state.tensor_dict[tensor.unique_name]).any() ): if ( (stop_on_nan_loss) @@ -1472,7 +1504,7 @@ def get_state(self): ) else: logging.warning('Loss is NaN or inf, continuing training') - final_loss += self.training_state.tensor_dict[tensor.unique_name] + final_loss += self._training_state.tensor_dict[tensor.unique_name] if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: @@ -1514,21 +1546,21 @@ def get_state(self): curr_optimizer.step() batch_counter = 0 # Register iteration end with callbacks - self._update_callbacks( - callbacks=callbacks, registered_tensors=self.training_state.tensor_dict, final_loss=final_loss + _update_callbacks( + callbacks, registered_tensors=self._training_state.tensor_dict, final_loss=final_loss ) - self._perform_on_step_end(callbacks=callbacks) + _perform_on_step_end(callbacks, get_state(self)) self.step += 1 - self.training_state.clear_dict() + self._training_state.clear_dict() # End of epoch for loop # Register epochs end with callbacks - self._perform_on_epoch_end(callbacks=callbacks) + _perform_on_epoch_end(callbacks, get_state(self)) self.epoch_num += 1 # Check again if we should stop on NaN/inf self._check_nan_or_inf(placement_gpu, nan_or_inf) - self._perform_on_action_end(callbacks=callbacks) + _perform_on_action_end(callbacks, get_state(self)) def infer( self, diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index ecdb38fa30dc..27ba28cbd0c7 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -84,6 +84,7 @@ def on_step_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " # f"Current state tensors include {state['tensors'].tensor_list()}") + class WandBLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): # Step_freq: how often logs are printed @@ -104,6 +105,7 @@ def on_step_end(self, state): # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. 
" # f"Current state tensors include {state['tensors'].tensor_list()}") + class SimpleLossLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"]): # Step_freq: how often logs are printed @@ -442,7 +444,7 @@ def __init__( self._force_load = force_load def __save_to(self, path, state): - if state.global_rank is not None and state.global_rank != 0: + if state["global_rank"] is not None and state["global_rank"] != 0: return if not os.path.isdir(path): logging.info(f"Creating {path} folder") @@ -457,19 +459,19 @@ def __save_to(self, path, state): ) unique_mod_names.add(str(module)) if self._step_freq > -1: - filename = f"{module}-STEP-{state.step}.pt" + filename = f"{module}-STEP-{state['step']}.pt" else: - filename = f"{module}-EPOCH-{state.epoch_num}.pt" + filename = f"{module}-EPOCH-{state['epoch']}.pt" module.save_to(os.path.join(path, filename)) if self._step_freq > -1: - filename = f"trainer-STEP-{state.step}.pt" - state.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{state.step}.pt') + filename = f"trainer-STEP-{state['step']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['step']}.pt") else: - filename = f"trainer-EPOCH-{state.epoch_num}.pt" - state.save_state_to(f'{path}/{filename}') - self._saved_ckpts.append(f'-{state.epoch_num}.pt') + filename = f"trainer-EPOCH-{state['epoch']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['epoch']}.pt") if len(self._saved_ckpts) > self._ckpt2keep: for end in self._saved_ckpts[: -self._ckpt2keep]: @@ -495,7 +497,7 @@ def __restore_from(self, path, state): module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) for mod, checkpoint in zip(modules_to_restore, module_checkpoints): - mod.restore_from(checkpoint, state.local_rank) + mod.restore_from(checkpoint, state["local_rank"]) except (BaseException, ValueError) as e: if self._force_load: raise ValueError( @@ -536,21 +538,21 @@ def on_train_start(self, state): for name in unique_mod_names: logging.info(f"{name}") logging.info(f"Total model parameters: {num_parameters}") - self.__restore_from(path=self._load_from_folder) + self.__restore_from(self._load_from_folder, state) def on_step_end(self, state): step = state["step"] if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: - self.__save_to(path=self._folder) + self.__save_to(self._folder, state) def on_train_end(self, state): if self._step_freq > 0 or self._epoch_freq > 0: - self.__save_to(path=self._folder) + self.__save_to(self._folder, state) def on_epoch_end(self, state): epoch = state["epoch"] if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: - self.__save_to(path=self._folder) + self.__save_to(self._folder, state) class EvaluatorCallback(ActionCallback): From cf41850017e07b8b7f5ff3ce074bc8d4f444a478 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 18 May 2020 14:39:03 -0700 Subject: [PATCH 10/40] enable fetching via NmTensor and string; add WandBCallback, TensorboardLoggerCallback Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 12 ++-- nemo/core/callbacks.py | 94 +++++++++++++++++++++++--------- 2 files changed, 75 insertions(+), 31 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index f3b249417350..7fce14e678d6 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -72,11 +72,10 @@ def check_tensor_cached(self, unique_name): return True def get_tensor(self, name): - unique_name = 
AppState().tensor_names[name] - return self.tensor_dict[unique_name] - - def get_and_compute_tensor(self, name): - unique_name = AppState().tensor_names[name] + if isinstance(name, NmTensor): + unique_name = name.unique_name + else: + unique_name = AppState().tensor_names[name] tensor_value = self.tensor_dict[unique_name] if tensor_value is None: nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] @@ -87,6 +86,9 @@ def get_and_compute_tensor(self, name): # print(self.tensor_dict[unique_name]) tensor_value = self.tensor_dict[unique_name] return tensor_value + # unique_name = AppState().tensor_names[name] + # return self.tensor_dict[unique_name] + # def get_and_compute_tensor(self, name): class PtActions(Actions): diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 27ba28cbd0c7..86b99647e2dd 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -65,45 +65,87 @@ def on_train_end(self, state): class TensorboardLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"]): + def __init__(self, step_freq=100, tensors_to_log=["loss"], tb_writer=None, custom_tb_log_func=None): # Step_freq: how often logs are printed self.step_freq = step_freq self.tensors_to_log = tensors_to_log + if tb_writer is None: + logging.error("There was no tb writer") + # Should grab this from default tb writer + else: + self.tb_writer = tb_writer + self.custom_tb_log_func = custom_tb_log_func + self._last_epoch_start = None # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): # #tensors_to_log: List of keys into state that will be logged - def on_step_end(self, state): - if state["step"] % self.step_freq == 0: - for tensor_key in self.tensors_to_log: - tensor = state["tensors"].get_tensor(tensor_key) - if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key) - logging.info("%s: %s", tensor_key, tensor) - # except KeyError: - # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " - # f"Current state tensors include {state['tensors'].tensor_list()}") + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + # always log epoch num and epoch_time + epoch_time = time.time() - self._last_epoch_start + self.tb_writer.add_scalar('misc/epoch', state["epoch"], state["step"]) + self.tb_writer.add_scalar('misc/epoch_time', epoch_time, state["step"]) + def on_step_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self.step_freq == 0: + tb_log_func = lambda x: self.tb_writer.add_scalar(x, state["tensors"].get_tensor(x), state["step"]) + if self.custom_tb_log_func is not None: + tb_log_func = self.custom_tb_log_func + for tensor_key in self.tensors_to_log: + tb_log_func(tensor_key) class WandBLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"]): - # Step_freq: how often logs are printed - self.step_freq = step_freq - self.tensors_to_log = tensors_to_log + def __init__(self, step_freq=100, tensors_to_log=["loss"], wandb_name=None, wandb_project=None, args=None): + if not _WANDB_AVAILABLE: + logging.error("Could not import wandb. 
Did you install it (pip install --upgrade wandb)?") + self._step_freq = step_freq + self._tensors_to_log = tensors_to_log + self._name = wandb_name + self._project = wandb_project + self._args = args + self._last_epoch_start = None - # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): - # #tensors_to_log: List of keys into state that will be logged + def on_train_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._name, project=self._project) + if self._args is not None: + wandb.config.update(self._args) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._step_freq = -1 def on_step_end(self, state): - if state["step"] % self.step_freq == 0: - for tensor_key in self.tensors_to_log: - tensor = state["tensors"].get_tensor(tensor_key) - if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key) - logging.info("%s: %s", tensor_key, tensor) - # except KeyError: - # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " - # f"Current state tensors include {state['tensors'].tensor_list()}") + # log training metrics + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self._step_freq == 0 and self._step_freq > 0: + tensors_logged = {t: state["tensors"].get_tensor(t).cpu() for t in self._tensors_to_log} + # Always log learning rate + tensors_logged['LR'] = state["learning_rate"] + self._wandb_log(tensors_logged) + + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + # always log epoch num and epoch_time + epoch_time = time.time() - self._last_epoch_start + self._wandb_log({"epoch": state["epoch"], "epoch_time": epoch_time}) + + def _wandb_log(self, tensors_logged): + if _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=state["step"]) class SimpleLossLogger(NeMoCallback): From b1df99d7edd9007904e3a5d5117317ebeb64c4f6 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 18 May 2020 14:39:31 -0700 Subject: [PATCH 11/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 1 + nemo/core/callbacks.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7fce14e678d6..cae2f6740da9 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -86,6 +86,7 @@ def get_tensor(self, name): # print(self.tensor_dict[unique_name]) tensor_value = self.tensor_dict[unique_name] return tensor_value + # unique_name = AppState().tensor_names[name] # return self.tensor_dict[unique_name] # def get_and_compute_tensor(self, name): diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 86b99647e2dd..7b26531ef30a 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -100,6 +100,7 @@ def on_step_end(self, state): for tensor_key in self.tensors_to_log: tb_log_func(tensor_key) + class WandBLogger(NeMoCallback): def __init__(self, step_freq=100, tensors_to_log=["loss"], wandb_name=None, wandb_project=None, args=None): if not _WANDB_AVAILABLE: From fa6553f722c9cfe2e5169945be03789116372cfa Mon Sep 17 
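A minimal usage sketch for the TensorboardLogger and WandBLogger callbacks added in this patch (hedged: the constructor arguments are taken from the diffs above; `nf`, `args`, and the run/project names are placeholders, and exposing these classes under `nemo.core` is assumed by analogy with how SimpleLossLogger is used elsewhere in this series). Note that `state["tensors"].get_tensor()` now accepts either a registered string name or an NmTensor, per the actions.py hunk in this patch.

    # Assumes nf = nemo.core.NeuralModuleFactory(..., create_tb_writer=True) and a parsed
    # argparse namespace `args`, as in the AN4 example scripts in this series.
    tb_callback = nemo.core.TensorboardLogger(
        step_freq=100, tensors_to_log=["loss"], tb_writer=nf.tb_writer,
    )
    wandb_callback = nemo.core.WandBLogger(
        step_freq=100, tensors_to_log=["loss"], wandb_name="an4-run", wandb_project="nemo-debug", args=args,
    )
    callbacks = [tb_callback, wandb_callback]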
00:00:00 2001 From: Jason Date: Mon, 18 May 2020 16:14:16 -0700 Subject: [PATCH 12/40] DDP bug fix Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 47240c22f47a..c671db5644d0 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1334,6 +1334,7 @@ def save_state_to(self, path): for module in AppState().modules: key = module.unique_instance_id num_trainable_weights = module.num_weights + self.ddp_module_dict[key] = module if not isinstance(module, DDP) and isinstance(module, torch.nn.Module) and num_trainable_weights > 0: # gpf = 1 # if gradient_predivide: From ba84c807693e7d6686ebca6561da5a72ed0ad6e7 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 13:43:06 -0700 Subject: [PATCH 13/40] clean up of checkpoint Signed-off-by: Jason --- examples/asr/jasper_an4.py | 5 - nemo/backends/pytorch/actions.py | 189 ++++++++++++++++++------------- nemo/core/callbacks.py | 6 +- setup.py | 2 +- 4 files changed, 114 insertions(+), 88 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 9ac79f3d1935..93bd887ee663 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -233,13 +233,8 @@ def main(): folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, ) - # Distributed Data Parallel changes the underlying class so we need - # to reinstantiate Encoder and Decoder args.num_epochs += 10 previous_step_count = total_steps - loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf) - - nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index c671db5644d0..f5def9142109 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -131,8 +131,8 @@ def __init__( local_rank=local_rank, global_rank=global_rank, optimization_level=optimization_level, ) - self.step = 0 - self.epoch_num = 0 + self._step = 0 + self._epoch = 0 self.optimizers = [] self.tb_writer = tb_writer self.cache = None @@ -140,6 +140,27 @@ def __init__( self.ddp_initialized = False self.ddp_module_dict = {} + @property + def step(self): + return self._step + + @step.setter + def step(self, step): + self._step = step + + @property + def epoch(self): + return self._epoch + + @epoch.setter + def epoch(self, epoch): + self._epoch = epoch + + @property + @deprecated + def epoch_num(self): + return self._epoch + def __get_top_sorted_modules_and_dataloader(self, hook): """ TODO """ @@ -311,28 +332,29 @@ def __setup_optimizer( def __initialize_amp( self, optimizer, optim_level, amp_max_loss_scale=2.0 ** 24, amp_min_loss_scale=1.0, ): - if optim_level not in AmpOptimizations: - raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") - # in this case, nothing to do here - if optim_level == Optimization.mxprO0: - return optimizer - - if len(AppState().modules) < 1: - raise ValueError("There were no modules to initialize") - pt_modules = [] - for module in AppState().modules: - if isinstance(module, nn.Module): - pt_modules.append(module) - elif isinstance(module, TrainableNeuralModuleWrapper): - pt_modules.append(module._pt_module) - - _, optimizer = amp.initialize( - max_loss_scale=amp_max_loss_scale, - min_loss_scale=amp_min_loss_scale, - models=pt_modules, - optimizers=optimizer, - 
opt_level=AmpOptimizations[optim_level], - ) + if not self.amp_initialized: + if optim_level not in AmpOptimizations: + raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") + # in this case, nothing to do here + if optim_level == Optimization.mxprO0: + return optimizer + + if len(AppState().modules) < 1: + raise ValueError("There were no modules to initialize") + pt_modules = [] + for module in AppState().modules: + if isinstance(module, nn.Module): + pt_modules.append(module) + elif isinstance(module, TrainableNeuralModuleWrapper): + pt_modules.append(module._pt_module) + + _, optimizer = amp.initialize( + max_loss_scale=amp_max_loss_scale, + min_loss_scale=amp_min_loss_scale, + models=pt_modules, + optimizers=optimizer, + opt_level=AmpOptimizations[optim_level], + ) self.amp_initialized = True return optimizer @@ -816,43 +838,43 @@ def clear_cache(self): """ self.cache = None - def save_state_to(self, path: str): - """ - Saves current state such as step, epoch and optimizer parameters - Args: - path: - - Returns: - - """ - state = { - "step": self.step, - "epoch_num": self.epoch_num, - "optimizer_state": [opt.state_dict() for opt in self.optimizers], - } - torch.save(state, path) - - def restore_state_from(self, path: str): - """ - Restores state such as step, epoch and optimizer parameters - Args: - path: - - Returns: - - """ - if os.path.isfile(path): - # map_location could be cuda: but cpu seems to be more - # general since we are also saving step and epoch_num - # load_state_dict should move the variables to the relevant device - checkpoint = torch.load(path, map_location="cpu") - self.step = checkpoint["step"] - self.epoch_num = checkpoint["epoch_num"] - if checkpoint["optimizer_state"]: - for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): - opt.load_state_dict(opt_chkpt) - else: - raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) + # def save_state_to(self, path: str): + # """ + # Saves current state such as step, epoch and optimizer parameters + # Args: + # path: + + # Returns: + + # """ + # state = { + # "step": self.step, + # "epoch": self.epoch, + # "optimizer_state": [opt.state_dict() for opt in self.optimizers], + # } + # torch.save(state, path) + + # def restore_state_from(self, path: str): + # """ + # Restores state such as step, epoch and optimizer parameters + # Args: + # path: + + # Returns: + + # """ + # if os.path.isfile(path): + # # map_location could be cuda: but cpu seems to be more + # # general since we are also saving step and epoch + # # load_state_dict should move the variables to the relevant device + # checkpoint = torch.load(path, map_location="cpu") + # self.step = checkpoint["step"] + # self.epoch = checkpoint["epoch"] + # if checkpoint["optimizer_state"]: + # for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): + # opt.load_state_dict(opt_chkpt) + # else: + # raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) @staticmethod def _check_all_tensors(list_of_tensors): @@ -1133,14 +1155,32 @@ def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): def get_state(action): class StateWrapper(dict): + def __init__(self, action): + self.action = action + super().__init__( + { + "step": action.step, + "tensors": action._training_state, + "epoch": action.epoch, + "local_rank": action.local_rank, + "global_rank": action.global_rank, + "optimizers": action.optimizers, + }) def restore_state_from(self, path): if 
os.path.isfile(path): # map_location could be cuda: but cpu seems to be more - # general since we are also saving step and epoch_num + # general since we are also saving step and epoch # load_state_dict should move the variables to the relevant device checkpoint = torch.load(path, map_location="cpu") - self.step = checkpoint["step"] - self.epoch_num = checkpoint["epoch_num"] + action.step = checkpoint["step"] + self["step"] = action.step + epoch = checkpoint.get("epoch", None) + if epoch is None: + epoch = checkpoint.get("epoch_num", None) + if epoch is None: + raise ValueError("Epoch was not found in the trainer checkpoint") + action.epoch = epoch + self["epoch"] = action.epoch if checkpoint["optimizer_state"]: for opt, opt_chkpt in zip(self["optimizers"], checkpoint["optimizer_state"]): opt.load_state_dict(opt_chkpt) @@ -1150,21 +1190,12 @@ def restore_state_from(self, path): def save_state_to(self, path): state = { "step": self["step"], - "epoch_num": self["epoch"], + "epoch": self["epoch"], "optimizer_state": [opt.state_dict() for opt in self["optimizers"]], } torch.save(state, path) - return StateWrapper( - { - "step": action.step, - "tensors": action._training_state, - "epoch": action.epoch_num, - "local_rank": action.local_rank, - "global_rank": action.global_rank, - "optimizers": action.optimizers, - } - ) + return StateWrapper(action) self._training_state = TrainingState(self) # Analyse the arguments passed to train. @@ -1410,9 +1441,9 @@ def save_state_to(self, path): # MAIN TRAINING LOOP # iteration over epochs - while num_epochs is None or self.epoch_num < num_epochs: + while num_epochs is None or self.epoch < num_epochs: if train_sampler is not None: - train_sampler.set_epoch(self.epoch_num) + train_sampler.set_epoch(self.epoch) if max_steps is not None and self.step >= max_steps: break @@ -1434,7 +1465,7 @@ def save_state_to(self, path): # set learning rate policy if lr_policy is not None: - adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch_num) + adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch) for param_group in curr_optimizer.param_groups: param_group["lr"] = adjusted_lr if self.tb_writer is not None: @@ -1533,7 +1564,7 @@ def save_state_to(self, path): # End of epoch for loop # Register epochs end with callbacks _perform_on_epoch_end(callbacks, get_state(self)) - self.epoch_num += 1 + self.epoch += 1 _perform_on_action_end(callbacks, get_state(self)) def infer( diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 7b26531ef30a..dc9491f9a4b7 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -541,7 +541,7 @@ def __restore_from(self, path, state): for mod, checkpoint in zip(modules_to_restore, module_checkpoints): mod.restore_from(checkpoint, state["local_rank"]) - except (BaseException, ValueError) as e: + except (ValueError) as e: if self._force_load: raise ValueError( "force_load was set to True for checkpoint callback but a checkpoint was not found." @@ -555,9 +555,9 @@ def __restore_from(self, path, state): try: trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) - state.restore_state_from(checkpoint) + state.restore_state_from(trainer_checkpoints[0]) # for tr, checkpoint in zip([self.action], trainer_checkpoints): - except (BaseException, ValueError) as e: + except (ValueError) as e: logging.warning(e) logging.warning( "Trainer state such as optimizer state and current step/epoch was not restored. 
Pretrained weights" diff --git a/setup.py b/setup.py index a7c93c9d8c54..fae6a943613d 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def is_build_action(): if len(sys.argv) <= 1: return False - BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style"] + BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style", "clean"] if any([sys.argv[1].startswith(x) for x in BUILD_TOKENS]): return True From fc3ce629aa2cb5244c1461512589a98789cac53a Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:26:53 -0700 Subject: [PATCH 14/40] update an4 Signed-off-by: Jason --- examples/asr/jasper_an4.py | 109 +++++++++++++++++-------------- nemo/backends/pytorch/actions.py | 45 +++++++------ 2 files changed, 82 insertions(+), 72 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 93bd887ee663..419777e6b84e 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -7,6 +7,7 @@ from ruamel.yaml import YAML import nemo +from nemo.core import NeuralGraph import nemo.collections.asr as nemo_asr import nemo.utils.argparse as nm_argparse from nemo.collections.asr.helpers import ( @@ -21,62 +22,62 @@ logging = nemo.logging - def create_dags(model_config_file, vocab, args, nf): - # Create a data_layer for training. - data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( - model_config_file, - "AudioToTextDataLayer_train", - overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, - ) + with NeuralGraph() as g0: + # Create a data_layer for training. + data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( + model_config_file, + "AudioToTextDataLayer_train", + overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, + ) - num_samples = len(data_layer) - steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) - total_steps = steps_per_epoch * args.num_epochs - logging.info("Train samples=", num_samples, "num_steps=", total_steps) + num_samples = len(data_layer) + steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) + total_steps = steps_per_epoch * args.num_epochs + logging.info("Train samples=", num_samples, "num_steps=", total_steps) - # Create a data_layer for evaluation. - data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( - model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, - ) + # Create a data_layer for evaluation. + data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( + model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, + ) - num_samples = len(data_layer_eval) - logging.info(f"Eval samples={num_samples}") + num_samples = len(data_layer_eval) + logging.info(f"Eval samples={num_samples}") - # Instantiate data processor. - data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( - model_config_file, "AudioToMelSpectrogramPreprocessor" - ) + # Instantiate data processor. + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( + model_config_file, "AudioToMelSpectrogramPreprocessor" + ) - # Instantiate JASPER encoder-decoder modules. 
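# --- Illustrative sketch (not part of this patch): the construction pattern this hunk
# introduces. Only NeuralGraph and the with-block usage come from the diff; the module
# names, the helper function, and the simplified wiring (preprocessing step omitted)
# are placeholders.
from nemo.core import NeuralGraph

def build_training_graph(data_layer, encoder, decoder, ctc_loss):
    # Building the DAG inside a NeuralGraph context gives the caller a handle (g0) to the
    # whole graph, which a later patch in this series deletes and rebuilds after DDP/AMP
    # have wrapped the underlying modules.
    with NeuralGraph() as g0:
        audio, audio_len, transcript, transcript_len = data_layer()
        encoded, encoded_len = encoder(audio_signal=audio, length=audio_len)
        log_probs = decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )
    return loss, g0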
- jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") - jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( - model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} - ) + # Instantiate JASPER encoder-decoder modules. + jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") + jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( + model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} + ) - # Instantiate losses. - ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) - greedy_decoder = nemo_asr.GreedyCTCDecoder() - - # Create a training graph. - audio, audio_len, transcript, transcript_len = data_layer() - processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) - encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) - log_probs = jasper_decoder(encoder_output=encoded) - predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) - - # Create an evaluation graph. - audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() - processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) - encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) - log_probs_e = jasper_decoder(encoder_output=encoded_e) - predictions_e = greedy_decoder(log_probs=log_probs_e) - loss_e = ctc_loss( - log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, - ) - logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) + # Instantiate losses. + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) + greedy_decoder = nemo_asr.GreedyCTCDecoder() + + # Create a training graph. + audio, audio_len, transcript, transcript_len = data_layer() + processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) + encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) + log_probs = jasper_decoder(encoder_output=encoded) + predictions = greedy_decoder(log_probs=log_probs) + loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + + # Create an evaluation graph. + audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() + processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) + encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) + log_probs_e = jasper_decoder(encoder_output=encoded_e) + predictions_e = greedy_decoder(log_probs=log_probs_e) + loss_e = ctc_loss( + log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + ) + logging.error("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) # Callbacks to print info to console and Tensorboard. train_callback = nemo.core.SimpleLossLoggerCallback( @@ -107,6 +108,7 @@ def create_dags(model_config_file, vocab, args, nf): total_steps, log_probs_e, encoded_len_e, + g0 ) @@ -167,7 +169,7 @@ def main(): # Get vocabulary. 
vocab = jasper_params['labels'] - (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags( + (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e, g0) = create_dags( args.model_config, vocab, args, nf ) @@ -235,6 +237,15 @@ def main(): args.num_epochs += 10 previous_step_count = total_steps + + # Distributed Data Parallel and amp changes the underlying class so we need to reinstantiate modules + # Clear the module registery + nemo.utils.app_state.AppState().modules.clear() + # Delete old graph and make a new one + del g0 + loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf) + + nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index f5def9142109..84fd37dd72b5 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -332,29 +332,28 @@ def __setup_optimizer( def __initialize_amp( self, optimizer, optim_level, amp_max_loss_scale=2.0 ** 24, amp_min_loss_scale=1.0, ): - if not self.amp_initialized: - if optim_level not in AmpOptimizations: - raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") - # in this case, nothing to do here - if optim_level == Optimization.mxprO0: - return optimizer - - if len(AppState().modules) < 1: - raise ValueError("There were no modules to initialize") - pt_modules = [] - for module in AppState().modules: - if isinstance(module, nn.Module): - pt_modules.append(module) - elif isinstance(module, TrainableNeuralModuleWrapper): - pt_modules.append(module._pt_module) - - _, optimizer = amp.initialize( - max_loss_scale=amp_max_loss_scale, - min_loss_scale=amp_min_loss_scale, - models=pt_modules, - optimizers=optimizer, - opt_level=AmpOptimizations[optim_level], - ) + if optim_level not in AmpOptimizations: + raise ValueError(f"__initialize_amp() was called with unknown optim_level={optim_level}") + # in this case, nothing to do here + if optim_level == Optimization.mxprO0: + return optimizer + + if len(AppState().modules) < 1: + raise ValueError("There were no modules to initialize") + pt_modules = [] + for module in AppState().modules: + if isinstance(module, nn.Module): + pt_modules.append(module) + elif isinstance(module, TrainableNeuralModuleWrapper): + pt_modules.append(module._pt_module) + + _, optimizer = amp.initialize( + max_loss_scale=amp_max_loss_scale, + min_loss_scale=amp_min_loss_scale, + models=pt_modules, + optimizers=optimizer, + opt_level=AmpOptimizations[optim_level], + ) self.amp_initialized = True return optimizer From e5b82585ac4021285fe734df6777fda9cd6ffea7 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:31:41 -0700 Subject: [PATCH 15/40] style Signed-off-by: Jason --- examples/asr/jasper_an4.py | 16 +++++----------- nemo/backends/pytorch/actions.py | 4 +++- nemo/core/callbacks.py | 2 +- nemo/core/neural_factory.py | 5 ++--- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 045a8725f757..1100234bf816 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -7,7 +7,6 @@ from ruamel.yaml import YAML import nemo -from nemo.core import NeuralGraph import nemo.collections.asr as nemo_asr import nemo.utils.argparse as nm_argparse from nemo.collections.asr.helpers import ( @@ -18,6 +17,7 @@ process_evaluation_epoch, word_error_rate, ) +from nemo.core import NeuralGraph 
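# --- Illustrative sketch (not part of this patch) of the rebuild-before-second-training
# pattern added to jasper_an4.py above: DDP and AMP replace the underlying module classes,
# so the module registry and the old graph handle are dropped and the DAG is recreated
# before calling train() again. `nf`, `args`, `vocab`, and create_dags() are assumed to be
# the ones from that script; the train() kwargs follow the standard NeuralModuleFactory
# interface, and the exact ordering of reset_trainer() relative to create_dags() shifts
# within this series.
import nemo

nemo.utils.app_state.AppState().modules.clear()   # clear the module registry
del g0                                            # drop the old NeuralGraph handle
nf.reset_trainer()
loss, eval_tensors, callbacks, total_steps, _, _, g0 = create_dags(args.model_config, vocab, args, nf)
nf.train(
    tensors_to_optimize=[loss],
    callbacks=callbacks,
    optimizer=args.optimizer,
    optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
)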
from nemo.utils import logging from nemo.utils.lr_policies import CosineAnnealing @@ -66,7 +66,9 @@ def create_dags(model_config_file, vocab, args, nf): encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,) + loss = ctc_loss( + log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, + ) # Create an evaluation graph. audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() @@ -101,15 +103,7 @@ def create_dags(model_config_file, vocab, args, nf): callbacks = [train_callback, checkpointer_callback, eval_callback] # Return entities required by the actual training. - return ( - loss, - eval_tensors, - callbacks, - total_steps, - log_probs_e, - encoded_len_e, - g0 - ) + return (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e, g0) def main(): diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 84fd37dd72b5..db28ec73d0d5 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1164,7 +1164,9 @@ def __init__(self, action): "local_rank": action.local_rank, "global_rank": action.global_rank, "optimizers": action.optimizers, - }) + } + ) + def restore_state_from(self, path): if os.path.isfile(path): # map_location could be cuda: but cpu seems to be more diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 96f3c80ce17e..a9a4970d0254 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -25,8 +25,8 @@ from collections import namedtuple import nemo -from nemo.utils.app_state import AppState from nemo.utils import get_checkpoint_from_dir, logging +from nemo.utils.app_state import AppState try: import wandb diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index e8d379faf292..583563819041 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -34,13 +34,11 @@ import numpy as np import nemo - from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor -from nemo.utils import ExpManager +from nemo.utils import ExpManager, logging from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated -from nemo.utils import logging # def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): @@ -149,6 +147,7 @@ def is_in_degree_zero(node, processed_nodes, cached_training_state): return top_sorted_modules + class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" From 53610035d98eb184761a1b47160d0948d5e2449e Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:39:29 -0700 Subject: [PATCH 16/40] undo comenting Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 74 ++++++++++++++++---------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index db28ec73d0d5..2b79ab26a11b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -837,43 +837,43 @@ def clear_cache(self): """ self.cache = None - # def save_state_to(self, path: str): - # """ - # Saves current state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # state = { 
- # "step": self.step, - # "epoch": self.epoch, - # "optimizer_state": [opt.state_dict() for opt in self.optimizers], - # } - # torch.save(state, path) - - # def restore_state_from(self, path: str): - # """ - # Restores state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # if os.path.isfile(path): - # # map_location could be cuda: but cpu seems to be more - # # general since we are also saving step and epoch - # # load_state_dict should move the variables to the relevant device - # checkpoint = torch.load(path, map_location="cpu") - # self.step = checkpoint["step"] - # self.epoch = checkpoint["epoch"] - # if checkpoint["optimizer_state"]: - # for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): - # opt.load_state_dict(opt_chkpt) - # else: - # raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) + def save_state_to(self, path: str): + """ + Saves current state such as step, epoch and optimizer parameters + Args: + path: + + Returns: + + """ + state = { + "step": self.step, + "epoch": self.epoch, + "optimizer_state": [opt.state_dict() for opt in self.optimizers], + } + torch.save(state, path) + + def restore_state_from(self, path: str): + """ + Restores state such as step, epoch and optimizer parameters + Args: + path: + + Returns: + + """ + if os.path.isfile(path): + # map_location could be cuda: but cpu seems to be more + # general since we are also saving step and epoch + # load_state_dict should move the variables to the relevant device + checkpoint = torch.load(path, map_location="cpu") + self.step = checkpoint["step"] + self.epoch = checkpoint["epoch"] + if checkpoint["optimizer_state"]: + for opt, opt_chkpt in zip(self.optimizers, checkpoint["optimizer_state"]): + opt.load_state_dict(opt_chkpt) + else: + raise FileNotFoundError("Could not find checkpoint file: {0}".format(path)) @staticmethod def _check_all_tensors(list_of_tensors): From 9fc00d77e1e46fe0893ff3b899b75e0d3575952b Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 20 May 2020 15:45:04 -0700 Subject: [PATCH 17/40] unpate Signed-off-by: Jason --- tests/unit/core/test_actions_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/core/test_actions_api.py b/tests/unit/core/test_actions_api.py index ef631027de0f..f4a8ad555ac2 100644 --- a/tests/unit/core/test_actions_api.py +++ b/tests/unit/core/test_actions_api.py @@ -33,10 +33,10 @@ def test_checkpointing(self): optimizer = PtActions() optimizer.save_state_to(path) optimizer.step = 123 - optimizer.epoch_num = 324 + optimizer.epoch = 324 optimizer.restore_state_from(path) self.assertEqual(optimizer.step, 0) - self.assertEqual(optimizer.epoch_num, 0) + self.assertEqual(optimizer.epoch, 0) self.assertEqual(len(optimizer.optimizers), 0) os.remove(path) @@ -53,7 +53,7 @@ def test_multi_optimizer(self): self.assertEqual(len(optimizer.optimizers), 5) optimizer.save_state_to(path) optimizer.step = 123 - optimizer.epoch_num = 324 + optimizer.epoch = 324 for i, opt in enumerate(optimizer.optimizers): for param_group in opt.param_groups: self.assertEqual(param_group['lr'], float(i + 1)) @@ -63,6 +63,6 @@ def test_multi_optimizer(self): for param_group in opt.param_groups: self.assertEqual(param_group['lr'], float(i + 1)) self.assertEqual(optimizer.step, 0) - self.assertEqual(optimizer.epoch_num, 0) + self.assertEqual(optimizer.epoch, 0) self.assertEqual(len(optimizer.optimizers), 5) os.remove(path) From d806e7e3cdf95184c5ba081567fb32e9b6bdb7e9 Mon Sep 17 
00:00:00 2001 From: Jason Date: Wed, 20 May 2020 17:09:57 -0700 Subject: [PATCH 18/40] wip Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 12 ++++++++++++ nemo/constants.py | 1 + nemo/utils/formatters/base.py | 3 +++ nemo/utils/nemo_logging.py | 24 ++++++++++++++++-------- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 2b79ab26a11b..1d122b09b878 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1435,6 +1435,7 @@ def save_state_to(self, path): else: train_dataloader = dataNM.data_iterator train_sampler = None + logging.info("very start") _init_callbacks(callbacks, self) # Do action start callbacks @@ -1450,6 +1451,7 @@ def save_state_to(self, path): # Register epochs start with callbacks _perform_on_epoch_start(callbacks, get_state(self)) + logging.info("I'm here") # iteration over batches in epoch batch_counter = 0 @@ -1457,12 +1459,14 @@ def save_state_to(self, path): if max_steps is not None and self.step >= max_steps: break + logging.info("I'm there") if batch_counter == 0: # Started step, zero gradients curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks _perform_on_step_start(callbacks, get_state(self)) + logging.info("I'm everywhere") # set learning rate policy if lr_policy is not None: @@ -1475,6 +1479,7 @@ def save_state_to(self, path): if callbacks is not None: for callback in callbacks: callback.learning_rate = curr_optimizer.param_groups[0]['lr'] + logging.info("I'm everywhere2") # registered_tensors will contain created tensors # named by output port and uuid of module which created them @@ -1486,24 +1491,29 @@ def save_state_to(self, path): tensors = [] if isinstance(data, torch.Tensor): data = (data,) + logging.info(dl_device) for d in data: if isinstance(d, torch.Tensor): tensors.append(d.to(dl_device)) else: tensors.append(d) + logging.info("I'm everywhere3") for t, d in zip(curr_call_chain[0][2].values(), tensors): if t is not None: self._training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) + logging.info("before forward") self.__nm_graph_forward_pass( call_chain=curr_call_chain, registered_tensors=self._training_state.tensor_dict, ) + logging.info("after forward") curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: final_loss += self._training_state.tensor_dict[tensor.unique_name] + logging.info("Or there") # Check for NaN/inf loss (across workers if applicable) loss_nan_inf_checker = final_loss.clone() @@ -1519,6 +1529,7 @@ def save_state_to(self, path): logging.warning('Loss is NaN or inf. Skipping update.') continue + logging.info("Am I Here?") if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: if disable_allreduce: @@ -1549,6 +1560,7 @@ def save_state_to(self, path): batch_counter += 1 + raise ValueError if batch_counter == batches_per_step: # Ended step. 
Do optimizer update if grad_norm_clip is not None: diff --git a/nemo/constants.py b/nemo/constants.py index 6cd3a1f60ff8..9d6793d7630a 100644 --- a/nemo/constants.py +++ b/nemo/constants.py @@ -47,4 +47,5 @@ # NEMO_ENV_VARNAME_DEBUG_VERBOSITY = "NEMO_DEBUG_VERBOSITY" NEMO_ENV_VARNAME_ENABLE_COLORING = "NEMO_ENABLE_COLORING" NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR = "NEMO_REDIRECT_LOGS_TO_STDERR" +NEMO_ENV_VARNAME_TESTING = "NEMO_TESTING" # NEMO_ENV_VARNAME_SAVE_LOGS_TO_DIR = "NEMO_SAVE_LOGS_TO_DIR" diff --git a/nemo/utils/formatters/base.py b/nemo/utils/formatters/base.py index 6b844877b185..e507aaedecf5 100644 --- a/nemo/utils/formatters/base.py +++ b/nemo/utils/formatters/base.py @@ -126,3 +126,6 @@ def format(self, record): class BaseNeMoFormatter(BaseFormatter): DEFAULT_FORMAT = "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]%(end_color)s %(message)s" + +class DebugNeMoFormatter(BaseFormatter): + DEFAULT_FORMAT = "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d %(rank)d]%(end_color)s %(message)s" diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 1551acf84839..8a2bd06040d6 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -20,9 +20,9 @@ from contextlib import contextmanager # from nemo.constants import NEMO_ENV_VARNAME_SAVE_LOGS_TO_DIR -from nemo.constants import NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR +from nemo.constants import NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR, NEMO_ENV_VARNAME_TESTING from nemo.utils.env_var_parsing import get_envbool, get_envint -from nemo.utils.formatters.base import BaseNeMoFormatter +from nemo.utils.formatters.base import BaseNeMoFormatter, DebugNeMoFormatter from nemo.utils.metaclasses import Singleton __all__ = ["Logger", "LogMode"] @@ -88,7 +88,17 @@ def _define_logger(self): self._logger = _logging.getLogger("nemo_logger") # By default, silence all loggers except the logger for rank 0 self.remove_stream_handlers() - if get_envint("RANK", 0) == 0: + if get_envbool(NEMO_ENV_VARNAME_TESTING, False): + old_factory = _logging.getLogRecordFactory() + + def record_factory(*args, **kwargs): + record = old_factory(*args, **kwargs) + record.rank = get_envint("RANK", 0) + return record + + _logging.setLogRecordFactory(record_factory) + self.add_stream_handlers(formatter=DebugNeMoFormatter) + elif get_envint("RANK", 0) == 0: self.add_stream_handlers() finally: @@ -112,7 +122,7 @@ def remove_stream_handlers(self): except KeyError: pass - def add_stream_handlers(self): + def add_stream_handlers(self, formatter=BaseNeMoFormatter): if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -127,8 +137,6 @@ def add_stream_handlers(self): self._handlers["stream_stderr"] = _logging.StreamHandler(sys.stderr) self._handlers["stream_stderr"].addFilter(lambda record: record.levelno > _logging.INFO) - formatter = BaseNeMoFormatter - self._handlers["stream_stdout"].setFormatter(formatter()) self._logger.addHandler(self._handlers["stream_stdout"]) @@ -138,9 +146,9 @@ def add_stream_handlers(self): except KeyError: pass - def reset_stream_handler(self): + def reset_stream_handler(self, formatter=BaseNeMoFormatter): self.remove_stream_handlers() - self.add_stream_handlers() + self.add_stream_handlers(formatter=formatter) def add_file_handler(self, log_file): if self._logger is None: From f1c8aa827b5e06925faf28ecc61c2070e3b52909 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 10:37:01 -0700 Subject: [PATCH 19/40] more logging 
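The NEMO_TESTING switch introduced in the preceding patch turns on the per-rank DebugNeMoFormatter instead of silencing the stream handlers on non-zero ranks. A hedged sketch of flipping it on (the variable has to be set before the NeMo logger singleton is first created, so in practice before importing nemo):

    import os
    os.environ["NEMO_TESTING"] = "1"   # per constants.py: NEMO_ENV_VARNAME_TESTING

    import nemo
    # With RANK exported by the launcher, every rank now logs with a rank field in the prefix.
    nemo.logging.info("visible on all ranks")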
Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 1d122b09b878..a98a7cbe5114 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1491,8 +1491,11 @@ def save_state_to(self, path): tensors = [] if isinstance(data, torch.Tensor): data = (data,) + dl_device = f"cuda:{self._local_rank}" logging.info(dl_device) for d in data: + logging.info(d) + d.to(dl_device) if isinstance(d, torch.Tensor): tensors.append(d.to(dl_device)) else: From 7608d45e151c461734438616785b2b623133566b Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 11:38:42 -0700 Subject: [PATCH 20/40] remove debugging statements Signed-off-by: Jason --- examples/asr/jasper_an4.py | 3 ++- nemo/backends/pytorch/actions.py | 20 ++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 1100234bf816..7baa44e34f74 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -237,9 +237,10 @@ def main(): nemo.utils.app_state.AppState().modules.clear() # Delete old graph and make a new one del g0 + nf.reset_trainer() + # [print(p) for p in nemo.utils.app_state.AppState().modules] loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf) - nf.reset_trainer() nf.train( tensors_to_optimize=[loss], callbacks=callbacks, diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index a98a7cbe5114..a1742ab1e81c 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1363,7 +1363,10 @@ def save_state_to(self, path): # pmodule = self.module_reference_table[key][1] # num_trainable_weights = self.module_reference_table[key][1].num_weights self.ddp_initialized = True - for module in AppState().modules: + module_list = [mod.name for mod in AppState().modules] + module_list = sorted(module_list) + for module_name in module_list: + module = AppState().modules[module_name] key = module.unique_instance_id num_trainable_weights = module.num_weights self.ddp_module_dict[key] = module @@ -1435,7 +1438,6 @@ def save_state_to(self, path): else: train_dataloader = dataNM.data_iterator train_sampler = None - logging.info("very start") _init_callbacks(callbacks, self) # Do action start callbacks @@ -1451,7 +1453,6 @@ def save_state_to(self, path): # Register epochs start with callbacks _perform_on_epoch_start(callbacks, get_state(self)) - logging.info("I'm here") # iteration over batches in epoch batch_counter = 0 @@ -1459,14 +1460,12 @@ def save_state_to(self, path): if max_steps is not None and self.step >= max_steps: break - logging.info("I'm there") if batch_counter == 0: # Started step, zero gradients curr_optimizer = training_loop[self.step % len(training_loop)][0] curr_optimizer.zero_grad() # Register iteration start with callbacks _perform_on_step_start(callbacks, get_state(self)) - logging.info("I'm everywhere") # set learning rate policy if lr_policy is not None: @@ -1479,7 +1478,6 @@ def save_state_to(self, path): if callbacks is not None: for callback in callbacks: callback.learning_rate = curr_optimizer.param_groups[0]['lr'] - logging.info("I'm everywhere2") # registered_tensors will contain created tensors # named by output port and uuid of module which created them @@ -1491,32 +1489,24 @@ def save_state_to(self, path): tensors = [] if isinstance(data, torch.Tensor): data 
= (data,) - dl_device = f"cuda:{self._local_rank}" - logging.info(dl_device) for d in data: - logging.info(d) - d.to(dl_device) if isinstance(d, torch.Tensor): tensors.append(d.to(dl_device)) else: tensors.append(d) - logging.info("I'm everywhere3") for t, d in zip(curr_call_chain[0][2].values(), tensors): if t is not None: self._training_state.set_tensor(t, d) disable_allreduce = batch_counter < (batches_per_step - 1) - logging.info("before forward") self.__nm_graph_forward_pass( call_chain=curr_call_chain, registered_tensors=self._training_state.tensor_dict, ) - logging.info("after forward") curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: final_loss += self._training_state.tensor_dict[tensor.unique_name] - logging.info("Or there") # Check for NaN/inf loss (across workers if applicable) loss_nan_inf_checker = final_loss.clone() @@ -1532,7 +1522,6 @@ def save_state_to(self, path): logging.warning('Loss is NaN or inf. Skipping update.') continue - logging.info("Am I Here?") if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: if disable_allreduce: @@ -1563,7 +1552,6 @@ def save_state_to(self, path): batch_counter += 1 - raise ValueError if batch_counter == batches_per_step: # Ended step. Do optimizer update if grad_norm_clip is not None: From 5fe64fba8cf31f99924e529c917645f8dc63128e Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 11:53:01 -0700 Subject: [PATCH 21/40] update new warning format with rank Signed-off-by: Jason --- tests/unit/utils/test_deprecated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/utils/test_deprecated.py b/tests/unit/utils/test_deprecated.py index 2ae3e5cb156f..06703a7aaa52 100644 --- a/tests/unit/utils/test_deprecated.py +++ b/tests/unit/utils/test_deprecated.py @@ -30,7 +30,7 @@ class DeprecatedTest(TestCase): NEMO_ERR_MSG_FORMAT = re.compile( - r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]*\] " + r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]+( [0-9]+)?\] " ) @pytest.mark.unit From 01dd179cb9c48769ddc301f794a2ffbf6b268a05 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 13:28:18 -0700 Subject: [PATCH 22/40] add explicit rank marker Signed-off-by: Jason --- nemo/utils/formatters/base.py | 2 +- tests/unit/utils/test_deprecated.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/utils/formatters/base.py b/nemo/utils/formatters/base.py index 61481154dfef..12500477b9c8 100644 --- a/nemo/utils/formatters/base.py +++ b/nemo/utils/formatters/base.py @@ -130,5 +130,5 @@ class BaseNeMoFormatter(BaseFormatter): class DebugNeMoFormatter(BaseFormatter): DEFAULT_FORMAT = ( - "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d %(rank)d]%(end_color)s %(message)s" + "%(color)s[NeMo %(levelname)1.1s %(asctime)s %(module)s:%(lineno)d rank:%(rank)d]%(end_color)s %(message)s" ) diff --git a/tests/unit/utils/test_deprecated.py b/tests/unit/utils/test_deprecated.py index 06703a7aaa52..4f1c9490e60f 100644 --- a/tests/unit/utils/test_deprecated.py +++ b/tests/unit/utils/test_deprecated.py @@ -30,7 +30,7 @@ class DeprecatedTest(TestCase): NEMO_ERR_MSG_FORMAT = re.compile( - r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]+( [0-9]+)?\] " + r"\[NeMo W [0-9]{4}-[0-9]{2}-[0-9]{2} 
[0-9]{2}:[0-9]{2}:[0-9]{2} deprecated:[0-9]+( rank:[0-9]+)?\] " ) @pytest.mark.unit From c6ece47ca632badfc26f9b8e3f394c7f5a1afe86 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 15:09:09 -0700 Subject: [PATCH 23/40] docstrings and more Signed-off-by: Jason --- examples/asr/jasper_an4.py | 2 +- examples/asr/jasper_an4_debug.py | 307 -------------------- nemo/backends/pytorch/actions.py | 77 ++--- nemo/core/neural_factory.py | 88 +++++- nemo/core/neural_types/neural_type.py | 8 +- nemo/core/neural_types/nmtensor_registry.py | 34 +-- nemo/utils/app_state.py | 4 +- 7 files changed, 135 insertions(+), 385 deletions(-) delete mode 100755 examples/asr/jasper_an4_debug.py diff --git a/examples/asr/jasper_an4.py b/examples/asr/jasper_an4.py index 7baa44e34f74..40172008c9da 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -79,7 +79,7 @@ def create_dags(model_config_file, vocab, args, nf): loss_e = ctc_loss( log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, ) - logging.error("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) + logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) # Callbacks to print info to console and Tensorboard. train_callback = nemo.core.SimpleLossLoggerCallback( diff --git a/examples/asr/jasper_an4_debug.py b/examples/asr/jasper_an4_debug.py deleted file mode 100755 index 761c674ad3be..000000000000 --- a/examples/asr/jasper_an4_debug.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -import argparse -import math -import os -from functools import partial - -from ruamel.yaml import YAML - -import nemo -import nemo.collections.asr as nemo_asr -import nemo.utils.argparse as nm_argparse -from nemo.collections.asr.helpers import ( - monitor_asr_train_progress, - post_process_predictions, - post_process_transcripts, - process_evaluation_batch, - process_evaluation_epoch, - word_error_rate, -) -from nemo.utils.lr_policies import CosineAnnealing - -logging = nemo.logging - - -def create_dags(model_config_file, vocab, args, nf): - - # Create a data_layer for training. - data_layer = nemo_asr.AudioToTextDataLayer.import_from_config( - model_config_file, - "AudioToTextDataLayer_train", - overwrite_params={"manifest_filepath": args.train_dataset, "batch_size": args.batch_size}, - ) - - num_samples = len(data_layer) - steps_per_epoch = math.ceil(num_samples / (data_layer.batch_size * args.iter_per_step * nf.world_size)) - total_steps = steps_per_epoch * args.num_epochs - logging.info("Train samples=", num_samples, "num_steps=", total_steps) - - # # Create a data_layer for evaluation. - # data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config( - # model_config_file, "AudioToTextDataLayer_eval", overwrite_params={"manifest_filepath": args.eval_datasets}, - # ) - - # num_samples = len(data_layer_eval) - # logging.info(f"Eval samples={num_samples}") - - # Instantiate data processor. - data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config( - model_config_file, "AudioToMelSpectrogramPreprocessor" - ) - - # Instantiate JASPER encoder-decoder modules. - jasper_encoder = nemo_asr.JasperEncoder.import_from_config(model_config_file, "JasperEncoder") - jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config( - model_config_file, "JasperDecoderForCTC", overwrite_params={"num_classes": len(vocab)} - ) - - # Instantiate losses. 
- ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) - greedy_decoder = nemo_asr.GreedyCTCDecoder() - - # Create a training graph. - audio, audio_len, transcript, transcript_len = data_layer() - processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len) - encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len) - log_probs = jasper_decoder(encoder_output=encoded) - predictions = greedy_decoder(log_probs=log_probs) - loss = ctc_loss(log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len) - - # # Create an evaluation graph. - # audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval() - # processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e) - # encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e) - # log_probs_e = jasper_decoder(encoder_output=encoded_e) - # predictions_e = greedy_decoder(log_probs=log_probs_e) - # loss_e = ctc_loss( - # log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, - # ) - logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights)) - - # Callbacks to print info to console and Tensorboard. - # train_callback = nemo.core.SimpleLossLoggerCallback( - # tensors=[loss, predictions, transcript, transcript_len], - # print_func=partial(monitor_asr_train_progress, labels=vocab), - # get_tb_values=lambda x: [["loss", x[0]]], - # tb_writer=nf.tb_writer, - # ) - - # loss.rename("test") - # train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["test"]) - - # train_callback = nemo.core.SimpleLossLogger() - predictions.rename("test") - train_callback = nemo.core.SimpleLossLogger(tensors_to_log=["loss", "test"]) - - checkpointer_callback = nemo.core.CheckpointCallback(folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq) - - # eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e] - # eval_callback = nemo.core.EvaluatorCallback( - # eval_tensors=eval_tensors, - # user_iter_callback=partial(process_evaluation_batch, labels=vocab), - # user_epochs_done_callback=process_evaluation_epoch, - # eval_step=args.eval_freq, - # tb_writer=nf.tb_writer, - # eval_at_start=not args.do_not_eval_at_start, - # ) - # callbacks = [train_callback, checkpointer_callback, eval_callback] - callbacks = [train_callback, checkpointer_callback] - - @nemo.core.callbacks.on_step_start - def my_own_func(state): - if state["step"] % 100 == 0: - print(state) - - callbacks.append(my_own_func) - - # Return entities required by the actual training. 
- return ( - loss, - # eval_tensors, - callbacks, - total_steps, - # log_probs_e, - # encoded_len_e, - ) - - -def main(): - parser = argparse.ArgumentParser( - parents=[nm_argparse.NemoArgParser()], description='AN4 ASR', conflict_handler='resolve', - ) - - # Overwrite default args - parser.add_argument("--train_dataset", type=str, help="training dataset path") - parser.add_argument("--eval_datasets", type=str, help="validation dataset path") - - # Create new args - # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str) - parser.add_argument("--batch_size", default=48, type=int, help="size of the training batch") - parser.add_argument("--lm", default=None, type=str) - parser.add_argument("--test_after_training", action='store_true') - parser.add_argument("--momentum", type=float) - parser.add_argument("--beta1", default=0.95, type=float) - parser.add_argument("--beta2", default=0.25, type=float) - parser.add_argument("--do_not_eval_at_start", action='store_true') - parser.set_defaults( - model_config="./configs/jasper_an4.yaml", - train_dataset="~/TestData/an4_dataset/an4_train.json", - eval_datasets="~/TestData/an4_dataset/an4_val.json", - work_dir="./tmp", - optimizer="novograd", - num_epochs=50, - lr=0.02, - weight_decay=0.005, - checkpoint_save_freq=1000, - eval_freq=100, - amp_opt_level="O1", - ) - - args = parser.parse_args() - betas = (args.beta1, args.beta2) - - wer_thr = 0.20 - beam_wer_thr = 0.15 - - nf = nemo.core.NeuralModuleFactory( - local_rank=args.local_rank, - files_to_copy=[__file__], - optimization_level=args.amp_opt_level, - random_seed=0, - log_dir=args.work_dir, - create_tb_writer=True, - cudnn_benchmark=args.cudnn_benchmark, - ) - tb_writer = nf.tb_writer - checkpoint_dir = nf.checkpoint_dir - - # Load model definition - yaml = YAML(typ="safe") - with open(args.model_config) as f: - jasper_params = yaml.load(f) - # Get vocabulary. 
- vocab = jasper_params['labels'] - - # (loss, eval_tensors, callbacks, total_steps, log_probs_e, encoded_len_e,) = create_dags( - # args.model_config, vocab, args, nf - # ) - - loss, callbacks, total_steps = create_dags(args.model_config, vocab, args, nf) - - nf.train( - tensors_to_optimize=[loss], - callbacks=callbacks, - optimizer=args.optimizer, - lr_policy=CosineAnnealing(total_steps=total_steps, min_lr=args.lr / 100), - optimization_params={ - "num_epochs": args.num_epochs, - "max_steps": args.max_steps, - "lr": args.lr, - "momentum": args.momentum, - "betas": betas, - "weight_decay": args.weight_decay, - "grad_norm_clip": None, - }, - batches_per_step=args.iter_per_step, - amp_max_loss_scale=256.0, - # synced_batchnorm=(nf.global_rank is not None), - ) - - # if args.test_after_training: - # logging.info("Testing greedy and beam search with LM WER.") - # # Create BeamSearch NM - # if nf.world_size > 1 or args.lm is None: - # logging.warning("Skipping beam search WER as it does not work if doing distributed training.") - # else: - # beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( - # vocab=vocab, beam_width=64, alpha=2.0, beta=1.5, lm_path=args.lm, num_cpus=max(os.cpu_count(), 1), - # ) - # beam_predictions = beam_search_with_lm(log_probs=log_probs_e, log_probs_length=encoded_len_e) - # eval_tensors.append(beam_predictions) - - # evaluated_tensors = nf.infer(eval_tensors) - # if nf.global_rank in [0, None]: - # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) - # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) - # wer = word_error_rate(hypotheses=greedy_hypotheses, references=references) - # logging.info("Greedy WER: {:.2f}%".format(wer * 100)) - # if wer > wer_thr: - # nf.sync_all_processes(False) - # raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :" f"than {wer_thr * 100:.2f}%") - # nf.sync_all_processes() - - # if nf.world_size == 1 and args.lm is not None: - # beam_hypotheses = [] - # # Over mini-batch - # for i in evaluated_tensors[-1]: - # # Over samples - # for j in i: - # beam_hypotheses.append(j[0][1]) - - # beam_wer = word_error_rate(hypotheses=beam_hypotheses, references=references) - # logging.info("Beam WER {:.2f}%".format(beam_wer * 100)) - # assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > than {:.2f}%".format( - # beam_wer * 100, beam_wer_thr * 100 - # ) - # assert beam_wer <= wer, "Final eval beam WER > than the greedy WER." 
- - # # Reload model weights and train for extra 10 epochs - # checkpointer_callback = nemo.core.CheckpointCallback( - # folder=checkpoint_dir, step_freq=args.checkpoint_save_freq, force_load=True, - # ) - - # # Distributed Data Parallel changes the underlying class so we need - # # to reinstantiate Encoder and Decoder - # args.num_epochs += 10 - # previous_step_count = total_steps - # loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(args.model_config, vocab, args, nf) - - # nf.reset_trainer() - # nf.train( - # tensors_to_optimize=[loss], - # callbacks=callbacks, - # optimizer=args.optimizer, - # lr_policy=CosineAnnealing(warmup_steps=previous_step_count, total_steps=total_steps), - # optimization_params={ - # "num_epochs": args.num_epochs, - # "lr": args.lr / 100, - # "momentum": args.momentum, - # "betas": betas, - # "weight_decay": args.weight_decay, - # "grad_norm_clip": None, - # }, - # reset=True, - # amp_max_loss_scale=256.0, - # # synced_batchnorm=(nf.global_rank is not None), - # ) - - # evaluated_tensors = nf.infer(eval_tensors) - # if nf.global_rank in [0, None]: - # greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab) - # references = post_process_transcripts(evaluated_tensors[2], evaluated_tensors[3], vocab) - # wer_new = word_error_rate(hypotheses=greedy_hypotheses, references=references) - # logging.info("New greedy WER: {:.2f}%".format(wer_new * 100)) - # if wer_new > wer * 1.1: - # nf.sync_all_processes(False) - # raise ValueError( - # f"Fine tuning: new WER {wer_new * 100:.2f}% > than the " f"previous WER {wer * 100:.2f}%" - # ) - # nf.sync_all_processes() - - # # Open the log file and ensure that epochs is strictly increasing - # if nf._exp_manager.log_file: - # epochs = [] - # with open(nf._exp_manager.log_file, "r") as log_file: - # line = log_file.readline() - # while line: - # index = line.find("Starting epoch") - # if index != -1: - # epochs.append(int(line[index + len("Starting epoch") :])) - # line = log_file.readline() - # for i, e in enumerate(epochs): - # if i != e: - # raise ValueError("Epochs from logfile was not understood") - - -if __name__ == "__main__": - main() diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index a1742ab1e81c..efd3c9bfcf83 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -20,9 +20,9 @@ from nemo.backends.pytorch.nm import DataLayerNM, TrainableNM from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor -from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves -from nemo.core.neural_types import * +from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback +from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves, TrainingState +from nemo.core.neural_types import NeuralType, AxisKind from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated from nemo.utils.helpers import get_checkpoint_from_dir @@ -50,48 +50,6 @@ } -class TrainingState: - def __init__(self, action): - tensor_naming_registery = AppState().tensor_names - self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) - self._action = action - - def tensor_list(self): - return self.tensor_dict.keys() - - def 
clear_dict(self): - for name in self.tensor_dict: - self.tensor_dict[name] = None - - def set_tensor(self, tensor, value): - self.tensor_dict[tensor.unique_name] = value - - def check_tensor_cached(self, unique_name): - if self.tensor_dict[unique_name] is None: - return False - return True - - def get_tensor(self, name): - if isinstance(name, NmTensor): - unique_name = name.unique_name - else: - unique_name = AppState().tensor_names[name] - tensor_value = self.tensor_dict[unique_name] - if tensor_value is None: - nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] - callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) - # print(callchain) - callchain.insert(0, ()) - self._action.nm_graph_forward_pass(callchain, self.tensor_dict) - # print(self.tensor_dict[unique_name]) - tensor_value = self.tensor_dict[unique_name] - return tensor_value - - # unique_name = AppState().tensor_names[name] - # return self.tensor_dict[unique_name] - # def get_and_compute_tensor(self, name): - - class PtActions(Actions): def __init__( self, local_rank=None, global_rank=None, tb_writer=None, optimization_level=Optimization.mxprO0, @@ -161,8 +119,16 @@ def epoch(self, epoch): def epoch_num(self): return self._epoch - def __get_top_sorted_modules_and_dataloader(self, hook): - """ TODO + def __get_top_sorted_modules_and_dataloader(self, hook: List[NmTensor]): + """A function that accepts a list of NmTensors that need to be computed and constructs a call DAG that starts + from a datalayerNM and can be used to compute the NmTensors. + + args: + leaf_nmtensors (List[NmTensors]): The tensors to be computed + + returns: + top_sorted_modules: the callchain DAG + tdataset: the datalayer at the top of the callchain """ top_sorted_modules = topological_sort_from_leaves(hook) @@ -1119,7 +1085,7 @@ def _perform_on_batch_start(callbacks, state): if isinstance(callback, ActionCallback): continue elif isinstance(callback, NeMoCallback): - callback.on_epoch_start(state) + callback.on_batch_start(state) else: raise ValueError( "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" @@ -1131,7 +1097,7 @@ def _perform_on_batch_end(callbacks, state): if isinstance(callback, ActionCallback): continue elif isinstance(callback, NeMoCallback): - callback.on_epoch_end(state) + callback.on_batch_end(state) else: raise ValueError( "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" @@ -1152,9 +1118,14 @@ def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): else: # For now, we can use the old callback function. In the future we should improve this registered_tensors["loss"] = final_loss - def get_state(action): + def get_state(action: 'PtAction'): + """Helper function used to create a state for callbacks + """ class StateWrapper(dict): def __init__(self, action): + """A class that wraps a dictionary but adds the functions: restore_state_from and save_state_to + which are helper functions for CheckpointCallback to use. 
+ """ self.action = action super().__init__( { @@ -1467,6 +1438,9 @@ def save_state_to(self, path): # Register iteration start with callbacks _perform_on_step_start(callbacks, get_state(self)) + # Perform batch start callbacks + _perform_on_batch_start(callbacks, get_state(self)) + # set learning rate policy if lr_policy is not None: adjusted_lr = lr_policy(optimization_params["lr"], self.step, self.epoch) @@ -1550,6 +1524,9 @@ def save_state_to(self, path): else: final_loss.backward(bps_scale.to(final_loss.get_device())) + # Perform batch end callbacks + _perform_on_batch_end(callbacks, get_state(self)) + batch_counter += 1 if batch_counter == batches_per_step: diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 583563819041..29ca0713a8a9 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -25,11 +25,10 @@ 'DeploymentFormat', ] -import copy import random from abc import ABC, abstractmethod from enum import Enum -from typing import List, Optional +from typing import List, Optional, Union import numpy as np @@ -37,13 +36,90 @@ from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils import ExpManager, logging -from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated +from nemo.utils.app_state import AppState + +class TrainingState: + def __init__(self, action: 'Actions'): + """A class used to wrap the current training state of an Actions.train() function. This class holds a mapping + of tensor.unique_name -> it's backend tensor (eg Pytorch Tensor) or None if the tensor has been been computed + on the current step. + + args: + action (Actions): The Actions object this state is associated with. + """ + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action + def tensor_list(self): + """Returns a list the unique names of all tensors. + """ + return self.tensor_dict.keys() + + def clear_dict(self): + """Clears the dictionary by setting all values to None. Used in-between training batches to clear it's state. + """ + for name in self.tensor_dict: + self.tensor_dict[name] = None -# def topological_sort_from_leaves(leaf_nmtensors, cached_training_state: TrainingState = None): -def topological_sort_from_leaves(leaf_nmtensors, cached_training_state=None): - from nemo.backends.pytorch.nm import DataLayerNM + def set_tensor(self, tensor: NmTensor, value: 'torch.Tensor'): + """Sets the value of tensor + + args: + tensor (NmTensor) + value (torch.Tensor) + """ + self.tensor_dict[tensor.unique_name] = value + + def check_tensor_cached(self, unique_name: str): + """Checks to see the tensor value has been computed in the current step yet. + + args: + unique_name (str): The NmTensor.unique_name that we want to check for. + """ + if self.tensor_dict[unique_name] is None: + return False + return True + + def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): + """Returns the value associated with a tensor. And optionally, computes the value of the tensor if not already + set. + + args: + name (str, NmTensor): The user-defined name for a tensor or the NmTensor itself. + compute (bool): If True and the tensor has not already been computed, there will be an attempt to create a + call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return + None if the tensor has not been computed yet. 
+ Defaults to True. + """ + if isinstance(name, NmTensor): + unique_name = name.unique_name + else: + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None and compute: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + callchain.insert(0, ()) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + + +def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: TrainingState = None): + """A function that accepts a list of NmTensors that need to be computed and constructs a callchain DAG that starts + from a datalayerNM and can be used to compute the NmTensors. + + args: + leaf_nmtensors (List[NmTensors]): The tensors to be computed + cached_training_state (TrainingState): A dictionary of already computed tensors. + Defaults to None meaning an empty cache. + + returns: + top_sorted_modules: the callchain DAG + """ + from nemo.backends.pytorch.nm import DataLayerNM # TODO: Replace this with a backend agnostic data layer def create_node(producer, producer_args): if producer_args is None: diff --git a/nemo/core/neural_types/neural_type.py b/nemo/core/neural_types/neural_type.py index d3da8a80fdf5..c52f2680b73c 100644 --- a/nemo/core/neural_types/neural_type.py +++ b/nemo/core/neural_types/neural_type.py @@ -324,8 +324,12 @@ def unique_name(self): raise ValueError("This NmTensor does not have a unique name") return f"{self._output_port_name}~~~{self._producer_name}~~~{self._uuid}" - def rename(self, new_name): - """TODO + def rename(self, new_name: str): + """Renames the tensor from its old name to a new user-defined name for easy access within callbacks. Note, + a tensor's unique_name is never changed. This simply adds a reference from new_name -> tensor.unique_name + + args: + new_name (str): the new tensor's name. """ AppState().tensor_names.rename_NmTensor(self, new_name) diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py index 5055319c2cef..f1d9591039a4 100755 --- a/nemo/core/neural_types/nmtensor_registry.py +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -18,30 +18,27 @@ class NmTensorNameRegistry: def __init__(self): """ - Constructor. Initializes the manager. Sets active graph to None. + Constructor. Initializes the NmTensorNameRegistry. Reserves the default 'loss' name. - TODO: Should probably be a property of a graph + TODO: We should be recording the tensors of each graph rather than all the tensors. """ # Create the nmtensor_naming_dict # which contains a mapping of str to NMTensor.unique_name self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' - # self._nmtensor_uniname_set = set(["loss"]) self._nmtensor_uniname_dict = {"loss": None} - # def summary(self): - # """ Prints a nice summary. """ - # desc = "" - # for graph in self: - # desc = desc + "`{}`: {}\n".format(graph.name, graph) - # return desc - @property def unique_names(self): + """Returns the set of all NmTensors.unique_names + 'loss' + """ return self._nmtensor_uniname_dict.keys() - # def register(self, tensor: NmTensor): - def register(self, tensor): - """TODO + def register(self, tensor: 'NmTensor'): + """Helper function to register a newly created NmTensor by adding it to self.__nmtensor_uniname_dict. 
+ Should be called from NmTensor.__init__() + + args: + tensor (NmTensor): The tensor to be registered. """ # Check if object is already in a set. @@ -51,9 +48,12 @@ def register(self, tensor): # Finally, add object to the set. self._nmtensor_uniname_dict[tensor.unique_name] = tensor - # def rename_NmTensor(self, tensor: NmTensor, new_name: str): - def rename_NmTensor(self, tensor, new_name: str): - """ TODO + def rename_NmTensor(self, tensor: 'NmTensor', new_name: str): + """Helper function that changes the naming dictionary to facilitate user name -> tensor.unique_name lookup. + + args: + tensor (NmTensor): The tensor to be renamed. + new_name (str): its new name. """ # Find old name if exists old_name = tensor.unique_name @@ -68,7 +68,7 @@ def rename_NmTensor(self, tensor, new_name: str): raise KeyError(f"{new_name} already exists in current graph. Please use a unique name") self._nmtensor_naming_dict[new_name] = tensor.unique_name - def __getitem__(self, key): + def __getitem__(self, key: str): """ Object getter function. diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 6183526b87fe..8bbf120c0f60 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -52,10 +52,10 @@ def __init__(self, device=None): @property def tensor_names(self): - """ Property returning the existing modules. + """ Property returning the NmTensorNameRegistry which maps user-defined names to tensor's unique_names. Returns: - Existing modules (a set object). + NmTensorNameRegistry. """ return self._nmtensor_name_registry From 3c3bee9e74f584b3f865665ea5d4f94ecaeebd66 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 15:09:29 -0700 Subject: [PATCH 24/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 5 +++-- nemo/core/neural_factory.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index efd3c9bfcf83..94f82c5399c5 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,8 +21,8 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, topological_sort_from_leaves, TrainingState -from nemo.core.neural_types import NeuralType, AxisKind +from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves +from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated from nemo.utils.helpers import get_checkpoint_from_dir @@ -1121,6 +1121,7 @@ def _update_callbacks(callbacks=None, registered_tensors=None, final_loss=None): def get_state(action: 'PtAction'): """Helper function used to create a state for callbacks """ + class StateWrapper(dict): def __init__(self, action): """A class that wraps a dictionary but adds the functions: restore_state_from and save_state_to diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 29ca0713a8a9..2dc63ffca36b 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -36,8 +36,9 @@ from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils import ExpManager, logging -from nemo.utils.decorators 
import deprecated from nemo.utils.app_state import AppState +from nemo.utils.decorators import deprecated + class TrainingState: def __init__(self, action: 'Actions'): From dba45362f785d6d74fe1cac3b00aadc7e6b528fe Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 16:34:38 -0700 Subject: [PATCH 25/40] callback docstrings Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 7 + nemo/core/callbacks.py | 581 ++++++++++++++++++------------- nemo/core/neural_factory.py | 11 +- 3 files changed, 353 insertions(+), 246 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 94f82c5399c5..86d8ce6aaba7 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1126,6 +1126,13 @@ class StateWrapper(dict): def __init__(self, action): """A class that wraps a dictionary but adds the functions: restore_state_from and save_state_to which are helper functions for CheckpointCallback to use. + The StateWrapper is a dictionary that contains the following mapping: + "step" (int): the current training step + "epoch" (int): the current epoch step + "local_rank" (int): the local rank that the process is running on + "global_rank" (int): the global rank that the process is running on + "optimizers" (list): a list of optimizers defined during the training process + "tensors" (TrainingState): A TrainingState object that can be used to access tensor values """ self.action = action super().__init__( diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index a9a4970d0254..2adac28d5530 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -23,6 +23,7 @@ import warnings from abc import ABC, abstractmethod from collections import namedtuple +from typing import Callable, List, Union import nemo from nemo.utils import get_checkpoint_from_dir, logging @@ -37,6 +38,11 @@ class NeMoCallback(ABC): + """The base class for callbacks inside of NeMo. It contains no __init__ which children classes are responsible for. + Each callback contains 8 functions which are called at different stages of train(). All functions must take as the + first argument: the current action state. This state is a StateWrapper object. + TODO: Add a link to documentation. 
+ """ def on_train_start(self, state): pass @@ -62,113 +68,10 @@ def on_train_end(self, state): pass -class TensorboardLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"], tb_writer=None, custom_tb_log_func=None): - # Step_freq: how often logs are printed - self.step_freq = step_freq - self.tensors_to_log = tensors_to_log - if tb_writer is None: - logging.error("There was no tb writer") - # Should grab this from default tb writer - else: - self.tb_writer = tb_writer - self.custom_tb_log_func = custom_tb_log_func - self._last_epoch_start = None - - # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): - # #tensors_to_log: List of keys into state that will be logged - - def on_epoch_start(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - self._last_epoch_start = time.time() - - def on_epoch_end(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - # always log epoch num and epoch_time - epoch_time = time.time() - self._last_epoch_start - self.tb_writer.add_scalar('misc/epoch', state["epoch"], state["step"]) - self.tb_writer.add_scalar('misc/epoch_time', epoch_time, state["step"]) - - def on_step_end(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - if state["step"] % self.step_freq == 0: - tb_log_func = lambda x: self.tb_writer.add_scalar(x, state["tensors"].get_tensor(x), state["step"]) - if self.custom_tb_log_func is not None: - tb_log_func = self.custom_tb_log_func - for tensor_key in self.tensors_to_log: - tb_log_func(tensor_key) - - -class WandBLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"], wandb_name=None, wandb_project=None, args=None): - if not _WANDB_AVAILABLE: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - self._step_freq = step_freq - self._tensors_to_log = tensors_to_log - self._name = wandb_name - self._project = wandb_project - self._args = args - self._last_epoch_start = None - - def on_train_start(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - if _WANDB_AVAILABLE and wandb.run is None: - wandb.init(name=self._name, project=self._project) - if self._args is not None: - wandb.config.update(self._args) - elif _WANDB_AVAILABLE and wandb.run is not None: - logging.info("Re-using wandb session") - else: - logging.error("Could not import wandb. 
Did you install it (pip install --upgrade wandb)?") - logging.info("Will not log data to weights and biases.") - self._step_freq = -1 - - def on_step_end(self, state): - # log training metrics - if state["global_rank"] is None or state["global_rank"] == 0: - if state["step"] % self._step_freq == 0 and self._step_freq > 0: - tensors_logged = {t: state["tensors"].get_tensor(t).cpu() for t in self._tensors_to_log} - # Always log learning rate - tensors_logged['LR'] = state["learning_rate"] - self._wandb_log(tensors_logged) - - def on_epoch_start(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - self._last_epoch_start = time.time() - - def on_epoch_end(self, state): - if state["global_rank"] is None or state["global_rank"] == 0: - # always log epoch num and epoch_time - epoch_time = time.time() - self._last_epoch_start - self._wandb_log({"epoch": state["epoch"], "epoch_time": epoch_time}) - - def _wandb_log(self, tensors_logged): - if _WANDB_AVAILABLE: - wandb.log(tensors_logged, step=state["step"]) - - -class SimpleLossLogger(NeMoCallback): - def __init__(self, step_freq=100, tensors_to_log=["loss"]): - # Step_freq: how often logs are printed - self.step_freq = step_freq - self.tensors_to_log = tensors_to_log - - # def on_optimizer_step_stop(self, state, tensors_to_log=[“loss”]): - # #tensors_to_log: List of keys into state that will be logged - - def on_step_end(self, state): - if state["step"] % self.step_freq == 0: - for tensor_key in self.tensors_to_log: - tensor = state["tensors"].get_tensor(tensor_key) - if tensor is None: - tensor = state["tensors"].get_and_compute_tensor(tensor_key) - logging.info("%s: %s", tensor_key, tensor) - # except KeyError: - # raise KeyError(f"{self} was passed {tensor_key} but the tensor was not found in the state_dict. " - # f"Current state tensors include {state['tensors'].tensor_list()}") - - def on_train_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_train_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -180,6 +83,9 @@ def on_train_start(self, state): def on_epoch_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_epoch_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -191,6 +97,9 @@ def on_epoch_start(self, state): def on_batch_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_batch_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -202,6 +111,9 @@ def on_batch_start(self, state): def on_step_start(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_step_start callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -213,6 +125,9 @@ def on_step_start(self, state): def on_step_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_step_end callback event. 
+ """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -224,6 +139,9 @@ def on_step_end(self, state): def on_batch_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_batch_end callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -235,6 +153,9 @@ def on_batch_end(self, state): def on_epoch_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_epoch_end callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -246,6 +167,9 @@ def on_epoch_end(self, state): def on_train_end(func): + """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the + on_train_end callback event. + """ class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -256,6 +180,313 @@ def on_train_end(self, state): return NeMoCallbackWrapper(func) + +class SimpleLogger(NeMoCallback): + def __init__(self, step_freq:int = 100, tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"]): + """A simple callback that prints tensors to screen. It's default option is to print the training loss every + 100 steps. Additional tensors can be printed by adding them to the tensors_to_log argument. + + args: + step_freq (int): The frequency of printing to screen. Defaults to every 100 steps + tensors_to_log (List of str or NmTensor): A list of either tensor names or NmTensors which will be printed + every step_freq steps. + Defaults to ["loss"] which only prints the loss. + """ + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + + def on_step_end(self, state): + if state["step"] % self.step_freq == 0: + for tensor_key in self.tensors_to_log: + tensor = state["tensors"].get_tensor(tensor_key) + logging.info("%s: %s", tensor_key, tensor) + + +class TensorboardLogger(NeMoCallback): + def __init__( + self, + tb_writer: 'torch.utils.tensorboard.SummaryWriter', + step_freq:int=100, + tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], + custom_tb_log_func:Callable[[Union[str, 'NmTensor']],None]=None, + log_epoch:bool=True + ): + """A tensorboard callback that logs tensors using a tensorboard writer object. It's default option is to log + the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log + argument. In order to log complex tensorboard entities, the custom_tb_log_func must be passed it. By default, + it always logs the current epoch and the time taken per epoch. + + args: + tb_writer (required): The tensorboard logger object. + step_freq (int): The frequency of tensorboard logging. Defaults to every 100 steps + tensors_to_log (List of str or NmTensor): A list of either tensor names or NmTensors which will be logged + every step_freq steps. + Defaults to ["loss"] which only prints the loss. + custom_tb_log_func (func): TensorboardLogger loops through tensors_to_log and passes these elements to + custom_tb_log_func. So a custom_tb_log_func will receive one argument on each call with the arugment + being an element from tensors_to_log. + Defaults to None which logs each tensors_to_log as a scalar. + log_epoch (bool): Whether to log epoch and epoch training time to tensorboard. + Defaults to True. 
+ """ + self.step_freq = step_freq + self.tensors_to_log = tensors_to_log + self.tb_writer = tb_writer + self.custom_tb_log_func = custom_tb_log_func + self._last_epoch_start = None + self._log_epoch = log_epoch + + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if self._log_epoch: + epoch_time = time.time() - self._last_epoch_start + self.tb_writer.add_scalar('misc/epoch', state["epoch"], state["step"]) + self.tb_writer.add_scalar('misc/epoch_time', epoch_time, state["step"]) + + def on_step_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self.step_freq == 0: + tb_log_func = lambda x: self.tb_writer.add_scalar(x, state["tensors"].get_tensor(x), state["step"]) + if self.custom_tb_log_func is not None: + tb_log_func = self.custom_tb_log_func + for tensor_key in self.tensors_to_log: + tb_log_func(tensor_key) + + +class WandBLogger(NeMoCallback): + def __init__( + self, + step_freq:int=100, + tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], + wandb_name:str=None, + wandb_project:str=None, + args=None, + log_epoch:bool=True + ): + """A [Weights & Biases](https://docs.wandb.com/) callback that logs tensors to W&B. It's default option is to + log the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log + argument. By default, it always logs the current epoch and the time taken per epoch. + + args: + step_freq (int): The frequency of tensorboard logging. Defaults to every 100 steps + tensors_to_log (List of str or NmTensor): A list of either tensor names or NmTensors which will be logged + every step_freq steps. + Defaults to ["loss"] which only prints the loss. + wandb_name(str): wandb experiment name. + Defaults to None + wandb_project(str): wandb project name. + Defaults to None + args: argparse flags which will be logged as hyperparameters. + Defaults to None. + log_epoch (bool): Whether to log epoch and epoch training time to tensorboard. + Defaults to True. + """ + if not _WANDB_AVAILABLE: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + self._step_freq = step_freq + self._tensors_to_log = tensors_to_log + self._name = wandb_name + self._project = wandb_project + self._args = args + self._last_epoch_start = None + self._log_epoch = log_epoch + + def on_train_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._name, project=self._project) + if self._args is not None: + wandb.config.update(self._args) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. 
Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._step_freq = -1 + + def on_step_end(self, state): + # log training metrics + if state["global_rank"] is None or state["global_rank"] == 0: + if state["step"] % self._step_freq == 0 and self._step_freq > 0: + tensors_logged = {t: state["tensors"].get_tensor(t).cpu() for t in self._tensors_to_log} + # Always log learning rate + tensors_logged['LR'] = state["learning_rate"] + self._wandb_log(tensors_logged) + + def on_epoch_start(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self, state): + if state["global_rank"] is None or state["global_rank"] == 0: + if self._log_epoch: + epoch_time = time.time() - self._last_epoch_start + self._wandb_log({"epoch": state["epoch"], "epoch_time": epoch_time}) + + def _wandb_log(self, tensors_logged): + if _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=state["step"]) + + +class CheckpointCallback(NeMoCallback): + def __init__( + self, + folder:str, + load_from_folder:str=None, + step_freq:int=-1, + epoch_freq:int=-1, + checkpoints_to_keep:int=4, + force_load:bool=False, + ): + """A callback that does checkpointing of module weights and trainer (incl. optimizer) status. + + args: + folder (str, required): A path where checkpoints are to be stored and loaded from if load_from_folder is + None. + load_from_folder (str): A path where checkpoints can be loaded from. + Defaults to None. + step_freq (int): How often in terms of steps to save checkpoints. One of step_freq or epoch_freq is + required. + epoch_freq (int): How often in terms of epochs to save checkpoints. One of step_freq or epoch_freq is + required. + checkpoints_to_keep (int): Number of most recent checkpoints to keep. Older checkpoints will be deleted. + Defaults to 4. + force_load (bool): Whether to crash if loading is unsuccessful. + Defaults to False + """ + if step_freq == -1 and epoch_freq == -1: + logging.warning("No checkpoints will be saved because step_freq and epoch_freq are both -1.") + + if step_freq > -1 and epoch_freq > -1: + logging.warning("You config the model to save by both steps and epochs. Please use one or the other") + epoch_freq = -1 + + self._step_freq = step_freq + self._epoch_freq = epoch_freq + self._folder = folder + self._load_from_folder = load_from_folder if load_from_folder else folder + self._ckpt2keep = checkpoints_to_keep + self._saved_ckpts = [] + # If True, run will fail if we cannot load module weights + self._force_load = force_load + + def __save_to(self, path, state): + if state["global_rank"] is not None and state["global_rank"] != 0: + return + if not os.path.isdir(path): + logging.info(f"Creating {path} folder") + os.makedirs(path, exist_ok=True) + unique_mod_names = set() + for module in AppState().modules: + if module.num_weights > 0: + if str(module) in unique_mod_names: + raise NotImplementedError( + "There were two instances of the same module. Please overwrite __str__() of one of the " + "modules." 
+ ) + unique_mod_names.add(str(module)) + if self._step_freq > -1: + filename = f"{module}-STEP-{state['step']}.pt" + else: + filename = f"{module}-EPOCH-{state['epoch']}.pt" + module.save_to(os.path.join(path, filename)) + + if self._step_freq > -1: + filename = f"trainer-STEP-{state['step']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['step']}.pt") + else: + filename = f"trainer-EPOCH-{state['epoch']}.pt" + state.save_state_to(f"{path}/{filename}") + self._saved_ckpts.append(f"-{state['epoch']}.pt") + + if len(self._saved_ckpts) > self._ckpt2keep: + for end in self._saved_ckpts[: -self._ckpt2keep]: + for file in glob.glob(f'{path}/*{end}'): + os.remove(file) + self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] + logging.info(f'Saved checkpoint: {path}/{filename}') + + def __restore_from(self, path, state): + if not os.path.isdir(path): + if self._force_load: + raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.") + logging.warning(f"Checkpoint folder {path} not found!") + else: + logging.info(f"Found checkpoint folder {path}. Will attempt to restore checkpoints from it.") + modules_to_restore = [] + modules_to_restore_name = [] + for module in AppState().modules: + if module.num_weights > 0: + modules_to_restore.append(module) + modules_to_restore_name.append(str(module)) + try: + module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) + + for mod, checkpoint in zip(modules_to_restore, module_checkpoints): + mod.restore_from(checkpoint, state["local_rank"]) + except (ValueError) as e: + if self._force_load: + raise ValueError( + "force_load was set to True for checkpoint callback but a checkpoint was not found." + ) + logging.warning(e) + logging.warning( + f"Checkpoint folder {path} was present but nothing was restored. Continuing training from random " + "initialization." + ) + return + + try: + trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) + state.restore_state_from(trainer_checkpoints[0]) + # for tr, checkpoint in zip([self.action], trainer_checkpoints): + except (ValueError) as e: + logging.warning(e) + logging.warning( + "Trainer state such as optimizer state and current step/epoch was not restored. Pretrained weights" + " have still been restore and fine-tuning should continue fine." + ) + return + + def on_train_start(self, state): + num_parameters = 0 + unique_mod_names = set() + for module in AppState().modules: + if module.num_weights > 0: + if str(module) in unique_mod_names: + raise NotImplementedError( + "There were two instances of the same module. Please overwrite __str__() of one of the " + "modules." + ) + unique_mod_names.add(str(module)) + num_parameters += module.num_weights + logging.info(f"Found {len(unique_mod_names)} modules with weights:") + for name in unique_mod_names: + logging.info(f"{name}") + logging.info(f"Total model parameters: {num_parameters}") + self.__restore_from(self._load_from_folder, state) + + def on_step_end(self, state): + step = state["step"] + if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: + self.__save_to(self._folder, state) + + def on_train_end(self, state): + if self._step_freq > 0 or self._epoch_freq > 0: + self.__save_to(self._folder, state) + + def on_epoch_end(self, state): + epoch = state["epoch"] + if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: + self.__save_to(self._folder, state) + + + class ActionCallback(ABC): """Abstract interface for callbacks. 
""" @@ -458,144 +689,6 @@ def on_iteration_end(self): logging.info(f"Step time: {run_time} seconds") -class CheckpointCallback(NeMoCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - def __init__( - self, folder, load_from_folder=None, step_freq=-1, epoch_freq=-1, checkpoints_to_keep=4, force_load=False, - ): - super().__init__() - if step_freq == -1 and epoch_freq == -1: - logging.warning("No checkpoints will be saved because step_freq and epoch_freq are both -1.") - - if step_freq > -1 and epoch_freq > -1: - logging.warning("You config the model to save by both steps and epochs. Please use one or the other") - epoch_freq = -1 - - self._step_freq = step_freq - self._epoch_freq = epoch_freq - self._folder = folder - self._load_from_folder = load_from_folder if load_from_folder else folder - self._ckpt2keep = checkpoints_to_keep - self._saved_ckpts = [] - # If True, run will fail if we cannot load module weights - self._force_load = force_load - - def __save_to(self, path, state): - if state["global_rank"] is not None and state["global_rank"] != 0: - return - if not os.path.isdir(path): - logging.info(f"Creating {path} folder") - os.makedirs(path, exist_ok=True) - unique_mod_names = set() - for module in AppState().modules: - if module.num_weights > 0: - if str(module) in unique_mod_names: - raise NotImplementedError( - "There were two instances of the same module. Please overwrite __str__() of one of the " - "modules." - ) - unique_mod_names.add(str(module)) - if self._step_freq > -1: - filename = f"{module}-STEP-{state['step']}.pt" - else: - filename = f"{module}-EPOCH-{state['epoch']}.pt" - module.save_to(os.path.join(path, filename)) - - if self._step_freq > -1: - filename = f"trainer-STEP-{state['step']}.pt" - state.save_state_to(f"{path}/{filename}") - self._saved_ckpts.append(f"-{state['step']}.pt") - else: - filename = f"trainer-EPOCH-{state['epoch']}.pt" - state.save_state_to(f"{path}/{filename}") - self._saved_ckpts.append(f"-{state['epoch']}.pt") - - if len(self._saved_ckpts) > self._ckpt2keep: - for end in self._saved_ckpts[: -self._ckpt2keep]: - for file in glob.glob(f'{path}/*{end}'): - os.remove(file) - self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] - logging.info(f'Saved checkpoint: {path}/{filename}') - - def __restore_from(self, path, state): - if not os.path.isdir(path): - if self._force_load: - raise ValueError("force_load was set to True for checkpoint callback but a checkpoint was not found.") - logging.warning(f"Checkpoint folder {path} not found!") - else: - logging.info(f"Found checkpoint folder {path}. Will attempt to restore checkpoints from it.") - modules_to_restore = [] - modules_to_restore_name = [] - for module in AppState().modules: - if module.num_weights > 0: - modules_to_restore.append(module) - modules_to_restore_name.append(str(module)) - try: - module_checkpoints = get_checkpoint_from_dir(modules_to_restore_name, path) - - for mod, checkpoint in zip(modules_to_restore, module_checkpoints): - mod.restore_from(checkpoint, state["local_rank"]) - except (ValueError) as e: - if self._force_load: - raise ValueError( - "force_load was set to True for checkpoint callback but a checkpoint was not found." - ) - logging.warning(e) - logging.warning( - f"Checkpoint folder {path} was present but nothing was restored. Continuing training from random " - "initialization." 
- ) - return - - try: - trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) - state.restore_state_from(trainer_checkpoints[0]) - # for tr, checkpoint in zip([self.action], trainer_checkpoints): - except (ValueError) as e: - logging.warning(e) - logging.warning( - "Trainer state such as optimizer state and current step/epoch was not restored. Pretrained weights" - " have still been restore and fine-tuning should continue fine." - ) - return - - def on_train_start(self, state): - num_parameters = 0 - unique_mod_names = set() - for module in AppState().modules: - if module.num_weights > 0: - if str(module) in unique_mod_names: - raise NotImplementedError( - "There were two instances of the same module. Please overwrite __str__() of one of the " - "modules." - ) - unique_mod_names.add(str(module)) - num_parameters += module.num_weights - logging.info(f"Found {len(unique_mod_names)} modules with weights:") - for name in unique_mod_names: - logging.info(f"{name}") - logging.info(f"Total model parameters: {num_parameters}") - self.__restore_from(self._load_from_folder, state) - - def on_step_end(self, state): - step = state["step"] - if self._step_freq > 0 and step % self._step_freq == 0 and step > 0: - self.__save_to(self._folder, state) - - def on_train_end(self, state): - if self._step_freq > 0 or self._epoch_freq > 0: - self.__save_to(self._folder, state) - - def on_epoch_end(self, state): - epoch = state["epoch"] - if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: - self.__save_to(self._folder, state) - - class EvaluatorCallback(ActionCallback): """ For callback documentation: please see diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 2dc63ffca36b..8dba04e4acd3 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -78,6 +78,9 @@ def check_tensor_cached(self, unique_name: str): args: unique_name (str): The NmTensor.unique_name that we want to check for. + + returns: + (bool) whether the tensor with unique_name has been computed yet. """ if self.tensor_dict[unique_name] is None: return False @@ -93,6 +96,10 @@ def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return None if the tensor has not been computed yet. Defaults to True. + + returns: + (torch.tensor or None) representing the computed value of the requested name. Returns None if compute is + False and the tensor has not been computed yet. 
""" if isinstance(name, NmTensor): unique_name = name.unique_name @@ -298,7 +305,7 @@ def global_rank(self): def train( self, tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[ActionCallback]], + callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, @@ -740,7 +747,7 @@ def train( training_graph=None, optimizer=None, optimization_params=None, - callbacks: Optional[List[ActionCallback]] = None, + callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]] = None, lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, From d95b2d4d45c7084a696a3035d340fc16352d11ad Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 16:34:58 -0700 Subject: [PATCH 26/40] style Signed-off-by: Jason --- nemo/core/callbacks.py | 55 ++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 2adac28d5530..99a76d5be872 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -43,6 +43,7 @@ class NeMoCallback(ABC): first argument: the current action state. This state is a StateWrapper object. TODO: Add a link to documentation. """ + def on_train_start(self, state): pass @@ -72,6 +73,7 @@ def on_train_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_train_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -86,6 +88,7 @@ def on_epoch_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_epoch_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -100,6 +103,7 @@ def on_batch_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_batch_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -114,6 +118,7 @@ def on_step_start(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_step_start callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -128,6 +133,7 @@ def on_step_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_step_end callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -142,6 +148,7 @@ def on_batch_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_batch_end callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -156,6 +163,7 @@ def on_epoch_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_epoch_end callback event. """ + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -170,6 +178,7 @@ def on_train_end(func): """A function decorator that wraps a Callable inside the NeMoCallback object and runs the function with the on_train_end callback event. 
""" + class NeMoCallbackWrapper(NeMoCallback): def __init__(self, my_func): self._func = my_func @@ -180,9 +189,8 @@ def on_train_end(self, state): return NeMoCallbackWrapper(func) - class SimpleLogger(NeMoCallback): - def __init__(self, step_freq:int = 100, tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"]): + def __init__(self, step_freq: int = 100, tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"]): """A simple callback that prints tensors to screen. It's default option is to print the training loss every 100 steps. Additional tensors can be printed by adding them to the tensors_to_log argument. @@ -204,13 +212,13 @@ def on_step_end(self, state): class TensorboardLogger(NeMoCallback): def __init__( - self, - tb_writer: 'torch.utils.tensorboard.SummaryWriter', - step_freq:int=100, - tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], - custom_tb_log_func:Callable[[Union[str, 'NmTensor']],None]=None, - log_epoch:bool=True - ): + self, + tb_writer: 'torch.utils.tensorboard.SummaryWriter', + step_freq: int = 100, + tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"], + custom_tb_log_func: Callable[[Union[str, 'NmTensor']], None] = None, + log_epoch: bool = True, + ): """A tensorboard callback that logs tensors using a tensorboard writer object. It's default option is to log the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log argument. In order to log complex tensorboard entities, the custom_tb_log_func must be passed it. By default, @@ -259,14 +267,14 @@ def on_step_end(self, state): class WandBLogger(NeMoCallback): def __init__( - self, - step_freq:int=100, - tensors_to_log:List[Union[str, 'NmTensor']]=["loss"], - wandb_name:str=None, - wandb_project:str=None, - args=None, - log_epoch:bool=True - ): + self, + step_freq: int = 100, + tensors_to_log: List[Union[str, 'NmTensor']] = ["loss"], + wandb_name: str = None, + wandb_project: str = None, + args=None, + log_epoch: bool = True, + ): """A [Weights & Biases](https://docs.wandb.com/) callback that logs tensors to W&B. It's default option is to log the loss every 100 steps. Additional scalar tensors can be logged by adding them to the tensors_to_log argument. By default, it always logs the current epoch and the time taken per epoch. @@ -335,12 +343,12 @@ def _wandb_log(self, tensors_logged): class CheckpointCallback(NeMoCallback): def __init__( self, - folder:str, - load_from_folder:str=None, - step_freq:int=-1, - epoch_freq:int=-1, - checkpoints_to_keep:int=4, - force_load:bool=False, + folder: str, + load_from_folder: str = None, + step_freq: int = -1, + epoch_freq: int = -1, + checkpoints_to_keep: int = 4, + force_load: bool = False, ): """A callback that does checkpointing of module weights and trainer (incl. optimizer) status. @@ -486,7 +494,6 @@ def on_epoch_end(self, state): self.__save_to(self._folder, state) - class ActionCallback(ABC): """Abstract interface for callbacks. 
""" From 7bb53cdfcde685f74368c9b1d735195251b94fe7 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 17:00:23 -0700 Subject: [PATCH 27/40] add deprecation warnings Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 7 ++++++- nemo/core/callbacks.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 86d8ce6aaba7..49848ebe0c42 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -115,7 +115,12 @@ def epoch(self, epoch): self._epoch = epoch @property - @deprecated + @deprecated(version="0.12", explanation="epoch_num has been deprecated in favour of epoch.") + def epoch_num(self): + return self._epoch + + @epoch_num.setter + @deprecated(version="0.12", explanation="epoch_num has been deprecated in favour of epoch.") def epoch_num(self): return self._epoch diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 99a76d5be872..511da94199a3 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -28,6 +28,7 @@ import nemo from nemo.utils import get_checkpoint_from_dir, logging from nemo.utils.app_state import AppState +from nemo.utils.decorators import deprecated try: import wandb @@ -560,6 +561,7 @@ class ModuleSaverCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__( self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, ): @@ -618,6 +620,7 @@ class SimpleLossLoggerCallback(ActionCallback): https://nvidia.github.io/NeMo/tutorials/callbacks.html """ + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__( self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, ): @@ -835,6 +838,7 @@ class ValueSetterCallback(ActionCallback): Policy = _Policy Method = _Method + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): super().__init__() @@ -880,6 +884,7 @@ def on_iteration_start(self): class UnfreezeCallback(ActionCallback): + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__(self, modules, start_epoch=0): super().__init__() @@ -897,6 +902,7 @@ class OldWandbCallback(ActionCallback): Log metrics to [Weights & Biases](https://docs.wandb.com/) """ + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") def __init__( self, train_tensors=[], wandb_name=None, wandb_project=None, args=None, update_freq=25, ): From d615efa060a3eacbb32948292446d80c97a075a7 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 17:01:19 -0700 Subject: [PATCH 28/40] changelog Signed-off-by: Jason --- CHANGELOG.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a749af6e06b..5ffaf33f5b89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,6 +83,7 @@ To release a new version, please update the changelog as followed: ### Changed - Syncs across workers at each step to check for NaN or inf loss. Terminates all workers if stop\_on\_nan\_loss is set (as before), lets Apex deal with it if apex.amp optimization level is O1 or higher, and skips the step across workers otherwise. 
([PR #637](https://github.com/NVIDIA/NeMo/pull/637)) - @redoctopus +- Updated the callback system. Old callbacks will be deprecated in version 0.12. ([PR #615](https://github.com/NVIDIA/NeMo/pull/615)) - @blisc ### Dependencies Update @@ -123,7 +124,7 @@ files, along with unit tests, examples and tutorials ([PR #375](https://github.com/NVIDIA/NeMo/pull/375)) - @titu1994 ### Changed -- Refactoring of `nemo_nlp` collections: +- Refactoring of `nemo_nlp` collections: ([PR #368](https://github.com/NVIDIA/NeMo/pull/368)) - @VahidooX, @yzhang123, @ekmb - renaming and restructuring of files, folder, and functions in `nemo_nlp` - losses cleaned up. LossAggregatorNM moved to nemo/backends/pytorch/common/losses @@ -138,7 +139,7 @@ files, along with unit tests, examples and tutorials ([PR #284](https://github.com/NVIDIA/NeMo/pull/284)) - @stasbel - NeMo is not longer using pep8 code style rules. Code style rules are now enforced with `isort` and `black` incorporated into CI checks. ([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel -- Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). +- Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). ([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia - Updated nemo's use of the logging library. from nemo import logging is now the reccomended way of using the nemo logger. neural_factory.logger and all other instances of logger are now deprecated and planned for removal in the next version. Please see PR 267 for complete change information. 
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc @@ -147,7 +148,7 @@ files, along with unit tests, examples and tutorials - Added TRADE (dialogue state tracking model) on MultiWOZ dataset ([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX -- Question answering: +- Question answering: ([PR #390](https://github.com/NVIDIA/NeMo/pull/390)) - @yzhang123 - Changed question answering task to use Roberta and Albert as alternative backends to Bert - Added inference mode that does not require ground truth labels @@ -158,7 +159,7 @@ files, along with unit tests, examples and tutorials ### Deprecated ### Fixed -- Critical fix of the training action on CPU +- Critical fix of the training action on CPU ([PR #308](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia - Fixed issue in Tacotron 2 prenet ([PR #444](https://github.com/NVIDIA/NeMo/pull/444)) - @blisc From 21f4cf10bba1082bf174cc585c0751840c99f04f Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 21 May 2020 17:09:51 -0700 Subject: [PATCH 29/40] rename oldwandbcallback Signed-off-by: Jason --- nemo/core/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 511da94199a3..97784d2b66e3 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -897,7 +897,7 @@ def on_iteration_start(self): m.unfreeze() -class OldWandbCallback(ActionCallback): +class WandbCallback(ActionCallback): """ Log metrics to [Weights & Biases](https://docs.wandb.com/) """ From 1c99f548b93366528c68ef6af93e1b4d74c59d82 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 22 May 2020 15:11:54 -0700 Subject: [PATCH 30/40] test Signed-off-by: Jason --- nemo/core/callbacks.py | 16 ++ nemo/utils/nemo_logging.py | 33 ++++ tests/unit/core/test_nemo_callbacks.py | 209 +++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100755 tests/unit/core/test_nemo_callbacks.py diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 97784d2b66e3..4b9826e9b6c1 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -15,6 +15,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+# __all__ = [ +# "NeMoCallback", +# "SimpleLogger", +# "TensorboardLogger", +# "WandBLogger", +# "CheckpointCallback", +# "on_train_start", +# "on_train_end", +# "on_epoch_start", +# "on_epoch_end", +# "on_batch_start", +# "on_batch_end", +# "on_step_start", +# "on_step_end", +# ] + import datetime import glob import os diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 8a2bd06040d6..ee5cb0f6ee4d 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -212,6 +212,39 @@ def patch_stderr_handler(self, stream): else: raise RuntimeError("Impossible to patch logging handlers if handler does not exist") + @contextmanager + def patch_stdout_handler(self, stream): + """ Useful for unittests + """ + if self._logger is not None: + try: + old_stream = self._handlers["stream_stdout"].stream + if old_stream is None: + raise ValueError + + # Port backwards set_stream() from python 3.7 + self._handlers["stream_stdout"].acquire() + try: + self._handlers["stream_stdout"].flush() + self._handlers["stream_stdout"].stream = stream + finally: + self._handlers["stream_stdout"].release() + + yield stream + except (KeyError, ValueError): + raise RuntimeError("Impossible to patch logging handlers if handler does not exist") + finally: + # Port backwards set_stream() from python 3.7 + self._handlers["stream_stdout"].acquire() + try: + self._handlers["stream_stdout"].flush() + self._handlers["stream_stdout"].stream = old_stream + finally: + self._handlers["stream_stdout"].release() + + else: + raise RuntimeError("Impossible to patch logging handlers if handler does not exist") + @contextmanager def temp_verbosity(self, verbosity_level): """Sets the a temporary threshold for what messages will be logged.""" diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py new file mode 100755 index 000000000000..2242ece9775e --- /dev/null +++ b/tests/unit/core/test_nemo_callbacks.py @@ -0,0 +1,209 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import os +import shutil +from io import StringIO + +import pytest +from tensorboard.backend.event_processing import event_file_inspector as efi +from torch.utils.tensorboard import SummaryWriter + +from nemo.backends.pytorch.tutorials import MSELoss, RealFunctionDataLayer, TaylorNet +from nemo.core.callbacks import * +from nemo.utils import logging + +@pytest.mark.usefixtures("neural_factory") +class TestNeMoCallbacks(): + @pytest.fixture() + def clean_up(self): + yield + self.nf.reset_trainer() + + @pytest.fixture() + def create_tensorboard_file(self): + os.makedirs("temp") + summary_writter = SummaryWriter("temp") + yield summary_writter + shutil.rmtree("temp") + + @pytest.mark.unit + def test_SimpleLogger(self, clean_up): + data_source = RealFunctionDataLayer(n=100, batch_size=1) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + # Mock up both std and stderr streams. + with logging.patch_stdout_handler(StringIO()) as std_out: + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=[SimpleLogger(step_freq=1)], + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + output_lines = std_out.getvalue().splitlines() + assert len(output_lines) == 4 + for line in output_lines: + assert "loss" in line + + @pytest.mark.unit + def test_rename_and_log(self, clean_up): + data_source = RealFunctionDataLayer(n=100, batch_size=1) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + y_pred.rename("y_pred") + + # Mock up both std and stderr streams. + with logging.patch_stdout_handler(StringIO()) as std_out: + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=[SimpleLogger(step_freq=1, tensors_to_log=['y_pred'])], + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + output_lines = std_out.getvalue().splitlines() + assert len(output_lines) == 4 + for line in output_lines: + assert "y_pred" in line + + @pytest.mark.unit + def test_TensorboardLogger(self, clean_up, create_tensorboard_file): + data_source = RealFunctionDataLayer(n=100, batch_size=1) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + tb_logger = TensorboardLogger(create_tensorboard_file, step_freq=1) + callbacks = [tb_logger] + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + # efi.inspect("temp", tag="loss") + inspection_units = efi.get_inspection_units("temp", "", "loss") + + # Make sure there is only 1 tensorboard file + assert len(inspection_units) == 1 + + # Assert that there the loss scalars has been logged 4 times + assert len(inspection_units[0].field_to_obs['scalars']) == 4 + + @pytest.mark.unit + def test_epoch_decorators(self, clean_up): + data_source = RealFunctionDataLayer(n=24, batch_size=12) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. 
+ x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + epoch_start_counter = [0] + epoch_end_counter = [0] + @on_epoch_start + def count_epoch_starts(state, counter=epoch_start_counter): + counter[0] += 1 + + @on_epoch_end + def count_epoch_ends(state, counter=epoch_end_counter): + counter[0] -= 1 + + callbacks = [count_epoch_starts, count_epoch_ends] + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + assert epoch_start_counter[0] == 2 + assert epoch_end_counter[0] == -2 + + @pytest.mark.unit + def test_step_batch_decorators(self, clean_up): + """Showcase the difference between step and batch""" + data_source = RealFunctionDataLayer(n=24, batch_size=12) + trainable_module = TaylorNet(dim=4) + loss = MSELoss() + + # Create the graph by connnecting the modules. + x, y = data_source() + y_pred = trainable_module(x=x) + loss_tensor = loss(predictions=y_pred, target=y) + + epoch_step_counter = [0] + epoch_batch_counter = [0] + @on_step_end + def count_steps(state, counter=epoch_step_counter): + counter[0] += 1 + + @on_batch_end + def count_batches(state, counter=epoch_batch_counter): + counter[0] += 1 + + callbacks = [count_steps, count_batches] + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd" + ) + + # when grad accumlation steps (aka iter_per_step or batches_per_step) = 1, num_steps == num_batches + assert epoch_step_counter[0] == 4 + assert epoch_batch_counter[0] == 4 + + epoch_step_counter[0] = 0 + epoch_batch_counter[0] = 0 + + self.nf.train( + tensors_to_optimize=[loss_tensor], + callbacks=callbacks, + optimization_params={"max_steps": 4, "lr": 0.01}, + optimizer="sgd", + reset=True, + batches_per_step=2 + ) + + # when grad accumlation steps != 1, num_steps != num_batches + assert epoch_step_counter[0] == 4 + assert epoch_batch_counter[0] == 8 From b976ec0a2b1839052534341d83a743603b3241be Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 22 May 2020 15:12:10 -0700 Subject: [PATCH 31/40] style Signed-off-by: Jason --- tests/unit/core/test_nemo_callbacks.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py index 2242ece9775e..3152b6fd4ffc 100755 --- a/tests/unit/core/test_nemo_callbacks.py +++ b/tests/unit/core/test_nemo_callbacks.py @@ -28,8 +28,9 @@ from nemo.core.callbacks import * from nemo.utils import logging + @pytest.mark.usefixtures("neural_factory") -class TestNeMoCallbacks(): +class TestNeMoCallbacks: @pytest.fixture() def clean_up(self): yield @@ -59,7 +60,7 @@ def test_SimpleLogger(self, clean_up): tensors_to_optimize=[loss_tensor], callbacks=[SimpleLogger(step_freq=1)], optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) output_lines = std_out.getvalue().splitlines() @@ -86,7 +87,7 @@ def test_rename_and_log(self, clean_up): tensors_to_optimize=[loss_tensor], callbacks=[SimpleLogger(step_freq=1, tensors_to_log=['y_pred'])], optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) output_lines = std_out.getvalue().splitlines() @@ -112,7 +113,7 @@ def test_TensorboardLogger(self, clean_up, create_tensorboard_file): tensors_to_optimize=[loss_tensor], callbacks=callbacks, optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + 
optimizer="sgd", ) # efi.inspect("temp", tag="loss") @@ -137,6 +138,7 @@ def test_epoch_decorators(self, clean_up): epoch_start_counter = [0] epoch_end_counter = [0] + @on_epoch_start def count_epoch_starts(state, counter=epoch_start_counter): counter[0] += 1 @@ -151,7 +153,7 @@ def count_epoch_ends(state, counter=epoch_end_counter): tensors_to_optimize=[loss_tensor], callbacks=callbacks, optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) assert epoch_start_counter[0] == 2 @@ -171,6 +173,7 @@ def test_step_batch_decorators(self, clean_up): epoch_step_counter = [0] epoch_batch_counter = [0] + @on_step_end def count_steps(state, counter=epoch_step_counter): counter[0] += 1 @@ -185,7 +188,7 @@ def count_batches(state, counter=epoch_batch_counter): tensors_to_optimize=[loss_tensor], callbacks=callbacks, optimization_params={"max_steps": 4, "lr": 0.01}, - optimizer="sgd" + optimizer="sgd", ) # when grad accumlation steps (aka iter_per_step or batches_per_step) = 1, num_steps == num_batches @@ -201,7 +204,7 @@ def count_batches(state, counter=epoch_batch_counter): optimization_params={"max_steps": 4, "lr": 0.01}, optimizer="sgd", reset=True, - batches_per_step=2 + batches_per_step=2, ) # when grad accumlation steps != 1, num_steps != num_batches From 6ec04aa342cc9ee66dbf6d1673b60d6cb2d565a7 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 15:55:10 -0700 Subject: [PATCH 32/40] first commit of changes Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 20 +- nemo/core/actions.py | 299 +++++++++++++ nemo/core/neural_factory.py | 442 -------------------- nemo/core/neural_types/__init__.py | 1 - nemo/core/neural_types/nmtensor_registry.py | 7 +- tests/unit/core/test_nemo_callbacks.py | 19 +- 6 files changed, 313 insertions(+), 475 deletions(-) create mode 100755 nemo/core/actions.py diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 49848ebe0c42..7ae84d1893f3 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -21,7 +21,8 @@ from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback -from nemo.core.neural_factory import Actions, OperationMode, Optimization, TrainingState, topological_sort_from_leaves +from nemo.core.neural_factory import OperationMode, Optimization +from nemo.core.actions import Actions, TrainingState, topological_sort_from_leaves from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated @@ -1387,23 +1388,6 @@ def save_state_to(self, path): ) self.ddp_module_dict[key] = module - # # Convert batchnorm modules to synced if applicable - # if synced_batchnorm and isinstance(pmodule, torch.nn.Module): - # world_size = dist.get_world_size() - # if synced_batchnorm_groupsize > 0 and world_size % synced_batchnorm_groupsize != 0: - # raise ValueError( - # f"Synchronized batch norm group size" - # f" ({synced_batchnorm_groupsize}) must be 0" - # f" or divide total number of GPUs" - # f" ({world_size})." 
- # ) - # process_group = create_syncbn_process_group(synced_batchnorm_groupsize) - # pmodule = convert_syncbn(pmodule, process_group=process_group) - - # self.module_reference_table[key] = ( - # self.module_reference_table[key][0], - # pmodule, - # ) # single GPU/CPU training else: if t_dataset is not None: diff --git a/nemo/core/actions.py b/nemo/core/actions.py new file mode 100755 index 000000000000..6a988b265e06 --- /dev/null +++ b/nemo/core/actions.py @@ -0,0 +1,299 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import List, Optional, Union + + +from nemo.core.neural_types import NmTensor +from nemo.utils.app_state import AppState +from nemo.core.neural_modules import ModuleType +from nemo.core.neural_factory import Optimization + + +def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: 'TrainingState' = None): + """A function that accepts a list of NmTensors that need to be computed and constructs a callchain DAG that starts + from a datalayerNM and can be used to compute the NmTensors. + + args: + leaf_nmtensors (List[NmTensors]): The tensors to be computed + cached_training_state (TrainingState): A dictionary of already computed tensors. + Defaults to None meaning an empty cache. 
+ + returns: + top_sorted_modules: the callchain DAG + """ + + def create_node(producer, producer_args): + if producer_args is None: + return tuple((producer, ())) + return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) + + def is_in_degree_zero(node, processed_nodes, cached_training_state): + """A node has in degree of zero""" + if node[1] == (): + return True + for _, nmtensor in node[1]: + node = create_node(nmtensor.producer, nmtensor.producer_args) + if node not in processed_nodes: + if cached_training_state and cached_training_state.check_tensor_cached(nmtensor.unique_name): + continue + return False + return True + + hooks = leaf_nmtensors if isinstance(leaf_nmtensors, list) else [leaf_nmtensors] + + # ensures that no tensors are processed twice + processed_nmtensors = set() + + indices_to_remove = [] + # Check for duplicates in hook + for i, nmtensor in enumerate(hooks): + if nmtensor in processed_nmtensors: + indices_to_remove.append(i) + else: + processed_nmtensors.add(nmtensor) + + for i in reversed(indices_to_remove): + hooks.pop(i) + + _top_sorted_modules = [] + all_nodes = {} + + # extract all nodes to all_nodes set + hooks_lst = list(hooks) + while len(hooks_lst) > 0: + # take nmtensor from the end of the list + nmtensor = hooks_lst.pop() + producer_args = nmtensor.producer_args + + node = create_node(nmtensor.producer, producer_args) + # Store nmtensor as an output of its producer + # first make sure all keys are present per output port + # and nm is inside all_nodes + if node not in all_nodes: + all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} + # second, populate output port with current nmtensor + # where applicable + all_nodes[node][nmtensor.name] = nmtensor + processed_nmtensors.add(nmtensor) + + new_tensors = set() + if producer_args is not None and producer_args != {}: + for _, new_nmtensor in producer_args.items(): + if new_nmtensor not in processed_nmtensors: + new_tensors.add(new_nmtensor) + + if cached_training_state: + for _, input_nmtensor in producer_args.items(): + if cached_training_state.check_tensor_cached(input_nmtensor.unique_name): + new_tensors.remove(input_nmtensor) + + for new_nmtensor in new_tensors: + # put in the start of list + hooks_lst.insert(0, new_nmtensor) + + all_node_with_output = [] + # Iterate over all_nodes to create new nodes that include its output + # now all nodes have (module, input tensors, output tensors) + for node in all_nodes: + all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) + + processed_nodes = [] + while len(all_node_with_output) > 0: + for node in all_node_with_output.copy(): + # if node's in_degree is zero it can be added to + # _top_sorted_modules + # this will also reduce in_degree of its children + if is_in_degree_zero(node, processed_nodes, cached_training_state): + _top_sorted_modules.append(node) + processed_nodes.append((node[0], node[1])) + all_node_with_output.remove(node) + + # Create top_sorted_modules aka callchain + top_sorted_modules = [] + for i, mod in enumerate(_top_sorted_modules): + top_sorted_modules.append((mod[0], dict(mod[1]), mod[2])) + # Ensure that there is only one dataset in callchain + if i > 0 and mod[0].type == ModuleType.datalayer: + raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") + + if cached_training_state and mod[0].type == ModuleType.datalayer: + raise ValueError("Could not compute tensor from current cached training state.") + + return top_sorted_modules + + +class TrainingState: + 
def __init__(self, action: 'Actions'): + """A class used to wrap the current training state of an Actions.train() function. This class holds a mapping + of tensor.unique_name -> it's backend tensor (eg Pytorch Tensor) or None if the tensor has been been computed + on the current step. + + args: + action (Actions): The Actions object this state is associated with. + """ + tensor_naming_registery = AppState().tensor_names + self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) + self._action = action + + def tensor_list(self): + """Returns a list the unique names of all tensors. + """ + return self.tensor_dict.keys() + + def clear_dict(self): + """Clears the dictionary by setting all values to None. Used in-between training batches to clear it's state. + """ + for name in self.tensor_dict: + self.tensor_dict[name] = None + + def set_tensor(self, tensor: NmTensor, value: 'torch.Tensor'): + """Sets the value of tensor + + args: + tensor (NmTensor) + value (torch.Tensor) + """ + self.tensor_dict[tensor.unique_name] = value + + def check_tensor_cached(self, unique_name: str): + """Checks to see the tensor value has been computed in the current step yet. + + args: + unique_name (str): The NmTensor.unique_name that we want to check for. + + returns: + (bool) whether the tensor with unique_name has been computed yet. + """ + if self.tensor_dict[unique_name] is None: + return False + return True + + def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): + """Returns the value associated with a tensor. And optionally, computes the value of the tensor if not already + set. + + args: + name (str, NmTensor): The user-defined name for a tensor or the NmTensor itself. + compute (bool): If True and the tensor has not already been computed, there will be an attempt to create a + call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return + None if the tensor has not been computed yet. + Defaults to True. + + returns: + (torch.tensor or None) representing the computed value of the requested name. Returns None if compute is + False and the tensor has not been computed yet. + """ + if isinstance(name, NmTensor): + unique_name = name.unique_name + else: + unique_name = AppState().tensor_names[name] + tensor_value = self.tensor_dict[unique_name] + if tensor_value is None and compute: + nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] + callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) + callchain.insert(0, ()) + self._action.nm_graph_forward_pass(callchain, self.tensor_dict) + tensor_value = self.tensor_dict[unique_name] + return tensor_value + + +class Actions(ABC): + """Basic actions allowed on graphs of Neural Modules""" + + def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxprO0): + self._local_rank = local_rank + self._global_rank = global_rank + self._optim_level = optimization_level + + @property + def local_rank(self): + """Local rank during distributed execution. None if single GPU/CPU + + Returns: + (int) rank or worker or None if not in distributed model + """ + return self._local_rank + + @property + def global_rank(self): + """Global rank during distributed execution. 
None if single GPU/CPU + + Returns: + (int) rank or worker or None if not in distributed model + """ + return self._global_rank + + @abstractmethod + def train( + self, + tensors_to_optimize: List[NmTensor], + callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], + lr_policy=None, + batches_per_step=None, + stop_on_nan_loss=False, + ): + """This action executes training and (optionally) evaluation. + + Args: + tensors_to_optimize: which tensors to optimize. Typically this is + single loss tesnor. + callbacks: list of callback objects + lr_policy: function which should take (initial_lr, step, epoch) and + return learning rate + batches_per_step: number of mini-batches to process before one + optimizer step. (default: None, same as 1). Use this + to simulate larger batch sizes on hardware which could not fit + larger batch in memory otherwise. Effectively, this will make + "algorithmic" batch size per GPU/worker = batches_per_step* + batch_size + stop_on_nan_loss: (default: False) If set to True, the training + will stop if loss=nan or inf. If set to False, the training + will continue. + + Returns: + None + """ + pass + + @abstractmethod + def infer(self, tensors: List[NmTensor]): + """This action executes inference. Nothing is optimized. + Args: + tensors: which tensors to evaluate. + + Returns: + None + """ + pass + + @abstractmethod + def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): + """ + Creates an optimizer object to be use in the train() method. + + Args: + optimizer: Specifies which optimizer to use. + things_to_optimize: A list of neural modules or tensors to be + optimized. + optimizer_params: Specifies the parameters of the optimizer + + Returns: + Optimizer + """ + pass diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 8dba04e4acd3..87de6e7ac3ac 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -20,13 +20,11 @@ 'OperationMode', 'Optimization', 'DeviceType', - 'Actions', 'NeuralModuleFactory', 'DeploymentFormat', ] import random -from abc import ABC, abstractmethod from enum import Enum from typing import List, Optional, Union @@ -36,202 +34,9 @@ from nemo.core.callbacks import ActionCallback, EvaluatorCallback, NeMoCallback from nemo.core.neural_types import NmTensor from nemo.utils import ExpManager, logging -from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated -class TrainingState: - def __init__(self, action: 'Actions'): - """A class used to wrap the current training state of an Actions.train() function. This class holds a mapping - of tensor.unique_name -> it's backend tensor (eg Pytorch Tensor) or None if the tensor has been been computed - on the current step. - - args: - action (Actions): The Actions object this state is associated with. - """ - tensor_naming_registery = AppState().tensor_names - self.tensor_dict = {}.fromkeys(tensor_naming_registery.unique_names, None) - self._action = action - - def tensor_list(self): - """Returns a list the unique names of all tensors. - """ - return self.tensor_dict.keys() - - def clear_dict(self): - """Clears the dictionary by setting all values to None. Used in-between training batches to clear it's state. 
- """ - for name in self.tensor_dict: - self.tensor_dict[name] = None - - def set_tensor(self, tensor: NmTensor, value: 'torch.Tensor'): - """Sets the value of tensor - - args: - tensor (NmTensor) - value (torch.Tensor) - """ - self.tensor_dict[tensor.unique_name] = value - - def check_tensor_cached(self, unique_name: str): - """Checks to see the tensor value has been computed in the current step yet. - - args: - unique_name (str): The NmTensor.unique_name that we want to check for. - - returns: - (bool) whether the tensor with unique_name has been computed yet. - """ - if self.tensor_dict[unique_name] is None: - return False - return True - - def get_tensor(self, name: Union[str, NmTensor], compute: bool = True): - """Returns the value associated with a tensor. And optionally, computes the value of the tensor if not already - set. - - args: - name (str, NmTensor): The user-defined name for a tensor or the NmTensor itself. - compute (bool): If True and the tensor has not already been computed, there will be an attempt to create a - call DAG and then do a forward pass on this call DAG to compute the tensor. If False, it will return - None if the tensor has not been computed yet. - Defaults to True. - - returns: - (torch.tensor or None) representing the computed value of the requested name. Returns None if compute is - False and the tensor has not been computed yet. - """ - if isinstance(name, NmTensor): - unique_name = name.unique_name - else: - unique_name = AppState().tensor_names[name] - tensor_value = self.tensor_dict[unique_name] - if tensor_value is None and compute: - nmtensor = AppState().tensor_names._nmtensor_uniname_dict[unique_name] - callchain = topological_sort_from_leaves([nmtensor], cached_training_state=self) - callchain.insert(0, ()) - self._action.nm_graph_forward_pass(callchain, self.tensor_dict) - tensor_value = self.tensor_dict[unique_name] - return tensor_value - - -def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: TrainingState = None): - """A function that accepts a list of NmTensors that need to be computed and constructs a callchain DAG that starts - from a datalayerNM and can be used to compute the NmTensors. - - args: - leaf_nmtensors (List[NmTensors]): The tensors to be computed - cached_training_state (TrainingState): A dictionary of already computed tensors. - Defaults to None meaning an empty cache. 
- - returns: - top_sorted_modules: the callchain DAG - """ - from nemo.backends.pytorch.nm import DataLayerNM # TODO: Replace this with a backend agnostic data layer - - def create_node(producer, producer_args): - if producer_args is None: - return tuple((producer, ())) - else: - return tuple((producer, tuple([(k, v) for k, v in producer_args.items()]),)) - - def is_in_degree_zero(node, processed_nodes, cached_training_state): - """A node has in degree of zero""" - if node[1] == (): - return True - for portname, nmtensor in node[1]: - nd = create_node(nmtensor.producer, nmtensor.producer_args) - if nd not in processed_nodes: - if cached_training_state and cached_training_state.check_tensor_cached(nmtensor.unique_name): - continue - return False - return True - - hooks = leaf_nmtensors if isinstance(leaf_nmtensors, list) else [leaf_nmtensors] - - # ensures that no tensors are processed twice - processed_nmtensors = set() - - indices_to_remove = [] - # Check for duplicates in hook - for i, nmtensor in enumerate(hooks): - if nmtensor in processed_nmtensors: - indices_to_remove.append(i) - else: - processed_nmtensors.add(nmtensor) - - for i in reversed(indices_to_remove): - hooks.pop(i) - - _top_sorted_modules = [] - all_nodes = {} - - # extract all nodes to all_nodes set - hooks_lst = list(hooks) - while len(hooks_lst) > 0: - # take nmtensor from the end of the list - nmtensor = hooks_lst.pop() - producer_args = nmtensor.producer_args - - node = create_node(nmtensor.producer, producer_args) - # Store nmtensor as an output of its producer - # first make sure all keys are present per output port - # and nm is inside all_nodes - if node not in all_nodes: - all_nodes[node] = {k: None for k in nmtensor.producer.output_ports} - # second, populate output port with current nmtensor - # where applicable - all_nodes[node][nmtensor.name] = nmtensor - processed_nmtensors.add(nmtensor) - - new_tensors = set() - if producer_args is not None and producer_args != {}: - for _, new_nmtensor in producer_args.items(): - if new_nmtensor not in processed_nmtensors: - new_tensors.add(new_nmtensor) - - # TODO - if cached_training_state: - for name, input_nmtensor in producer_args.items(): - if cached_training_state.check_tensor_cached(input_nmtensor.unique_name): - new_tensors.remove(input_nmtensor) - - for new_nmtensor in new_tensors: - # put in the start of list - hooks_lst.insert(0, new_nmtensor) - - all_node_with_output = [] - # Iterate over all_nodes to create new nodes that include its output - # now all nodes have (module, input tensors, output tensors) - for node in all_nodes: - all_node_with_output.append(tuple((node[0], node[1], all_nodes[node]))) - - processed_nodes = [] - while len(all_node_with_output) > 0: - for node in all_node_with_output.copy(): - # if node's in_degree is zero it can be added to - # _top_sorted_modules - # this will also reduce in_degree of its children - if is_in_degree_zero(node, processed_nodes, cached_training_state): - _top_sorted_modules.append(node) - processed_nodes.append((node[0], node[1])) - all_node_with_output.remove(node) - - # Create top_sorted_modules aka callchain - top_sorted_modules = [] - for i, m in enumerate(_top_sorted_modules): - top_sorted_modules.append((m[0], dict(m[1]), m[2])) - # Ensure that there is only one dataset in callchain - if i > 0 and isinstance(m[0], DataLayerNM): - raise ValueError("There were more than one DataLayer NeuralModule inside your DAG.") - - # TODO - if cached_training_state and isinstance(m[0], DataLayerNM): - raise 
ValueError("Could not compute tensor from current cached training state.") - - return top_sorted_modules - - class DeploymentFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" @@ -275,238 +80,6 @@ class DeviceType(Enum): AllGpu = 3 -class Actions(ABC): - """Basic actions allowed on graphs of Neural Modules""" - - def __init__(self, local_rank, global_rank, optimization_level=Optimization.mxprO0): - self._local_rank = local_rank - self._global_rank = global_rank - self._optim_level = optimization_level - - @property - def local_rank(self): - """Local rank during distributed execution. None if single GPU/CPU - - Returns: - (int) rank or worker or None if not in distributed model - """ - return self._local_rank - - @property - def global_rank(self): - """Global rank during distributed execution. None if single GPU/CPU - - Returns: - (int) rank or worker or None if not in distributed model - """ - return self._global_rank - - @abstractmethod - def train( - self, - tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], - lr_policy=None, - batches_per_step=None, - stop_on_nan_loss=False, - ): - """This action executes training and (optionally) evaluation. - - Args: - tensors_to_optimize: which tensors to optimize. Typically this is - single loss tesnor. - callbacks: list of callback objects - lr_policy: function which should take (initial_lr, step, epoch) and - return learning rate - batches_per_step: number of mini-batches to process before one - optimizer step. (default: None, same as 1). Use this - to simulate larger batch sizes on hardware which could not fit - larger batch in memory otherwise. Effectively, this will make - "algorithmic" batch size per GPU/worker = batches_per_step* - batch_size - stop_on_nan_loss: (default: False) If set to True, the training - will stop if loss=nan or inf. If set to False, the training - will continue. - - Returns: - None - """ - pass - - @abstractmethod - def infer(self, tensors: List[NmTensor]): - """This action executes inference. Nothing is optimized. - Args: - tensors: which tensors to evaluate. - - Returns: - None - """ - pass - - # @abstractmethod - # def save_state_to(self, path: str): - # """ - # Saves current state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # pass - - # @abstractmethod - # def restore_state_from(self, path: str): - # """ - # Restores state such as step, epoch and optimizer parameters - # Args: - # path: - - # Returns: - - # """ - # pass - - @abstractmethod - def create_optimizer(self, optimizer, things_to_optimize, optimizer_params): - """ - Creates an optimizer object to be use in the train() method. - - Args: - optimizer: Specifies which optimizer to use. - things_to_optimize: A list of neural modules or tensors to be - optimized. 
- optimizer_params: Specifies the parameters of the optimizer - - Returns: - Optimizer - """ - pass - - # def _perform_on_step_start(self, callbacks): - # # TODO: Most of these checks can be relaxed since we enforce callbacks - # # to be a list of ActionCallback objects - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_iteration_start() - # elif isinstance(callback, NeMoCallback): - # callback.on_step_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_step_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_iteration_end() - # elif isinstance(callback, NeMoCallback): - # callback.on_step_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_action_start(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_action_start() - # elif isinstance(callback, NeMoCallback): - # callback.on_train_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_action_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_action_end() - # elif isinstance(callback, NeMoCallback): - # callback.on_train_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_epoch_start(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_epoch_start() - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_epoch_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.on_epoch_end() - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_batch_start(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # continue - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_start(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _perform_on_batch_end(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if 
isinstance(callback, ActionCallback): - # continue - # elif isinstance(callback, NeMoCallback): - # callback.on_epoch_end(self.state) - # else: - # raise ValueError( - # "Callback was not a child of ActionCallback nor NeMoCallback and was not understood" - # ) - - # def _init_callbacks(self, callbacks): - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback.action = self - - # def _update_callbacks( - # self, callbacks=None, registered_tensors=None, final_loss=None, - # ): - # # if self.local_rank is None or self.local_rank == 0: - # if callbacks is not None and isinstance(callbacks, List) and len(callbacks) > 0: - # for callback in callbacks: - # if isinstance(callback, ActionCallback): - # callback._registered_tensors = registered_tensors - # else: # For now, we can use the old callback function. In the future we should improve this - # self.training_state.tensor_dict["loss"] = final_loss - - -def _str_to_opt_level(opt_str: str) -> Optimization: - number = int(opt_str[1:]) - if number not in Optimization._value2member_map_: - raise ValueError(f"Unknown optimization value {opt_str}") - return Optimization(number) - - class NeuralModuleFactory(object): _DEFAULT = None @@ -716,21 +289,6 @@ def get_module(self, name, collection, params, pretrained=False): NeuralModule instance """ - # TK: "optimization_level" is not passed as parameter anymore. - # if params is not None and "optimization_level" in params: - # if params["optimization_level"] != self._optim_level: - # logging.warning( - # "Module's {0} requested optimization level {1} is" - # "different from the one specified by factory - {2}." - # "Using: {3} for this module".format( - # name, params["optimization_level"], self._optim_level, params["optimization_level"], - # ) - # ) - # else: - # if params is None: - # params = {} - # params["optimization_level"] = self._optim_level - if self._backend == Backend.PyTorch: return self.__get_pytorch_module(name=name, collection=collection, params=params, pretrained=pretrained,) else: diff --git a/nemo/core/neural_types/__init__.py b/nemo/core/neural_types/__init__.py index 0ae947d90137..1fb5bf349076 100644 --- a/nemo/core/neural_types/__init__.py +++ b/nemo/core/neural_types/__init__.py @@ -19,4 +19,3 @@ from nemo.core.neural_types.comparison import * from nemo.core.neural_types.elements import * from nemo.core.neural_types.neural_type import * -from nemo.core.neural_types.nmtensor_registry import NmTensorNameRegistry diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/core/neural_types/nmtensor_registry.py index f1d9591039a4..c8188c65af7f 100755 --- a/nemo/core/neural_types/nmtensor_registry.py +++ b/nemo/core/neural_types/nmtensor_registry.py @@ -25,13 +25,14 @@ def __init__(self): # Create the nmtensor_naming_dict # which contains a mapping of str to NMTensor.unique_name self._nmtensor_naming_dict = {"loss": "loss"} # Reserve keyname of 'loss' - self._nmtensor_uniname_dict = {"loss": None} + # Create a set object to track all unique_names + self._nmtensor_uniname_dict = set(["loss"]) @property def unique_names(self): """Returns the set of all NmTensors.unique_names + 'loss' """ - return self._nmtensor_uniname_dict.keys() + return list(self._nmtensor_uniname_dict) def register(self, tensor: 'NmTensor'): """Helper function to register a newly created NmTensor by adding it to self.__nmtensor_uniname_dict. 
@@ -46,7 +47,7 @@ def register(self, tensor: 'NmTensor'): pass # Finally, add object to the set. - self._nmtensor_uniname_dict[tensor.unique_name] = tensor + self._nmtensor_uniname_dict.add(tensor.unique_name) def rename_NmTensor(self, tensor: 'NmTensor', new_name: str): """Helper function that changes the naming dictionary to facilitate user name -> tensor.unique_name lookup. diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py index 3152b6fd4ffc..21e1671eed19 100755 --- a/tests/unit/core/test_nemo_callbacks.py +++ b/tests/unit/core/test_nemo_callbacks.py @@ -1,7 +1,7 @@ # ! /usr/bin/python # -*- coding: utf-8 -*- -# Copyright 2019 NVIDIA. All Rights Reserved. +# Copyright 2020 NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,13 +36,6 @@ def clean_up(self): yield self.nf.reset_trainer() - @pytest.fixture() - def create_tensorboard_file(self): - os.makedirs("temp") - summary_writter = SummaryWriter("temp") - yield summary_writter - shutil.rmtree("temp") - @pytest.mark.unit def test_SimpleLogger(self, clean_up): data_source = RealFunctionDataLayer(n=100, batch_size=1) @@ -96,7 +89,7 @@ def test_rename_and_log(self, clean_up): assert "y_pred" in line @pytest.mark.unit - def test_TensorboardLogger(self, clean_up, create_tensorboard_file): + def test_TensorboardLogger(self, clean_up, tmpdir): data_source = RealFunctionDataLayer(n=100, batch_size=1) trainable_module = TaylorNet(dim=4) loss = MSELoss() @@ -106,7 +99,11 @@ def test_TensorboardLogger(self, clean_up, create_tensorboard_file): y_pred = trainable_module(x=x) loss_tensor = loss(predictions=y_pred, target=y) - tb_logger = TensorboardLogger(create_tensorboard_file, step_freq=1) + logging_dir = tmpdir.mkdir("temp") + + writer = SummaryWriter(logging_dir) + + tb_logger = TensorboardLogger(writer, step_freq=1) callbacks = [tb_logger] self.nf.train( @@ -117,7 +114,7 @@ def test_TensorboardLogger(self, clean_up, create_tensorboard_file): ) # efi.inspect("temp", tag="loss") - inspection_units = efi.get_inspection_units("temp", "", "loss") + inspection_units = efi.get_inspection_units(logging_dir, "", "loss") # Make sure there is only 1 tensorboard file assert len(inspection_units) == 1 From 7009bee78377bb06521f141265a29fc263ec2aa0 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 16:23:37 -0700 Subject: [PATCH 33/40] some fixes Signed-off-by: Jason --- nemo/core/actions.py | 2 +- nemo/utils/app_state.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/core/actions.py b/nemo/core/actions.py index 6a988b265e06..686ad1b0c478 100755 --- a/nemo/core/actions.py +++ b/nemo/core/actions.py @@ -243,7 +243,7 @@ def global_rank(self): def train( self, tensors_to_optimize: List[NmTensor], - callbacks: Optional[List[Union[ActionCallback, NeMoCallback]]], + callbacks: Optional[List[Union['ActionCallback', 'NeMoCallback']]], lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 8bbf120c0f60..32c46767e5b2 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -22,6 +22,7 @@ from nemo.utils.metaclasses import Singleton from nemo.utils.neural_graph.neural_graph_manager import NeuralGraphManager from nemo.utils.neural_graph.object_registry import ObjectRegistry +from nemo.core.neural_types import NmTensorNameRegistry class AppState(metaclass=Singleton): @@ -48,7 +49,7 @@ def 
__init__(self, device=None): # Create graph manager (registry with some additional functionality). self._neural_graph_manager = NeuralGraphManager() # Create NmTensor registry - self._nmtensor_name_registry = nemo.core.neural_types.NmTensorNameRegistry() + self._nmtensor_name_registry = NmTensorNameRegistry() @property def tensor_names(self): From 9f4566bd4f56620f69a8731b88fef7ad972f608b Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 16:27:55 -0700 Subject: [PATCH 34/40] style Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 2 +- nemo/core/actions.py | 5 ++--- nemo/utils/app_state.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 7ae84d1893f3..d3e3261d5e55 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -20,9 +20,9 @@ from nemo.backends.pytorch.nm import DataLayerNM, TrainableNM from nemo.backends.pytorch.optimizers import AdamW, Novograd, master_params from nemo.core import DeploymentFormat, DeviceType, NeuralModule, NmTensor +from nemo.core.actions import Actions, TrainingState, topological_sort_from_leaves from nemo.core.callbacks import ActionCallback, NeMoCallback, SimpleLossLoggerCallback from nemo.core.neural_factory import OperationMode, Optimization -from nemo.core.actions import Actions, TrainingState, topological_sort_from_leaves from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils.app_state import AppState from nemo.utils.decorators import deprecated diff --git a/nemo/core/actions.py b/nemo/core/actions.py index 686ad1b0c478..ad0757e04b39 100755 --- a/nemo/core/actions.py +++ b/nemo/core/actions.py @@ -18,11 +18,10 @@ from abc import ABC, abstractmethod from typing import List, Optional, Union - +from nemo.core.neural_factory import Optimization +from nemo.core.neural_modules import ModuleType from nemo.core.neural_types import NmTensor from nemo.utils.app_state import AppState -from nemo.core.neural_modules import ModuleType -from nemo.core.neural_factory import Optimization def topological_sort_from_leaves(leaf_nmtensors: List[NmTensor], cached_training_state: 'TrainingState' = None): diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 32c46767e5b2..22ffdf8fce2a 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -19,10 +19,10 @@ # Moreover, at that point nemo module doesn't contain "core", so during "python module registration" # nothing from nemo.core, including e.g. types (so we cannot use them for "python 3 type hints"). 
import nemo +from nemo.core.neural_types import NmTensorNameRegistry from nemo.utils.metaclasses import Singleton from nemo.utils.neural_graph.neural_graph_manager import NeuralGraphManager from nemo.utils.neural_graph.object_registry import ObjectRegistry -from nemo.core.neural_types import NmTensorNameRegistry class AppState(metaclass=Singleton): From 307f550414cd29f4ba366bb89dca4e676d024d16 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 17:14:48 -0700 Subject: [PATCH 35/40] move nmtensor_registry Signed-off-by: Jason --- nemo/utils/__init__.py | 4 ++-- nemo/utils/app_state.py | 2 +- nemo/{core/neural_types => utils}/nmtensor_registry.py | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename nemo/{core/neural_types => utils}/nmtensor_registry.py (100%) diff --git a/nemo/utils/__init__.py b/nemo/utils/__init__.py index b9058a854c3c..15872561c92a 100644 --- a/nemo/utils/__init__.py +++ b/nemo/utils/__init__.py @@ -15,8 +15,8 @@ # limitations under the License. # ============================================================================= -from .nemo_logging import Logger as _Logger -from .nemo_logging import LogMode as logging_mode +from nemo.utils.nemo_logging import Logger as _Logger +from nemo.utils.nemo_logging import LogMode as logging_mode logging = _Logger() diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 22ffdf8fce2a..45c134ee9995 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -19,10 +19,10 @@ # Moreover, at that point nemo module doesn't contain "core", so during "python module registration" # nothing from nemo.core, including e.g. types (so we cannot use them for "python 3 type hints"). import nemo -from nemo.core.neural_types import NmTensorNameRegistry from nemo.utils.metaclasses import Singleton from nemo.utils.neural_graph.neural_graph_manager import NeuralGraphManager from nemo.utils.neural_graph.object_registry import ObjectRegistry +from nemo.utils.nmtensor_registry import NmTensorNameRegistry class AppState(metaclass=Singleton): diff --git a/nemo/core/neural_types/nmtensor_registry.py b/nemo/utils/nmtensor_registry.py similarity index 100% rename from nemo/core/neural_types/nmtensor_registry.py rename to nemo/utils/nmtensor_registry.py From 31fc556ddd14b721a9918c397ec36d0fcf6817ac Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 17:18:49 -0700 Subject: [PATCH 36/40] update tests Signed-off-by: Jason --- tests/unit/core/test_nemo_callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/test_nemo_callbacks.py b/tests/unit/core/test_nemo_callbacks.py index 21e1671eed19..a2e0bae39f03 100755 --- a/tests/unit/core/test_nemo_callbacks.py +++ b/tests/unit/core/test_nemo_callbacks.py @@ -114,7 +114,7 @@ def test_TensorboardLogger(self, clean_up, tmpdir): ) # efi.inspect("temp", tag="loss") - inspection_units = efi.get_inspection_units(logging_dir, "", "loss") + inspection_units = efi.get_inspection_units(str(logging_dir), "", "loss") # Make sure there is only 1 tensorboard file assert len(inspection_units) == 1 From b9e4441524ca7d8affaf4967a6b6a190bdbdd271 Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 27 May 2020 17:27:57 -0700 Subject: [PATCH 37/40] clean code for comments Signed-off-by: Jason --- examples/asr/jasper_an4.py | 1 - nemo/backends/pytorch/actions.py | 66 ++------------------------------ nemo/core/callbacks.py | 2 - nemo/utils/nemo_logging.py | 4 -- 4 files changed, 4 insertions(+), 69 deletions(-) diff --git a/examples/asr/jasper_an4.py 
b/examples/asr/jasper_an4.py index 40172008c9da..888d046ef936 100644 --- a/examples/asr/jasper_an4.py +++ b/examples/asr/jasper_an4.py @@ -238,7 +238,6 @@ def main(): # Delete old graph and make a new one del g0 nf.reset_trainer() - # [print(p) for p in nemo.utils.app_state.AppState().modules] loss, eval_tensors, callbacks, total_steps, _, _, new_g = create_dags(args.model_config, vocab, args, nf) nf.train( diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index d3e3261d5e55..95d3a9d1736b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -30,8 +30,6 @@ # these imports will happen on as-needed basis amp = None -# convert_syncbn = None -# create_syncbn_process_group = None LARC = None FusedLAMB = None FusedAdam = None @@ -63,16 +61,12 @@ def __init__( global amp amp = importlib.import_module('apex.amp') if local_rank is not None: - # global convert_syncbn - # global create_syncbn_process_group global LARC global FusedLAMB global FusedAdam global FusedNovoGrad parallel = importlib.import_module('apex.parallel') apex_optimizer = importlib.import_module('apex.optimizers') - # convert_syncbn = parallel.convert_syncbn_model - # create_syncbn_process_group = parallel.create_syncbn_process_group LARC = parallel.LARC FusedLAMB = apex_optimizer.FusedLAMB FusedAdam = apex_optimizer.FusedAdam @@ -150,12 +144,6 @@ def __get_top_sorted_modules_and_dataloader(self, hook: List[NmTensor]): "distributed mode. Please instantiate NeuralModuleFactory first and pass its instance as " "`factory` parameter to all your Neural Module objects.".format(str(m[0])) ) - # key = m[0].unique_instance_id - # if key not in self.module_reference_table: - # if isinstance(m[0], TrainableNeuralModuleWrapper): - # self.module_reference_table[key] = (m[0], m[0]._pt_module) - # else: - # self.module_reference_table[key] = (m[0], m[0]) return top_sorted_modules, tdataset @@ -349,18 +337,9 @@ def __nm_graph_forward_pass( if in_cache: continue call_args = call_chain[ind][1] - # module = call_chain[ind][0] - # pmodule = self.module_reference_table[m_id][1] m_id = call_chain[ind][0].unique_instance_id pmodule = self.ddp_module_dict[m_id] if self.ddp_initialized else call_chain[ind][0] - # if self._local_rank is not None: - # if isinstance(pmodule, DDP): - # if disable_allreduce: - # pmodule.disable_allreduce() - # else: - # pmodule.enable_allreduce() - if mode == OperationMode.training: # if module.is_trainable(): if isinstance(pmodule, nn.Module): @@ -374,14 +353,8 @@ def __nm_graph_forward_pass( # prepare call signature for `module` call_set = {} for tensor_name, nmtensor in call_args.items(): - # _add_uuid_2_name(nmtensor.name, nmtensor.producer._uuid) key = nmtensor.unique_name call_set[tensor_name] = registered_tensors[key] - # actual PyTorch module call with signature - # if isinstance(self.module_reference_table[m_id][0], TrainableNeuralModuleWrapper,): - # new_tensors = pmodule(**call_set) - # else: - # new_tensors = pmodule(force_pt=True, **call_set) new_tensors = pmodule(force_pt=True, **call_set) if not isinstance(new_tensors, List): @@ -462,11 +435,6 @@ def _eval(self, tensors_2_evaluate, callback, step, verbose=False): assert dist.is_initialized() is_distributed = True world_size = torch.distributed.get_world_size() - # logging.info( - # "Doing distributed evaluation. 
Rank {0} of {1}".format( - # self.local_rank, world_size - # ) - # ) if dl_nm.dataset is not None: sampler = None @@ -638,11 +606,6 @@ def _infer( assert dist.is_initialized() is_distributed = True world_size = torch.distributed.get_world_size() - # logging.info( - # "Doing distributed evaluation. Rank {0} of {1}".format( - # self.local_rank, world_size - # ) - # ) if dl_nm.dataset is not None: sampler = None if not isinstance(dl_nm.dataset, torch.utils.data.IterableDataset): @@ -729,12 +692,6 @@ def _infer( use_cache=use_cache, ) - # if offload_to_cpu: - # # Take all cuda tensors and save them to value_dict as - # # cpu tensors to save GPU memory - # for name, tensor in registered_e_tensors.items(): - # if isinstance(tensor, torch.Tensor): - # registered_e_tensors[name] = tensor.cpu() if cache: self.append_to_cache(registered_e_tensors, offload_to_cpu) @@ -913,10 +870,10 @@ def __extract_dynamic_axes(port_name: str, ntype: NeuralType, dynamic_axes: defa module.eval() try: - # # Remove NeMo-related things from the module - # # We need to change __call__ method. Note that this will change the - # # whole class, not just this object! Which is why we need to repair it - # # in the finally block + # Remove NeMo-related things from the module + # We need to change __call__ method. Note that this will change the + # whole class, not just this object! Which is why we need to repair it + # in the finally block __orig_call__ = type(module).__call__ type(module).__call__ = torch.nn.Module.__call__ @@ -1313,10 +1270,6 @@ def save_state_to(self, path): dataNM = training_loop[0][2][0][0] placement_gpu = dataNM.placement == DeviceType.AllGpu if placement_gpu: - # if len(training_loop) > 1: - # raise NotImplementedError( - # "Distributed training does nor work with multiple " - # "optimizers") logging.info("Doing distributed training") if t_dataset is not None: train_sampler = None @@ -1341,12 +1294,6 @@ def save_state_to(self, path): else: train_sampler = None - # for train_iter in training_loop: - # call_chain = train_iter[2] - # for i in range(1, len(call_chain) - 1): - # key = call_chain[i][0].unique_instance_id - # pmodule = self.module_reference_table[key][1] - # num_trainable_weights = self.module_reference_table[key][1].num_weights self.ddp_initialized = True module_list = [mod.name for mod in AppState().modules] module_list = sorted(module_list) @@ -1356,11 +1303,6 @@ def save_state_to(self, path): num_trainable_weights = module.num_weights self.ddp_module_dict[key] = module if not isinstance(module, DDP) and isinstance(module, torch.nn.Module) and num_trainable_weights > 0: - # gpf = 1 - # if gradient_predivide: - # gpf = dist.get_world_size() - # pmodule = DDP(pmodule, gradient_predivide_factor=gpf) # Old Apex Method - # Per pytorch docs, convert sync bn prior to DDP if synced_batchnorm: world_size = dist.get_world_size() diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index 4b9826e9b6c1..d79eb23536db 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -469,7 +469,6 @@ def __restore_from(self, path, state): try: trainer_checkpoints = get_checkpoint_from_dir(["trainer"], path) state.restore_state_from(trainer_checkpoints[0]) - # for tr, checkpoint in zip([self.action], trainer_checkpoints): except (ValueError) as e: logging.warning(e) logging.warning( @@ -891,7 +890,6 @@ def on_iteration_start(self): setattr(self.module, self.arg_name, value) if self.tb_writer is not None: class_name = self.module.__class__.__name__ - # name = f'param/{class_name}.{self.arg_name}' 
name = f"param/{class_name}.{self.arg_name}" self.tb_writer.add_scalar(name, value, self.step) else: diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index ee5cb0f6ee4d..7fed7ff0c5c3 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -366,7 +366,3 @@ def critical(self, msg, *args, mode=LogMode.EACH, **kwargs): and not self._logged_once(msg, mode) ): self._logger._log(Logger.CRITICAL, msg, args, **kwargs) - - -# # Necessary to catch the correct caller -# _logging._srcfile = os.path.normcase(inspect.getfile(Logger.__class__)) From c036084e24bc6504b841707785b32aa022f33367 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 28 May 2020 11:40:33 -0700 Subject: [PATCH 38/40] add back str_to_opt_level Signed-off-by: Jason --- nemo/core/neural_factory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index 87de6e7ac3ac..40cfee69f838 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -80,6 +80,13 @@ class DeviceType(Enum): AllGpu = 3 +def _str_to_opt_level(opt_str: str) -> Optimization: + number = int(opt_str[1:]) + if number not in Optimization._value2member_map_: + raise ValueError(f"Unknown optimization value {opt_str}") + return Optimization(number) + + class NeuralModuleFactory(object): _DEFAULT = None From 1e429afcd934894134855249bef480de1eb555af Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 28 May 2020 11:52:17 -0700 Subject: [PATCH 39/40] split callbacks into two files; update error messages Signed-off-by: Jason --- nemo/backends/pytorch/actions.py | 4 +- nemo/core/callbacks.py | 483 +------------------------------ 2 files changed, 13 insertions(+), 474 deletions(-) diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index 95d3a9d1736b..76323fa8521b 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1236,7 +1236,9 @@ def save_state_to(self, path): if callbacks is not None: for callback in callbacks: if not isinstance(callback, ActionCallback) and not isinstance(callback, NeMoCallback): - raise ValueError("A callback was received that was not a child of ActionCallback") + raise ValueError( + "A callback was received that was not a child of ActionCallback nor a child of NeMoCallback" + ) elif isinstance(callback, SimpleLossLoggerCallback): if logging_callchain: raise ValueError("We only support one logger callback but more than one were found") diff --git a/nemo/core/callbacks.py b/nemo/core/callbacks.py index d79eb23536db..d667b4130529 100644 --- a/nemo/core/callbacks.py +++ b/nemo/core/callbacks.py @@ -31,20 +31,23 @@ # "on_step_end", # ] -import datetime import glob import os -import sys import time -import warnings -from abc import ABC, abstractmethod -from collections import namedtuple +from abc import ABC from typing import Callable, List, Union -import nemo +from nemo.core.deprecated_callbacks import ( + ActionCallback, + EvaluatorCallback, + ModuleSaverCallback, + SimpleLossLoggerCallback, + UnfreezeCallback, + ValueSetterCallback, + WandbCallback, +) from nemo.utils import get_checkpoint_from_dir, logging from nemo.utils.app_state import AppState -from nemo.utils.decorators import deprecated try: import wandb @@ -508,469 +511,3 @@ def on_epoch_end(self, state): epoch = state["epoch"] if self._epoch_freq > 0 and epoch % self._epoch_freq == 0 and epoch > 0: self.__save_to(self._folder, state) - - -class ActionCallback(ABC): - """Abstract interface for callbacks. 
- """ - - def __init__(self): - self._registered_tensors = {} - self._action = None - - @property - def step(self): - return self.action.step - - @property - def epoch_num(self): - return self.action.epoch_num - - @property - def registered_tensors(self): - return self._registered_tensors - - @property - def local_rank(self): - return self.action.local_rank - - @property - def global_rank(self): - return self.action.global_rank - - @property - def action(self): - return self._action - - @action.setter - def action(self, action_obj): - self._action = action_obj - - @property - def logger(self): - warnings.warn("This will be deprecated in future releases. Please use nemo.logging instead") - return nemo.logging - - def on_action_start(self): - pass - - def on_action_end(self): - pass - - def on_epoch_start(self): - pass - - def on_epoch_end(self): - pass - - def on_iteration_start(self): - pass - - def on_iteration_end(self): - pass - - -class ModuleSaverCallback(ActionCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__( - self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, - ): - super().__init__() - self._save_modules_list = save_modules_list - self._folder = folder - self._step_freq = step_freq - self._ckpt2keep = checkpoints_to_keep - self._saved_ckpts = [] - - def on_iteration_end(self): - step = self.step - if ( - self._step_freq > 0 - and step % self._step_freq == 0 - and step > 0 - and (self.global_rank is None or self.global_rank == 0) - ): - for m in self._save_modules_list: - class_name = m.__class__.__name__ - uid = m.unique_instance_id - fn = f"{class_name}_{uid}-STEP-{step}.pt" - if self._folder is None: - file_name = fn - else: - file_name = os.path.join(self._folder, fn) - logging.info(f"Saving module {class_name} in {file_name}") - m.save_to(file_name) - logging.info("Saved.") - self._saved_ckpts.append(f'-{self.step}.pt') - if len(self._saved_ckpts) > self._ckpt2keep: - for end in self._saved_ckpts[: -self._ckpt2keep]: - for file in glob.glob(f'{self._folder}/*{end}'): - os.remove(file) - self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] - - def on_action_end(self): - step = self.step - if self.global_rank is None or self.global_rank == 0: - for m in self._save_modules_list: - class_name = m.__class__.__name__ - uid = m.unique_instance_id - fn = f"{class_name}_{uid}-STEP-{step}.pt" - if self._folder is None: - file_name = fn - else: - file_name = os.path.join(self._folder, fn) - logging.info(f"Saving module {class_name} in {file_name}") - m.save_to(file_name) - logging.info("Saved.") - - -class SimpleLossLoggerCallback(ActionCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__( - self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, - ): - - super().__init__() - if not isinstance(tensors, list): - tensors = [tensors] - self._tensors = tensors - self._print_func = print_func - self._get_tb_values = get_tb_values - self._log_to_tb_func = log_to_tb_func - self._step_freq = step_freq - self._swriter = tb_writer - self._start_time = None - self._last_epoch_start = None - self._last_iter_start = None - - @property - def tensors(self): - return 
self._tensors - - def on_action_start(self): - if self.global_rank is None or self.global_rank == 0: - logging.info("Starting .....") - self._start_time = time.time() - - def on_action_end(self): - if self.global_rank is None or self.global_rank == 0: - if self._swriter is not None: - self._swriter.close() - delta = datetime.timedelta(seconds=(time.time() - self._start_time)) - logging.info("Done in %s", delta) - - def on_epoch_start(self): - if self.global_rank is None or self.global_rank == 0: - logging.info(f"Starting epoch {self.epoch_num}") - self._last_epoch_start = time.time() - - def on_epoch_end(self): - if self.global_rank is None or self.global_rank == 0: - step = self.step - - delta = datetime.timedelta(seconds=(time.time() - self._last_epoch_start)) - logging.info(f"Finished epoch {self.epoch_num} in {delta}") - - if self._swriter is not None: - value = self.epoch_num - self._swriter.add_scalar('misc/epoch', value, step) - value = time.time() - self._last_epoch_start - self._swriter.add_scalar('misc/epoch_time', value, step) - - def on_iteration_start(self): - if self.global_rank is None or self.global_rank == 0: - self._last_iter_start = time.time() - - def on_iteration_end(self): - if self.global_rank is None or self.global_rank == 0: - step = self.step - if step % self._step_freq == 0: - tensor_values = [self.registered_tensors[t.unique_name] for t in self.tensors] - logging.info(f"Step: {step}") - if self._print_func: - self._print_func(tensor_values) - sys.stdout.flush() - if self._swriter is not None: - if self._get_tb_values: - tb_objects = self._get_tb_values(tensor_values) - for name, value in tb_objects: - value = value.item() - self._swriter.add_scalar(name, value, step) - if self._log_to_tb_func: - self._log_to_tb_func(self._swriter, tensor_values, step) - run_time = time.time() - self._last_iter_start - self._swriter.add_scalar('misc/step_time', run_time, step) - run_time = time.time() - self._last_iter_start - logging.info(f"Step time: {run_time} seconds") - - -class EvaluatorCallback(ActionCallback): - """ - For callback documentation: please see - https://nvidia.github.io/NeMo/tutorials/callbacks.html - """ - - def __init__( - self, - eval_tensors, - user_iter_callback, - user_epochs_done_callback, - tb_writer=None, - tb_writer_func=None, - eval_step=1, - eval_epoch=None, - wandb_name=None, - wandb_project=None, - eval_at_start=True, - ): - # TODO: Eval_epoch currently does nothing - if eval_step is None and eval_epoch is None: - raise ValueError("Either eval_step or eval_epoch must be set. " f"But got: {eval_step} and {eval_epoch}") - if (eval_step is not None and eval_step <= 0) or (eval_epoch is not None and eval_epoch <= 0): - raise ValueError(f"Eval_step and eval_epoch must be > 0." 
f"But got: {eval_step} and {eval_epoch}") - super().__init__() - self._eval_tensors = eval_tensors - self._swriter = tb_writer - self._tb_writer_func = tb_writer_func - self._eval_frequency = eval_step - self._eval_at_start = eval_at_start - # will be passed to callbacks below - self._global_var_dict = {} - - # Callbacks - self.user_iter_callback = user_iter_callback - self.user_done_callback = user_epochs_done_callback - - # Weights and biases - self._wandb_project = wandb_project - self._wandb_name = wandb_name - - @property - def eval_tensors(self): - return self._eval_tensors - - @property - def tb_writer_func(self): - return self._tb_writer_func - - @property - def swriter(self): - return self._swriter - - def on_epoch_end(self): - pass - - def on_iteration_end(self): - if self.step == 0 and not self._eval_at_start: - return - if self.step % self._eval_frequency == 0: - if self.global_rank == 0 or self.global_rank is None: - logging.info('Doing Evaluation ' + '.' * 30) - start_time = time.time() - self.action._eval(self._eval_tensors, self, self.step) - elapsed_time = time.time() - start_time - if self.global_rank == 0 or self.global_rank is None: - logging.info(f'Evaluation time: {elapsed_time} seconds') - - def on_action_start(self): - if self.global_rank is None or self.global_rank == 0: - if self._wandb_name is not None or self._wandb_project is not None: - if _WANDB_AVAILABLE and wandb.run is None: - wandb.init(name=self._wandb_name, project=self._wandb_project) - elif _WANDB_AVAILABLE and wandb.run is not None: - logging.info("Re-using wandb session") - else: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - logging.info("Will not log data to weights and biases.") - self._wandb_name = None - self._wandb_project = None - - def on_action_end(self): - step = self.step - if self.global_rank == 0 or self.global_rank is None: - logging.info('Final Evaluation ' + '.' 
* 30) - start_time = time.time() - self.action._eval(self._eval_tensors, self, step) - elapsed_time = time.time() - start_time - if self.global_rank == 0 or self.global_rank is None: - logging.info(f'Evaluation time: {elapsed_time} seconds') - - def clear_global_var_dict(self): - self._global_var_dict = {} - - def wandb_log(self, tensors_logged): - if self._wandb_name is not None and _WANDB_AVAILABLE: - wandb.log(tensors_logged, step=self.step) - - -_Policy = namedtuple('Policy', 'method start end') - - -class _Method(ABC): - """ Classes inherited from _Method are used for - ValueSetterCallback below - """ - - @abstractmethod - def __call__(self, step, total_steps): - pass - - -class _Const(_Method): - def __init__(self, value): - super().__init__() - - self.value = value - - def __call__(self, step, total_steps): - return self.value - - -class _Linear(_Method): - def __init__(self, a, b): - super().__init__() - self.a, self.b = a, b - - def __call__(self, step, total_steps): - return self.a + (step / (total_steps - 1)) * (self.b - self.a) - - -_Method.Const = _Const -_Method.Linear = _Linear - - -class ValueSetterCallback(ActionCallback): - Policy = _Policy - Method = _Method - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): - super().__init__() - - if policies is None: - initial_value = getattr(module, arg_name) - policies = [_Policy(method=Const(initial_value), start=0.0, end=1.0)] - - new_policies = [] - for p in policies: - start, end = p.start, p.end - if isinstance(start, float): - start = int(start * total_steps) - if isinstance(end, float): - end = int(end * total_steps) - new_policies.append(_Policy(p.method, start, end)) - policies = new_policies - assert policies[0].start == 0 - assert policies[-1].end == total_steps - - self.module = module - self.arg_name = arg_name - self.policies = policies - self.total_steps = total_steps - self.tb_writer = tb_writer - - self.cur_i = 0 - - def on_iteration_start(self): - cur_policy = self.policies[self.cur_i] - if self.step < cur_policy.end: - step = self.step - cur_policy.start - total_steps = cur_policy.end - cur_policy.start - value = cur_policy.method(step, total_steps) - setattr(self.module, self.arg_name, value) - if self.tb_writer is not None: - class_name = self.module.__class__.__name__ - name = f"param/{class_name}.{self.arg_name}" - self.tb_writer.add_scalar(name, value, self.step) - else: - self.cur_i += 1 - self.on_iteration_start() - - -class UnfreezeCallback(ActionCallback): - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__(self, modules, start_epoch=0): - super().__init__() - - self.modules = modules - self.start_epoch = start_epoch - - def on_iteration_start(self): - if self.epoch_num == self.start_epoch: - for m in self.modules: - m.unfreeze() - - -class WandbCallback(ActionCallback): - """ - Log metrics to [Weights & Biases](https://docs.wandb.com/) - """ - - @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") - def __init__( - self, train_tensors=[], wandb_name=None, wandb_project=None, args=None, update_freq=25, - ): - """ - Args: - train_tensors: list of tensors to evaluate and log based on training batches - wandb_name: wandb experiment name - wandb_project: wandb project name - args: argparse flags - will be logged as hyperparameters - update_freq: frequency with which to log updates - """ - 
super().__init__() - - if not _WANDB_AVAILABLE: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - - self._update_freq = update_freq - self._train_tensors = train_tensors - self._name = wandb_name - self._project = wandb_project - self._args = args - - def on_action_start(self): - if self.global_rank is None or self.global_rank == 0: - if _WANDB_AVAILABLE and wandb.run is None: - wandb.init(name=self._name, project=self._project) - if self._args is not None: - wandb.config.update(self._args) - elif _WANDB_AVAILABLE and wandb.run is not None: - logging.info("Re-using wandb session") - else: - logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") - logging.info("Will not log data to weights and biases.") - self._update_freq = -1 - - def on_iteration_end(self): - # log training metrics - if self.global_rank is None or self.global_rank == 0: - if self.step % self._update_freq == 0 and self._update_freq > 0: - tensors_logged = {t.name: self.registered_tensors[t.unique_name].cpu() for t in self._train_tensors} - # Always log learning rate - tensors_logged['LR'] = self.learning_rate - self.wandb_log(tensors_logged) - - def on_epoch_start(self): - if self.global_rank is None or self.global_rank == 0: - self._last_epoch_start = time.time() - - def on_epoch_end(self): - if self.global_rank is None or self.global_rank == 0: - # always log epoch num and epoch_time - epoch_time = time.time() - self._last_epoch_start - self.wandb_log({"epoch": self.epoch_num, "epoch_time": epoch_time}) - - def wandb_log(self, tensors_logged): - if _WANDB_AVAILABLE: - wandb.log(tensors_logged, step=self.step) From fdae1f35b28608ed941321e9d32ed110d6e43ac9 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 28 May 2020 12:03:49 -0700 Subject: [PATCH 40/40] add deprecated callbacks files Signed-off-by: Jason --- nemo/core/deprecated_callbacks.py | 509 ++++++++++++++++++++++++++++++ 1 file changed, 509 insertions(+) create mode 100755 nemo/core/deprecated_callbacks.py diff --git a/nemo/core/deprecated_callbacks.py b/nemo/core/deprecated_callbacks.py new file mode 100755 index 000000000000..a0c7608f2d58 --- /dev/null +++ b/nemo/core/deprecated_callbacks.py @@ -0,0 +1,509 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + "ActionCallback", + "ModuleSaverCallback", + "SimpleLossLoggerCallback", + "EvaluatorCallback", + "ValueSetterCallback", + "UnfreezeCallback", + "WandbCallback", +] + +import datetime +import glob +import os +import sys +import time +from abc import ABC, abstractmethod +from collections import namedtuple + +from nemo.utils import logging +from nemo.utils.decorators import deprecated + +try: + import wandb + + _WANDB_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + _WANDB_AVAILABLE = False + + +class ActionCallback(ABC): + """Abstract interface for callbacks. 
+ """ + + def __init__(self): + self._registered_tensors = {} + self._action = None + + @property + def step(self): + return self.action.step + + @property + def epoch_num(self): + return self.action.epoch_num + + @property + def registered_tensors(self): + return self._registered_tensors + + @property + def local_rank(self): + return self.action.local_rank + + @property + def global_rank(self): + return self.action.global_rank + + @property + def action(self): + return self._action + + @action.setter + def action(self, action_obj): + self._action = action_obj + + @property + def logger(self): + return logging + + def on_action_start(self): + pass + + def on_action_end(self): + pass + + def on_epoch_start(self): + pass + + def on_epoch_end(self): + pass + + def on_iteration_start(self): + pass + + def on_iteration_end(self): + pass + + +class ModuleSaverCallback(ActionCallback): + """ + For callback documentation: please see + https://nvidia.github.io/NeMo/tutorials/callbacks.html + """ + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__( + self, save_modules_list, step_freq=1000, folder=None, checkpoints_to_keep=4, + ): + super().__init__() + self._save_modules_list = save_modules_list + self._folder = folder + self._step_freq = step_freq + self._ckpt2keep = checkpoints_to_keep + self._saved_ckpts = [] + + def on_iteration_end(self): + step = self.step + if ( + self._step_freq > 0 + and step % self._step_freq == 0 + and step > 0 + and (self.global_rank is None or self.global_rank == 0) + ): + for m in self._save_modules_list: + class_name = m.__class__.__name__ + uid = m.unique_instance_id + fn = f"{class_name}_{uid}-STEP-{step}.pt" + if self._folder is None: + file_name = fn + else: + file_name = os.path.join(self._folder, fn) + logging.info(f"Saving module {class_name} in {file_name}") + m.save_to(file_name) + logging.info("Saved.") + self._saved_ckpts.append(f'-{self.step}.pt') + if len(self._saved_ckpts) > self._ckpt2keep: + for end in self._saved_ckpts[: -self._ckpt2keep]: + for file in glob.glob(f'{self._folder}/*{end}'): + os.remove(file) + self._saved_ckpts = self._saved_ckpts[-self._ckpt2keep :] + + def on_action_end(self): + step = self.step + if self.global_rank is None or self.global_rank == 0: + for m in self._save_modules_list: + class_name = m.__class__.__name__ + uid = m.unique_instance_id + fn = f"{class_name}_{uid}-STEP-{step}.pt" + if self._folder is None: + file_name = fn + else: + file_name = os.path.join(self._folder, fn) + logging.info(f"Saving module {class_name} in {file_name}") + m.save_to(file_name) + logging.info("Saved.") + + +class SimpleLossLoggerCallback(ActionCallback): + """ + For callback documentation: please see + https://nvidia.github.io/NeMo/tutorials/callbacks.html + """ + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__( + self, tensors, print_func=None, get_tb_values=None, log_to_tb_func=None, step_freq=25, tb_writer=None, + ): + + super().__init__() + if not isinstance(tensors, list): + tensors = [tensors] + self._tensors = tensors + self._print_func = print_func + self._get_tb_values = get_tb_values + self._log_to_tb_func = log_to_tb_func + self._step_freq = step_freq + self._swriter = tb_writer + self._start_time = None + self._last_epoch_start = None + self._last_iter_start = None + + @property + def tensors(self): + return self._tensors + + def on_action_start(self): + if self.global_rank is None or self.global_rank == 0: + 
logging.info("Starting .....") + self._start_time = time.time() + + def on_action_end(self): + if self.global_rank is None or self.global_rank == 0: + if self._swriter is not None: + self._swriter.close() + delta = datetime.timedelta(seconds=(time.time() - self._start_time)) + logging.info("Done in %s", delta) + + def on_epoch_start(self): + if self.global_rank is None or self.global_rank == 0: + logging.info(f"Starting epoch {self.epoch_num}") + self._last_epoch_start = time.time() + + def on_epoch_end(self): + if self.global_rank is None or self.global_rank == 0: + step = self.step + + delta = datetime.timedelta(seconds=(time.time() - self._last_epoch_start)) + logging.info(f"Finished epoch {self.epoch_num} in {delta}") + + if self._swriter is not None: + value = self.epoch_num + self._swriter.add_scalar('misc/epoch', value, step) + value = time.time() - self._last_epoch_start + self._swriter.add_scalar('misc/epoch_time', value, step) + + def on_iteration_start(self): + if self.global_rank is None or self.global_rank == 0: + self._last_iter_start = time.time() + + def on_iteration_end(self): + if self.global_rank is None or self.global_rank == 0: + step = self.step + if step % self._step_freq == 0: + tensor_values = [self.registered_tensors[t.unique_name] for t in self.tensors] + logging.info(f"Step: {step}") + if self._print_func: + self._print_func(tensor_values) + sys.stdout.flush() + if self._swriter is not None: + if self._get_tb_values: + tb_objects = self._get_tb_values(tensor_values) + for name, value in tb_objects: + value = value.item() + self._swriter.add_scalar(name, value, step) + if self._log_to_tb_func: + self._log_to_tb_func(self._swriter, tensor_values, step) + run_time = time.time() - self._last_iter_start + self._swriter.add_scalar('misc/step_time', run_time, step) + run_time = time.time() - self._last_iter_start + logging.info(f"Step time: {run_time} seconds") + + +class EvaluatorCallback(ActionCallback): + """ + For callback documentation: please see + https://nvidia.github.io/NeMo/tutorials/callbacks.html + """ + + def __init__( + self, + eval_tensors, + user_iter_callback, + user_epochs_done_callback, + tb_writer=None, + tb_writer_func=None, + eval_step=1, + eval_epoch=None, + wandb_name=None, + wandb_project=None, + eval_at_start=True, + ): + # TODO: Eval_epoch currently does nothing + if eval_step is None and eval_epoch is None: + raise ValueError("Either eval_step or eval_epoch must be set. " f"But got: {eval_step} and {eval_epoch}") + if (eval_step is not None and eval_step <= 0) or (eval_epoch is not None and eval_epoch <= 0): + raise ValueError(f"Eval_step and eval_epoch must be > 0." 
f"But got: {eval_step} and {eval_epoch}") + super().__init__() + self._eval_tensors = eval_tensors + self._swriter = tb_writer + self._tb_writer_func = tb_writer_func + self._eval_frequency = eval_step + self._eval_at_start = eval_at_start + # will be passed to callbacks below + self._global_var_dict = {} + + # Callbacks + self.user_iter_callback = user_iter_callback + self.user_done_callback = user_epochs_done_callback + + # Weights and biases + self._wandb_project = wandb_project + self._wandb_name = wandb_name + + @property + def eval_tensors(self): + return self._eval_tensors + + @property + def tb_writer_func(self): + return self._tb_writer_func + + @property + def swriter(self): + return self._swriter + + def on_epoch_end(self): + pass + + def on_iteration_end(self): + if self.step == 0 and not self._eval_at_start: + return + if self.step % self._eval_frequency == 0: + if self.global_rank == 0 or self.global_rank is None: + logging.info('Doing Evaluation ' + '.' * 30) + start_time = time.time() + self.action._eval(self._eval_tensors, self, self.step) + elapsed_time = time.time() - start_time + if self.global_rank == 0 or self.global_rank is None: + logging.info(f'Evaluation time: {elapsed_time} seconds') + + def on_action_start(self): + if self.global_rank is None or self.global_rank == 0: + if self._wandb_name is not None or self._wandb_project is not None: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._wandb_name, project=self._wandb_project) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._wandb_name = None + self._wandb_project = None + + def on_action_end(self): + step = self.step + if self.global_rank == 0 or self.global_rank is None: + logging.info('Final Evaluation ' + '.' 
* 30) + start_time = time.time() + self.action._eval(self._eval_tensors, self, step) + elapsed_time = time.time() - start_time + if self.global_rank == 0 or self.global_rank is None: + logging.info(f'Evaluation time: {elapsed_time} seconds') + + def clear_global_var_dict(self): + self._global_var_dict = {} + + def wandb_log(self, tensors_logged): + if self._wandb_name is not None and _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=self.step) + + +_Policy = namedtuple('Policy', 'method start end') + + +class _Method(ABC): + """ Classes inherited from _Method are used for + ValueSetterCallback below + """ + + @abstractmethod + def __call__(self, step, total_steps): + pass + + +class _Const(_Method): + def __init__(self, value): + super().__init__() + + self.value = value + + def __call__(self, step, total_steps): + return self.value + + +class _Linear(_Method): + def __init__(self, a, b): + super().__init__() + self.a, self.b = a, b + + def __call__(self, step, total_steps): + return self.a + (step / (total_steps - 1)) * (self.b - self.a) + + +_Method.Const = _Const +_Method.Linear = _Linear + + +class ValueSetterCallback(ActionCallback): + Policy = _Policy + Method = _Method + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__(self, module, arg_name, policies=None, total_steps=None, tb_writer=None): + super().__init__() + + if policies is None: + initial_value = getattr(module, arg_name) + policies = [_Policy(method=Const(initial_value), start=0.0, end=1.0)] + + new_policies = [] + for p in policies: + start, end = p.start, p.end + if isinstance(start, float): + start = int(start * total_steps) + if isinstance(end, float): + end = int(end * total_steps) + new_policies.append(_Policy(p.method, start, end)) + policies = new_policies + assert policies[0].start == 0 + assert policies[-1].end == total_steps + + self.module = module + self.arg_name = arg_name + self.policies = policies + self.total_steps = total_steps + self.tb_writer = tb_writer + + self.cur_i = 0 + + def on_iteration_start(self): + cur_policy = self.policies[self.cur_i] + if self.step < cur_policy.end: + step = self.step - cur_policy.start + total_steps = cur_policy.end - cur_policy.start + value = cur_policy.method(step, total_steps) + setattr(self.module, self.arg_name, value) + if self.tb_writer is not None: + class_name = self.module.__class__.__name__ + name = f"param/{class_name}.{self.arg_name}" + self.tb_writer.add_scalar(name, value, self.step) + else: + self.cur_i += 1 + self.on_iteration_start() + + +class UnfreezeCallback(ActionCallback): + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__(self, modules, start_epoch=0): + super().__init__() + + self.modules = modules + self.start_epoch = start_epoch + + def on_iteration_start(self): + if self.epoch_num == self.start_epoch: + for m in self.modules: + m.unfreeze() + + +class WandbCallback(ActionCallback): + """ + Log metrics to [Weights & Biases](https://docs.wandb.com/) + """ + + @deprecated(version="0.12", explanation="The callback section of NeMo has been updated.") + def __init__( + self, train_tensors=[], wandb_name=None, wandb_project=None, args=None, update_freq=25, + ): + """ + Args: + train_tensors: list of tensors to evaluate and log based on training batches + wandb_name: wandb experiment name + wandb_project: wandb project name + args: argparse flags - will be logged as hyperparameters + update_freq: frequency with which to log updates + """ + 
super().__init__() + + if not _WANDB_AVAILABLE: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + + self._update_freq = update_freq + self._train_tensors = train_tensors + self._name = wandb_name + self._project = wandb_project + self._args = args + + def on_action_start(self): + if self.global_rank is None or self.global_rank == 0: + if _WANDB_AVAILABLE and wandb.run is None: + wandb.init(name=self._name, project=self._project) + if self._args is not None: + wandb.config.update(self._args) + elif _WANDB_AVAILABLE and wandb.run is not None: + logging.info("Re-using wandb session") + else: + logging.error("Could not import wandb. Did you install it (pip install --upgrade wandb)?") + logging.info("Will not log data to weights and biases.") + self._update_freq = -1 + + def on_iteration_end(self): + # log training metrics + if self.global_rank is None or self.global_rank == 0: + if self.step % self._update_freq == 0 and self._update_freq > 0: + tensors_logged = {t.name: self.registered_tensors[t.unique_name].cpu() for t in self._train_tensors} + # Always log learning rate + tensors_logged['LR'] = self.learning_rate + self.wandb_log(tensors_logged) + + def on_epoch_start(self): + if self.global_rank is None or self.global_rank == 0: + self._last_epoch_start = time.time() + + def on_epoch_end(self): + if self.global_rank is None or self.global_rank == 0: + # always log epoch num and epoch_time + epoch_time = time.time() - self._last_epoch_start + self.wandb_log({"epoch": self.epoch_num, "epoch_time": epoch_time}) + + def wandb_log(self, tensors_logged): + if _WANDB_AVAILABLE: + wandb.log(tensors_logged, step=self.step)
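
For reference, a minimal usage sketch for the deprecated callback interface preserved in the new nemo/core/deprecated_callbacks.py above. This is illustrative only: `loss` stands for a hypothetical NmTensor produced by a training graph (for example the CTC loss built in create_dags), and the constructor arguments simply mirror the SimpleLossLoggerCallback signature shown in the file; nothing here is part of the patch itself.

    from nemo.core.deprecated_callbacks import SimpleLossLoggerCallback
    from nemo.utils import logging

    # `loss` is a placeholder for an NmTensor returned by a training DAG;
    # it is not defined in this sketch.
    train_callback = SimpleLossLoggerCallback(
        tensors=[loss],
        # print_func receives the evaluated tensor values in the same order as `tensors`
        print_func=lambda values: logging.info("Loss: %s", values[0].item()),
        # get_tb_values must return (name, tensor) pairs; .item() is called on each value
        # before it is written to TensorBoard by on_iteration_end
        get_tb_values=lambda values: [["loss", values[0]]],
        step_freq=25,
    )

New code written against this branch is expected to subclass the state-based NeMoCallback kept in nemo/core/callbacks.py instead (for example overriding on_epoch_end(self, state) and reading entries such as state["epoch"], as the retained checkpointing callback does); the ActionCallback classes moved into deprecated_callbacks.py carry @deprecated(version="0.12") markers and are kept only for backward compatibility.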