diff --git a/CHANGELOG.md b/CHANGELOG.md index 3972e4e85d50..a00d22c74c6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,8 +75,12 @@ To release a new version, please update the changelog as followed: - Online Data Augmentation for ASR Collection. ([PR #565](https://github.com/NVIDIA/NeMo/pull/565)) - @titu1994 - Speed augmentation on CPU, TimeStretch augmentation on CPU+GPU ([PR #594](https://github.com/NVIDIA/NeMo/pull/565)) - @titu1994 - Added TarredAudioToTextDataLayer, which allows for loading ASR datasets with tarred audio. Existing datasets can be converted with the `convert_to_tarred_audio_dataset.py` script. ([PR #602](https://github.com/NVIDIA/NeMo/pull/602)) +- Online audio augmentation notebook in ASR examples ([PR #605](https://github.com/NVIDIA/NeMo/pull/605)) - @titu1994 +- ContextNet Encoder + Decoder Initial Support ([PR #630](https://github.com/NVIDIA/NeMo/pull/630)) - @titu1994 +- Added finetuning with Megatron-LM ([PR #601](https://github.com/NVIDIA/NeMo/pull/601)) - @ekmb ### Changed +- Syncs across workers at each step to check for NaN or inf loss. Terminates all workers if stop\_on\_nan\_loss is set (as before), lets Apex deal with it if apex.amp optimization level is O1 or higher, and skips the step across workers otherwise. ([PR #637](https://github.com/NVIDIA/NeMo/pull/637)) - @redoctopus ### Dependencies Update diff --git a/Jenkinsfile b/Jenkinsfile index 4bc091ef2d48..bf6cd48cb448 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -184,6 +184,13 @@ pipeline { sh 'rm -rf examples/nlp/token_classification/token_classification_output' } } + stage('Megatron finetuning Token Classification Training/Inference Test') { + steps { + sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=0 python token_classification.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --batch_size 2 --num_epochs 1 --save_epoch_freq 1 --work_dir megatron_output --pretrained_model_name megatron-bert-345m-uncased' + sh 'cd examples/nlp/token_classification && DATE_F=$(ls megatron_output/) && CUDA_VISIBLE_DEVICES=0 python token_classification_infer.py --checkpoint_dir megatron_output/$DATE_F/checkpoints/ --labels_dict /home/TestData/nlp/token_classification_punctuation/label_ids.csv --pretrained_model_name megatron-bert-345m-uncased' + sh 'rm -rf examples/nlp/token_classification/megatron_output' + } + } stage ('Punctuation and Classification Training/Inference Test') { steps { sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=1 python punctuation_capitalization.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --work_dir punctuation_output --save_epoch_freq 1 --num_epochs 1 --save_step_freq -1 --batch_size 2' diff --git a/README.rst b/README.rst index 71a642babc34..be60290c3c78 100644 --- a/README.rst +++ b/README.rst @@ -24,8 +24,8 @@ -NVIDIA Neural Modules: NeMo -=========================== +NVIDIA NeMo +=========== NeMo is a toolkit for creating `Conversational AI `_ applications. @@ -162,14 +162,33 @@ If you prefer to use NeMo's latest development version (from GitHub) follow the python setup.py style --fix # Tries to fix error in-place. python setup.py style --scope=tests # Operates within certain scope (dir of file). -**Unittests** +** NeMo Test Suite** -This command runs unittests: +NeMo contains test suite divided into 5 subsets: + 1) ``unit``: unit tests, i.e. 
testing a single, well isolated functionality + 2) ``integration``: tests checking the elements when integrated into subsystems + 3) ``system``: tests working at the highest integration level + 4) ``acceptance``: tests checking whether the developed product/model passes the user defined acceptance criteria + 5) ``docs``: tests related to documentation (deselect with '-m "not docs"') + +The user can run all the tests locally by simply executing: .. code-block:: bash - ./reinstall.sh - pytest tests + pytest + +In order to run a subset of tests one can use the ``-m`` argument followed by the subset name, e.g. for ``system`` subset: + +.. code-block:: bash + + pytest -m system + +By default, all the tests will be executed on GPU. There is also an option to run the test suite on CPU +by passing the ``--cpu`` command line argument, e.g.: + +.. code-block:: bash + + pytest -m unit --cpu Citation diff --git a/docs/docs_zh/sources/source/index.rst b/docs/docs_zh/sources/source/index.rst index 67cf4eab6f9b..ef13831fdc97 100644 --- a/docs/docs_zh/sources/source/index.rst +++ b/docs/docs_zh/sources/source/index.rst @@ -1,5 +1,5 @@ -NVIDIA Neural Modules 开发者指南(中文版) -========================================== +NVIDIA NeMo 开发者指南 +==================== .. toctree:: :hidden: @@ -17,7 +17,7 @@ NVIDIA Neural Modules 开发者指南(中文版) -Neural Modules (NeMo) 是一个用神经模块来构建 AI 应用的工具包,它与具体的框架无关。当前支持 PyTorch 框架。 +NeMo 是一个用神经模块来构建 AI 应用的工具包,它与具体的框架无关。当前支持 PyTorch 框架。 一个“神经模块”指的是,根据一系列的输入来计算一系列输出的代码块。 @@ -27,7 +27,7 @@ Neural Modules (NeMo) 是一个用神经模块来构建 AI 应用的工具包, 简介 ------ +--- 我们可以通过以下这个视频有个概览: @@ -39,7 +39,7 @@ Neural Modules (NeMo) 是一个用神经模块来构建 AI 应用的工具包, 核心概念和特性 ------------------- +----------- * `NeuralModule` 类 - 表示以及执行一个神经模块。 * `NmTensor` - 表示的是神经模块端口之间流动的激活元。 @@ -50,7 +50,7 @@ Neural Modules (NeMo) 是一个用神经模块来构建 AI 应用的工具包, 安装依赖 ----------- +------- 1) Python 3.6 or 3.7 2) PyTorch >= 1.4 带GPU支持 diff --git a/docs/sources/source/api-docs/nemo.rst b/docs/sources/source/api-docs/nemo.rst index 8ff87c9b7527..f5893ec7ea87 100644 --- a/docs/sources/source/api-docs/nemo.rst +++ b/docs/sources/source/api-docs/nemo.rst @@ -18,6 +18,14 @@ neural_modules :undoc-members: :show-inheritance: +neural_graph +-------------- + +.. automodule:: nemo.core.neural_graph + :members: + :undoc-members: + :show-inheritance: + neural_factory -------------- diff --git a/docs/sources/source/asr/tutorial.rst b/docs/sources/source/asr/tutorial.rst index 251e8f762144..6c4805bbce2b 100644 --- a/docs/sources/source/asr/tutorial.rst +++ b/docs/sources/source/asr/tutorial.rst @@ -340,6 +340,40 @@ Perform the following steps: python /examples/asr/jasper_eval.py --model_config=/examples/asr/configs/quartznet15x5.yaml --eval_datasets "/dev_clean.json" --load_dir= --lm_path= + +Using and Converting to Tarred Datasets +--------------------------------------- + +If you are training on a distributed cluster, you may want to avoid a dataset consisting of many small files and instead perform batched reads from tarballs. +In this case, you can use the ``TarredAudioToTextDataLayer`` to load your data. + +The ``TarredAudioToTextDataLayer`` takes in an ``audio_tar_filepaths`` argument, which specifies the path(s) to the tarballs that contain the audio files, and a ``manifest_filepath`` argument that should contain the transcripts and durations corresponding to those files (with a unique WAV basename per entry). 
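For reference, a minimal sketch of constructing this data layer (the argument names follow the description above; the paths and ``vocab`` are placeholders, and the remaining arguments such as ``labels`` and ``batch_size`` are assumed to mirror ``AudioToTextDataLayer``):

.. code-block:: python

    import nemo.collections.asr as nemo_asr

    # Shard paths may be a single path, a brace-expandable pattern, or a list of paths.
    tarred_layer = nemo_asr.TarredAudioToTextDataLayer(
        audio_tar_filepaths='/data/tarred/audio_{0..7}.tar',
        manifest_filepath='/data/tarred/tarred_audio_manifest.json',
        labels=vocab,        # same label list used by the other ASR data layers
        batch_size=32,
        shuffle_n=2048,      # size of the shuffle buffer used for sequential tar reads
    )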
+The ``audio_tar_filepaths`` argument can be in the form of a string, either containing a path to a single tarball or braceexpand-able to multiple paths, or a list of paths. +Note that the data layer's size (via ``len``) is set by the number of entries of the manifest, rather than the number of files across all tarballs. + +This DataLayer uses `WebDataset `_ to read the tarred audio files. +Since reads are performed sequentially, shuffling is done with a buffer which can be specified by the argument ``shuffle_n``. + +Please see the ``TarredAudioToTextDataLayer`` `documentation `_ and the WebDataset documentation for more details. + +.. note:: + + If using ``torch.distributed`` processes, the ``TarredAudioToTextDataLayer`` will automatically partition the audio tarballs across workers. + As such, if you are training on `n` workers, please make sure to divide your WAV files evenly across a number of tarballs that is divisible by `n`. + +Conversion from an Existing Dataset to Tarred Dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you already have an ASR dataset that you would like to convert to one that is compatible with the ``TarredAudioToTextDataLayer``, you can use ``scripts/convert_to_tarred_audio_dataset.py``. + +This script takes a few arguments: + +* ``manifest_path`` (required): The path to your existing dataset's manifest file. +* ``target_dir``: The directory where the tarballs and new manifest will be written. If none if given, defaults to ``./tarred``. +* ``num_shards``: The number of shards (tarballs) to create. If using multiple workers for training, set this to be a multiple of the number of workers you have. Defaults to 1. +* ``shuffle``: Setting this flag will shuffle the entries in your original manifest before creating the new dataset. You may want to do this if your original dataset is ordered, since the ``TarredAudioToTextDataLayer`` cannot shuffle the whole dataset (see ``shuffle_n``). + + Kaldi Compatibility ------------------- @@ -354,7 +388,7 @@ Of course, you will also need the .ark files that contain the audio data in the To load your Kaldi-formatted data, you can simply use the ``KaldiFeatureDataLayer`` instead of the ``AudioToTextDataLayer``. The ``KaldiFeatureDataLayer`` takes in an argument ``kaldi_dir`` instead of a ``manifest_filepath``, and this argument should be set to the directory that contains the files mentioned above. -See `the documentation `_ for more detailed information about the arguments to this data layer. +See `the documentation `_ for more detailed information about the arguments to this data layer. .. note:: diff --git a/docs/sources/source/index.rst b/docs/sources/source/index.rst index 8d4fdfb68de6..f0b68255fab4 100644 --- a/docs/sources/source/index.rst +++ b/docs/sources/source/index.rst @@ -1,5 +1,5 @@ -NVIDIA Neural Modules Developer Guide -===================================== +NVIDIA NeMo Developer Guide +=========================== .. toctree:: :hidden: @@ -9,6 +9,7 @@ NVIDIA Neural Modules Developer Guide tutorials/intro training asr/intro + speaker_recognition/intro speech_command/intro nlp/intro tts/intro @@ -17,7 +18,7 @@ NVIDIA Neural Modules Developer Guide chinese/intro -Neural Modules (NeMo) is a framework-agnostic toolkit for building AI applications powered by Neural Modules. Current support is for PyTorch framework. +NeMo is a framework-agnostic toolkit for building AI applications powered by Neural Modules. Current support is for PyTorch framework. 
A "Neural Module" is a block of code that computes a set of outputs from a set of inputs. diff --git a/docs/sources/source/nlp/intro.rst b/docs/sources/source/nlp/intro.rst index 63bc65cca0bf..4ece61b68051 100644 --- a/docs/sources/source/nlp/intro.rst +++ b/docs/sources/source/nlp/intro.rst @@ -32,6 +32,12 @@ Pretraining BERT bert_pretraining +Megatron-LM for Downstream tasks +-------------------------------- +.. toctree:: + :maxdepth: 8 + + megatron_finetuning Transformer Language Model -------------------------- diff --git a/docs/sources/source/nlp/joint_intent_slot_filling.rst b/docs/sources/source/nlp/joint_intent_slot_filling.rst index 21a3b40374da..72d5f002b234 100644 --- a/docs/sources/source/nlp/joint_intent_slot_filling.rst +++ b/docs/sources/source/nlp/joint_intent_slot_filling.rst @@ -7,7 +7,7 @@ All the code introduced in this tutorial is based on ``examples/nlp/intent_detec There are a variety pre-trained BERT models that we can select as the base encoder for our model. We're currently using the script for loading pre-trained models from `transformers`. \ -See the list of available pre-trained models by calling `nemo.collections.nlp.nm.trainables.get_bert_models_list()`. \ +See the list of available pre-trained models by calling `nemo_nlp.nm.trainables.get_pretrained_lm_models_list()`. \ The type of the encoder can get defined by the argument `--pretrained_model_name`. .. tip:: @@ -95,9 +95,8 @@ Next, we define all Neural Modules participating in our joint intent slot fillin .. code-block:: python - pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name - ) + bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name) * Create the classifier heads for our task. diff --git a/docs/sources/source/nlp/megatron_finetuning.rst b/docs/sources/source/nlp/megatron_finetuning.rst new file mode 100644 index 000000000000..589e52d646f0 --- /dev/null +++ b/docs/sources/source/nlp/megatron_finetuning.rst @@ -0,0 +1,36 @@ +Megatron-LM for Downstream Tasks +================================ + +Megatron :cite:`nlp-megatron-lm-shoeybi2020megatron` is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. +More details could be found in `Megatron-LM github repo `_. + +In order to finetune a pretrained Megatron BERT language model on NLP downstream tasks from `examples/nlp `_, specify the pretrained_model_name like this: + +.. code-block:: bash + + --pretrained_model_name megatron-bert-345m-uncased + +For example, to finetune SQuAD v1.1 with Megatron-LM, run: + +.. code-block:: bash + + python question_answering_squad.py \ + --train_file PATH_TO_DATA_DIR/squad/v1.1/train-v1.1.json \ + --eval_file PATH_TO_DATA_DIR/squad/v1.1/dev-v1.1.json \ + --pretrained_model_name megatron-bert-345m-uncased + + +If you have a different checkpoint or model configuration, use ``--pretrained_model_name megatron-bert-uncased`` or ``--pretrained_model_name megatron-bert-cased`` and specify ``--bert_config`` and ``--bert_checkpoint`` for your model. + +.. note:: + Megatron-LM has its own set of training arguments (including tokenizer) that are ignored during finetuning in NeMo. Please use downstream task training scripts for all NeMo supported arguments. + + + +References +---------- + +.. 
bibliography:: nlp_all_refs.bib + :style: plain + :labelprefix: NLP-MEGATRON-LM + :keyprefix: nlp-megatron-lm- \ No newline at end of file diff --git a/docs/sources/source/nlp/ner.rst b/docs/sources/source/nlp/ner.rst index e6611638d992..2a0b897e079d 100644 --- a/docs/sources/source/nlp/ner.rst +++ b/docs/sources/source/nlp/ner.rst @@ -67,12 +67,15 @@ First, we need to create our neural factory with the supported backend. How you Next, we'll need to define our tokenizer and our BERT model. There are a couple of different ways you can do this. Keep in mind that NER benefits from casing ("New York City" is easier to identify than "new york city"), so we recommend you use cased models. -If you're using a standard BERT model, you should do it as follows. To see the full list of BERT model names, check out ``nemo.collections.nlp.nm.trainables.get_bert_models_list()`` +If you're using a standard BERT model, you should do it as follows. To see the full list of BERT model names, check out ``nemo_nlp.nm.trainables.get_pretrained_lm_models_list()`` .. code-block:: python - tokenizer = nemo.collections.nlp.data.NemoBertTokenizer(pretrained_model="bert-base-cased") - bert_model = nemo_nlp.nm.trainables.huggingface.BERT( + bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name="bert-base-cased") + + tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( + tokenizer_name="nemobert", pretrained_model_name="bert-base-cased") See examples/nlp/token_classification/token_classification.py on how to use a BERT model that you pre-trained yourself. diff --git a/docs/sources/source/nlp/nlp_all_refs.bib b/docs/sources/source/nlp/nlp_all_refs.bib index 950fc2e6e7f7..6d963f99bbb2 100644 --- a/docs/sources/source/nlp/nlp_all_refs.bib +++ b/docs/sources/source/nlp/nlp_all_refs.bib @@ -161,3 +161,11 @@ @article{henderson2015machine journal={research.google}, year={2015} } + +@article{shoeybi2020megatron, + title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}, + author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan}, + journal={arXiv preprint arXiv:1909.08053}, + year={2020} +} + diff --git a/docs/sources/source/nlp/punctuation.rst b/docs/sources/source/nlp/punctuation.rst index 60b32f5b0060..f2269196f2a5 100644 --- a/docs/sources/source/nlp/punctuation.rst +++ b/docs/sources/source/nlp/punctuation.rst @@ -102,12 +102,15 @@ Next, we'll need to define our tokenizer and our BERT model. Currently, there ar BERT, ALBERT and RoBERTa. These are pretrained model checkpoints from `transformers `__ . Apart from these, the user can also do fine-tuning on a custom BERT checkpoint, specified by the `--bert_checkpoint` argument in the training script. The pretrained back-bone models can be specified `--pretrained_model_name`. -See the list of available pre-trained models by calling `nemo.collections.nlp.nm.trainables.get_bert_models_list()`. \ +See the list of available pre-trained models by calling `nemo_nlp.nm.trainables.get_pretrained_lm_models_list()`. \ .. 
code-block:: python - tokenizer = nemo.collections.nlp.data.NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL) - bert_model = nemo_nlp.nm.trainables.huggingface.BERT( + bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=PRETRAINED_BERT_MODEL) + + tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( + tokenizer_name="nemobert", pretrained_model_name=PRETRAINED_BERT_MODEL) Now, create the train and evaluation data layers: diff --git a/docs/sources/source/speaker_recognition/datasets.rst b/docs/sources/source/speaker_recognition/datasets.rst new file mode 100644 index 000000000000..2e81b8fe3830 --- /dev/null +++ b/docs/sources/source/speaker_recognition/datasets.rst @@ -0,0 +1,19 @@ +Datasets +======== + +HI-MIA +-------- + +Run the script to download and process hi-mia dataset in order to generate files in the supported format of `nemo_asr`. You should set the data folder of +hi-mia using `--data_root`. These scripts are present in /scripts + +.. code-block:: bash + + python get_hi-mia_data.py --data_root= + +After download and conversion, your `data` folder should contain directories with follwing set of files as: + +* `data//train.json` +* `data//dev.json` +* `data//{set}_all.json` +* `data//utt2spk` \ No newline at end of file diff --git a/docs/sources/source/speaker_recognition/installation_link.rst b/docs/sources/source/speaker_recognition/installation_link.rst new file mode 100644 index 000000000000..cef1c53239c9 --- /dev/null +++ b/docs/sources/source/speaker_recognition/installation_link.rst @@ -0,0 +1 @@ +.. include:: ../asr/installation.rst diff --git a/docs/sources/source/speaker_recognition/intro.rst b/docs/sources/source/speaker_recognition/intro.rst new file mode 100644 index 000000000000..242d1a6c4db2 --- /dev/null +++ b/docs/sources/source/speaker_recognition/intro.rst @@ -0,0 +1,17 @@ +.. _speaker-recognition-docs: + + +Speaker Recognition +=================== + +.. toctree:: + :maxdepth: 8 + + installation_link + tutorial + datasets + models + + + + diff --git a/docs/sources/source/speaker_recognition/models.rst b/docs/sources/source/speaker_recognition/models.rst new file mode 100644 index 000000000000..7890d621e474 --- /dev/null +++ b/docs/sources/source/speaker_recognition/models.rst @@ -0,0 +1,15 @@ +Models +==================== + +.. toctree:: + :maxdepth: 8 + + quartznet + +References +---------- + +.. bibliography:: speaker.bib + :style: plain + :labelprefix: SPEAKER-TUT + :keyprefix: speaker-tut- \ No newline at end of file diff --git a/docs/sources/source/speaker_recognition/quartznet.rst b/docs/sources/source/speaker_recognition/quartznet.rst new file mode 100644 index 000000000000..803cf69602c4 --- /dev/null +++ b/docs/sources/source/speaker_recognition/quartznet.rst @@ -0,0 +1,39 @@ +QuartzNet +--------- + +QuartzNet is a version of Jasper model with separable convolutions and larger filters. It can achieve performance +similar to Jasper but with an order of magnitude less parameters. +Similarly to Jasper, QuartzNet family of models are denoted as QuartzNet_[BxR] where B is the number of blocks, and R - the number of convolutional sub-blocks within a block. Each sub-block contains a +1-D *separable* convolution, batch normalization, ReLU, and dropout: + +We use a Quartznet encoder of 3x2 size with narrow filters. This encoder is connected to the decoder by using a statistics pooling layer. 
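As a rough sketch (not the exact NeMo module), mean/variance statistics pooling collapses the variable-length time dimension of the encoder output into a fixed-size vector:

.. code-block:: python

    import torch

    def stats_pool(encoder_output: torch.Tensor) -> torch.Tensor:
        # encoder_output: [batch, channels, time] -> [batch, 2 * channels]
        mean = encoder_output.mean(dim=-1)
        std = encoder_output.std(dim=-1)
        return torch.cat([mean, std], dim=-1)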
+We experimented with various statistics pooling layers like a gram layer, a x-vector pooling layer, or super vector layer which is combination of the gram and x-vector layers. +the xvector layer is based on Mean and variance based statistics pooling, it is faster to train and very stable. + + .. image:: ../asr/quartz_vertical.png + :align: center + :alt: quartznet model + +`QuartzNet paper `_. + +on average for 417 hrs of data should finish 25 epochs in under 8 hours on single Quadro GV100. + +============== ================= ===================== ====================== ========== +Network Trained Evaluated cosine similarity PLDA + Dataset trial-set EER EER +============== ================= ===================== ====================== ========== +QuartzNet3x2 hi-mia hi-mia 8.72% 6.32% +QuartzNet3x2 voxceleb1 ffsvc-dev 14.22% 7.12% + hi-mia + aishell + voxceleb2 +============== ================= ===================== ====================== ========== + + +References +---------- + + .. bibliography:: speaker.bib + :style: plain + :labelprefix: SPEAKER-TUT + :keyprefix: speaker-tut- diff --git a/docs/sources/source/speaker_recognition/speaker.bib b/docs/sources/source/speaker_recognition/speaker.bib new file mode 100644 index 000000000000..4a4a3f00a6f4 --- /dev/null +++ b/docs/sources/source/speaker_recognition/speaker.bib @@ -0,0 +1,30 @@ +@article{kriman2019quartznet, + title={Quartznet: Deep automatic speech recognition with 1d time-channel separable convolutions}, + author={Kriman, Samuel and Beliaev, Stanislav and Ginsburg, Boris and Huang, Jocelyn and Kuchaiev, Oleksii and Lavrukhin, Vitaly and Leary, Ryan and Li, Jason and Zhang, Yang}, + journal={arXiv preprint arXiv:1910.10261}, + year={2019} +} + +@article{nagrani2017voxceleb, + title={Voxceleb: a large-scale speaker identification dataset}, + author={Nagrani, Arsha and Chung, Joon Son and Zisserman, Andrew}, + journal={arXiv preprint arXiv:1706.08612}, + year={2017} +} + +@misc{himia, + title={HI-MIA : A Far-field Text-Dependent Speaker Verification Database and the Baselines}, + author={Xiaoyi Qin and Hui Bu and Ming Li}, + year={2019}, + eprint={1912.01231}, + archivePrefix={arXiv}, + primaryClass={cs.SD} +} + + +@article{li2019jasper, + title={Jasper: An end-to-end convolutional neural acoustic model}, + author={Li, Jason and Lavrukhin, Vitaly and Ginsburg, Boris and Leary, Ryan and Kuchaiev, Oleksii and Cohen, Jonathan M and Nguyen, Huyen and Gadde, Ravi Teja}, + journal={arXiv preprint arXiv:1904.03288}, + year={2019} +} \ No newline at end of file diff --git a/docs/sources/source/speaker_recognition/tutorial.rst b/docs/sources/source/speaker_recognition/tutorial.rst new file mode 100644 index 000000000000..32e498c49c80 --- /dev/null +++ b/docs/sources/source/speaker_recognition/tutorial.rst @@ -0,0 +1,43 @@ +Tutorial +======== + +Make sure you have installed ``nemo`` and the ``nemo_asr`` collection. +See the :ref:`installation` section. + +.. note:: + + You need to have ``nemo`` and the ``nemo_asr`` collection for this tutorial. + It is also necessary to install `torchaudio` in order to use MFCC preprocessing. + + +Introduction +------------ + +Speaker Recognition (SR) is a broad research area which solves two major tasks: speaker identification (who is speaking?) and +speaker verification (is the speaker who she claims to be?). In this work, we focus on the far-field, +text-independent speaker recognition when the identity of the speaker is based on how speech is spoken, +not necessarily in what is being said. 
Typically such SR systems operate on unconstrained speech utterances, +which are converted into vector of fixed length, called speaker embedding. Speaker embeddings are also used in +automatic speech recognition (ASR) and speech synthesis. + +As the goal of most speaker related systems is to get good speaker level embeddings that could help distinguish from other speakers, we shall first train these embeddings in end-to-end +manner optimizing the QuatzNet based :cite:`speaker-tut-kriman2019quartznet` encoder model on cross-entropy loss. +We modeify the decoder to get these fixed size embeddings irrespective of the length of input audio. We employ mean and variance +based statistics pooling method to grab these embeddings. + +In this tutorial we shall first train these embeddings on speaker related datasets and then get speaker embeddings from a +pretrained network for a new dataset, then followed by scoring them using cosine similarity method or optionally with PLDA backend. + + +Jupyter Notebooks containing all the steps to download the dataset, train a model and evaluate its results +is available at : `Speaker Recognition an4 example `_ +also for advanced hi-mia dataset at: `Speaker Recognition hi-mia example `_ + + +References +---------- + +.. bibliography:: speaker.bib + :style: plain + :labelprefix: SPEAKER-TUT + :keyprefix: speaker-tut- diff --git a/examples/asr/contextnet.py b/examples/asr/contextnet.py new file mode 100644 index 000000000000..6e6845142d8f --- /dev/null +++ b/examples/asr/contextnet.py @@ -0,0 +1,324 @@ +# Copyright (C) NVIDIA CORPORATION. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.**** + +import argparse +import copy +import os +from functools import partial + +from ruamel.yaml import YAML + +import nemo +import nemo.collections.asr as nemo_asr +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch +from nemo.utils.lr_policies import CosineAnnealing + +logging = nemo.logging + + +def parse_args(): + parser = argparse.ArgumentParser( + parents=[nm_argparse.NemoArgParser()], description='ContextNet', conflict_handler='resolve', + ) + parser.set_defaults( + checkpoint_dir=None, + optimizer="novograd", + batch_size=32, + eval_batch_size=64, + lr=0.01, + weight_decay=0.001, + amp_opt_level="O0", + create_tb_writer=True, + ) + + # Overwrite default args + parser.add_argument( + "--num_epochs", + type=int, + default=None, + required=True, + help="number of epochs to train. 
You should specify either num_epochs or max_steps", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) + + # Create new args + parser.add_argument("--exp_name", default="ContextNet", type=str) + parser.add_argument("--project", default=None, type=str) + parser.add_argument("--beta1", default=0.95, type=float) + parser.add_argument("--beta2", default=0.5, type=float) + parser.add_argument("--warmup_steps", default=1000, type=int) + parser.add_argument("--warmup_ratio", default=None, type=float) + parser.add_argument('--min_lr', default=1e-5, type=float) + parser.add_argument("--load_dir", default=None, type=str) + parser.add_argument("--synced_bn", action='store_true', help="Use synchronized batch norm") + parser.add_argument("--synced_bn_groupsize", default=0, type=int) + parser.add_argument("--update_freq", default=50, type=int, help="Metrics update freq") + parser.add_argument("--eval_freq", default=1000, type=int, help="Evaluation frequency") + parser.add_argument('--kernel_size_factor', default=1.0, type=float) + + args = parser.parse_args() + if args.max_steps is not None: + raise ValueError("ContextNet uses num_epochs instead of max_steps") + + return args + + +def construct_name(name, lr, batch_size, num_epochs, wd, optimizer, kernel_size_factor): + return "{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-kf_{6}".format( + name, lr, batch_size, num_epochs, wd, optimizer, kernel_size_factor + ) + + +def create_all_dags(args, neural_factory): + ''' + creates train and eval dags as well as their callbacks + returns train loss tensor and callbacks''' + + # parse the config files + yaml = YAML(typ="safe") + with open(args.model_config) as f: + contextnet_params = yaml.load(f) + + vocab = contextnet_params['labels'] + sample_rate = contextnet_params['sample_rate'] + + # Calculate num_workers for dataloader + total_cpus = os.cpu_count() + cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) + + # create data layer for training + train_dl_params = copy.deepcopy(contextnet_params["AudioToTextDataLayer"]) + train_dl_params.update(contextnet_params["AudioToTextDataLayer"]["train"]) + del train_dl_params["train"] + del train_dl_params["eval"] + # del train_dl_params["normalize_transcripts"] + + data_layer_train = nemo_asr.AudioToTextDataLayer( + manifest_filepath=args.train_dataset, + sample_rate=sample_rate, + labels=vocab, + batch_size=args.batch_size, + num_workers=cpu_per_traindl, + **train_dl_params, + ) + + N = len(data_layer_train) + steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus)) + + # create separate data layers for eval + # we need separate eval dags for separate eval datasets + # but all other modules in these dags will be shared + + eval_dl_params = copy.deepcopy(contextnet_params["AudioToTextDataLayer"]) + eval_dl_params.update(contextnet_params["AudioToTextDataLayer"]["eval"]) + del eval_dl_params["train"] + del eval_dl_params["eval"] + + data_layers_eval = [] + if args.eval_datasets: + for eval_dataset in args.eval_datasets: + data_layer_eval = nemo_asr.AudioToTextDataLayer( + manifest_filepath=eval_dataset, + sample_rate=sample_rate, + labels=vocab, + batch_size=args.eval_batch_size, + num_workers=cpu_per_traindl, + **eval_dl_params, + ) + + data_layers_eval.append(data_layer_eval) + else: + logging.warning("There were no val datasets passed") + + # create shared modules + + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( + sample_rate=sample_rate, 
**contextnet_params["AudioToMelSpectrogramPreprocessor"], + ) + + # Inject the `kernel_size_factor` kwarg to the ContextNet config + # Skip the last layer as that must be a pointwise kernel + for idx in range(len(contextnet_params["ContextNetEncoder"]["jasper"]) - 1): + contextnet_params["ContextNetEncoder"]["jasper"][idx]["kernel_size_factor"] = args.kernel_size_factor + + # (ContextNet uses the Jasper baseline encoder and decoder) + encoder = nemo_asr.ContextNetEncoder( + feat_in=contextnet_params["AudioToMelSpectrogramPreprocessor"]["features"], + **contextnet_params["ContextNetEncoder"], + ) + + decoder = nemo_asr.JasperDecoderForCTC( + feat_in=contextnet_params["ContextNetEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab), + ) + + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab), zero_infinity=True) + + greedy_decoder = nemo_asr.GreedyCTCDecoder() + + # create augmentation modules (only used for training) if their configs + # are present + + multiply_batch_config = contextnet_params.get('MultiplyBatch', None) + if multiply_batch_config: + multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config) + + spectr_augment_config = contextnet_params.get('SpectrogramAugmentation', None) + if spectr_augment_config: + data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config) + + # assemble train DAG + + (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer_train() + + processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t) + + if multiply_batch_config: + (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch( + in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t, + ) + + if spectr_augment_config: + processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t) + + encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t) + log_probs_t = decoder(encoder_output=encoded_t) + predictions_t = greedy_decoder(log_probs=log_probs_t) + loss_t = ctc_loss( + log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t, + ) + + # create train callbacks + train_callback = nemo.core.SimpleLossLoggerCallback( + tensors=[loss_t, predictions_t, transcript_t, transcript_len_t], + print_func=partial(monitor_asr_train_progress, labels=vocab), + get_tb_values=lambda x: [["loss", x[0]]], + tb_writer=neural_factory.tb_writer, + step_freq=args.update_freq, + ) + + callbacks = [train_callback] + + if args.checkpoint_dir or args.load_dir: + chpt_callback = nemo.core.CheckpointCallback( + folder=args.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq, + ) + + callbacks.append(chpt_callback) + + # Log training metrics to wandb + if args.project is not None: + wand_callback = nemo.core.WandbCallback( + train_tensors=[loss_t], + wandb_name=args.exp_name, + wandb_project=args.project, + update_freq=args.update_freq, + args=args, + ) + callbacks.append(wand_callback) + + # assemble eval DAGs + for i, eval_dl in enumerate(data_layers_eval): + (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl() + processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e) + encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e) + log_probs_e = decoder(encoder_output=encoded_e) + predictions_e = greedy_decoder(log_probs=log_probs_e) + loss_e = 
ctc_loss( + log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, + ) + + # create corresponding eval callback + tagname = os.path.basename(args.eval_datasets[i]).split(".")[0] + + eval_callback = nemo.core.EvaluatorCallback( + eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,], + user_iter_callback=partial(process_evaluation_batch, labels=vocab), + user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname), + eval_step=args.eval_freq, + tb_writer=neural_factory.tb_writer, + ) + + callbacks.append(eval_callback) + + return loss_t, callbacks, steps_per_epoch + + +def main(): + args = parse_args() + + name = construct_name( + args.exp_name, + args.lr, + args.batch_size, + args.num_epochs, + args.weight_decay, + args.optimizer, + args.kernel_size_factor, + ) + work_dir = name + if args.work_dir: + work_dir = os.path.join(args.work_dir, name) + + # instantiate Neural Factory with supported backend + neural_factory = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=work_dir, + checkpoint_dir=args.checkpoint_dir, + create_tb_writer=args.create_tb_writer, + files_to_copy=[args.model_config, __file__], + cudnn_benchmark=args.cudnn_benchmark, + tensorboard_dir=args.tensorboard_dir, + ) + args.num_gpus = neural_factory.world_size + + args.checkpoint_dir = neural_factory.checkpoint_dir + + if args.local_rank is not None: + logging.info('Doing ALL GPU') + + # build dags + train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory) + + # train model + neural_factory.train( + tensors_to_optimize=[train_loss], + callbacks=callbacks, + lr_policy=CosineAnnealing( + args.num_epochs * steps_per_epoch, + warmup_steps=args.warmup_steps, + warmup_ratio=args.warmup_ratio, + min_lr=args.min_lr, + ), + optimizer=args.optimizer, + optimization_params={ + "num_epochs": args.num_epochs, + "lr": args.lr, + "betas": (args.beta1, args.beta2), + "weight_decay": args.weight_decay, + "grad_norm_clip": None, + "amp_min_loss_scale": 1e-4, + }, + batches_per_step=args.iter_per_step, + synced_batchnorm=args.synced_bn, + synced_batchnorm_groupsize=args.synced_bn_groupsize, + ) + + +if __name__ == '__main__': + main() diff --git a/examples/asr/notebooks/4_Online_Data_Augmentation.ipynb b/examples/asr/notebooks/4_Online_Data_Augmentation.ipynb new file mode 100644 index 000000000000..ddffabfb270c --- /dev/null +++ b/examples/asr/notebooks/4_Online_Data_Augmentation.ipynb @@ -0,0 +1,1024 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. 
Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "!pip install wget\n", + "!pip install git+https://github.com/NVIDIA/apex.git\n", + "!pip install nemo_toolkit[asr]\n", + "!pip install unidecode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir configs\n", + "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/asr/configs/quartznet_speech_commands_3x1_v1.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import some necessary libraries\n", + "import os\n", + "import random\n", + "import argparse\n", + "import copy\n", + "import math\n", + "import os\n", + "import glob\n", + "from functools import partial\n", + "from datetime import datetime\n", + "from ruamel.yaml import YAML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "Data augmentation is a useful method to improve the performance of models which is applicable across multiple domains. Certain augmentations can also substantially improve robustness of models to noisy samples. \n", + "\n", + "In this notebook, we describe how to construct an augmentation pipeline inside [Neural Modules (NeMo)](https://github.com/NVIDIA/NeMo), enable augmented training of a [MatchboxNet model](https://arxiv.org/abs/2004.08531) (based on QuartzNet from the paper [\"QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions\"](https://arxiv.org/abs/1910.10261)) and finally how to construct custom augmentations to add to NeMo.\n", + "\n", + "The notebook will follow the steps below:\n", + "\n", + " - Dataset preparation: Preparing a noise dataset using an example file.\n", + "\n", + " - Construct a data augmentation pipeline.\n", + " \n", + " - Construct a custom augmentation and register it for use in NeMo." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Note\n", + "Data augmentation is valuable for many datasets, but it comes at the cost of increased training time if samples are augmented during training time. Certain augmentations are particularly costly, in terms of how much time they take to process a single sample. A few examples of slow augmentations available in NeMo are : \n", + "\n", + " - Speed Perturbation\n", + " - Time Stretch Perturbation (Sample level)\n", + " - Noise Perturbation\n", + " - Impulse Perturbation\n", + " - Time Stretch Augmentation (Batch level, Neural Module)\n", + " \n", + "For such augmentations, it is advisable to pre-process the dataset offline for a one time preprocessing cost and then train the dataset on this augmented training set." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Taking a Look at Our Data (AN4)\n", + "\n", + "The AN4 dataset, also known as the Alphanumeric dataset, was collected and published by Carnegie Mellon University. It consists of recordings of people spelling out addresses, names, telephone numbers, etc., one letter or number at a time, as well as their corresponding transcripts. We choose to use AN4 for this tutorial because it is relatively small, with 948 training and 130 test utterances, and so it trains quickly.\n", + "\n", + "Before we get started, let's download and prepare the dataset. 
The utterances are available as `.sph` files, so we will need to convert them to `.wav` for later processing. Please make sure you have [Sox](http://sox.sourceforge.net/) installed for this step (see the \"Downloads\" section of the main page)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is where the an4/ directory will be placed.\n", + "# Change this if you don't want the data to be extracted in the current directory.\n", + "data_dir = '.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import subprocess\n", + "import tarfile\n", + "import wget\n", + "\n", + "# Download the dataset. This will take a few moments...\n", + "print(\"******\")\n", + "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", + " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_path = wget.download(an4_url, data_dir)\n", + " print(f\"Dataset downloaded at: {an4_path}\")\n", + "else:\n", + " print(\"Tarfile already exists.\")\n", + " an4_path = data_dir + '/an4_sphere.tar.gz'\n", + "\n", + "# Untar and convert .sph to .wav (using sox)\n", + "tar = tarfile.open(an4_path)\n", + "tar.extractall(path=data_dir)\n", + "\n", + "print(\"Converting .sph to .wav...\")\n", + "sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)\n", + "for sph_path in sph_list:\n", + " wav_path = sph_path[:-4] + '.wav'\n", + " cmd = [\"sox\", sph_path, wav_path]\n", + " subprocess.run(cmd)\n", + "print(\"Finished conversion.\\n******\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the path to manifest files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset_basedir = os.path.join(data_dir, 'an4')\n", + "\n", + "train_dataset = os.path.join(dataset_basedir, 'train_manifest.json')\n", + "test_dataset = os.path.join(dataset_basedir, 'test_manifest.json')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read a few rows of the manifest file \n", + "\n", + "Manifest files are the data structure used by NeMo to declare a few important details about the data :\n", + "\n", + "1) `audio_filepath`: Refers to the path to the raw audio file
\n", + "2) `text`: The text transcript of this sample
\n", + "3) `duration`: The length of the audio file, in seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!head -n 5 {train_dataset}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Augmentation Pipeline\n", + "\n", + "Constructing a data augmentation pipeline in NeMo is as simple as composing a nested dictionary that describes two things :\n", + "\n", + "1) The probability of that augmentation occuring - using the `prob` keyword
\n", + "2) The keyword arguments required by that augmentation class\n", + "\n", + "Below, we show a few samples of these augmentations. Note, in order to distinguish between the original sample and the perturbed sample, we exaggerate the perturbation strength significantly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import librosa\n", + "import json\n", + "import IPython.display as ipd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Audio file preparation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the data augmentation component from ASR collection\n", + "from nemo.collections.asr.parts import perturb, segment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets see the available perturbations\n", + "perturb.perturbation_types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Obtain a baseline audio file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath = librosa.util.example_audio_file()\n", + "sample, sr = librosa.core.load(filepath)\n", + "\n", + "ipd.Audio(sample, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert to WAV format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "\n", + "# lets convert this ogg file into a wave to be compatible with NeMo\n", + "if not os.path.exists('./media'):\n", + " os.makedirs('./media/')\n", + " \n", + "filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", + "filepath = os.path.join('media', filename)\n", + "\n", + "sf.write(filepath, sample, samplerate=sr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample, sr = librosa.core.load(filepath)\n", + "ipd.Audio(sample, rate=sr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NeMo has its own support class for loading wav files\n", + "def load_audio() -> segment.AudioSegment:\n", + " filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", + " filepath = os.path.join('media', filename)\n", + " sample_segment = segment.AudioSegment.from_file(filepath, target_sr=sr)\n", + " return sample_segment\n", + "\n", + "sample_segment = load_audio()\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## White Noise Perturbation\n", + "\n", + "White Noise perturbation is performed by the following steps :
\n", + "1) Randomly sample the amplitude of the noise from a uniformly distributed range (defined in dB)
\n", + "2) Sample gaussian noise (mean = 0, std = 1) with same length as audio signal
\n", + "3) Scale this gaussian noise by the amplitude (in dB scale)
\n", + "4) Add this noise vector to the original sample\n", + "\n", + "Notably, the original signal should not have a \"hissing sound\" constantly present in the perturbed version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "white_noise = perturb.WhiteNoisePerturbation(min_level=-50, max_level=-30)\n", + "\n", + "# Perturb the audio file\n", + "sample_segment = load_audio()\n", + "white_noise.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gain Perturbation\n", + "\n", + "Gain perturbation is performed by the following steps :
\n", + "1) Randomly sample the gain factor of the signal from a uniformly distributed range (defined in dB)
\n", + "2) Scale this original signal by the gain factor (in dB scale)
\n", + "\n", + "Notably, the tone of the original audio should sound slightly different as compared to the gain perturbed sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gain = perturb.GainPerturbation(min_gain_dbfs=25, max_gain_dbfs=50)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_audio()\n", + "gain.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Shift Perturbation\n", + "\n", + "Shift perturbation is performed by the following steps :
\n", + "1) Randomly sample the shift factor of the signal from a uniformly distributed range (defined in milliseconds)
\n", + "2) Depending on the sign of the shift, we shift the original signal to the left or the right.
\n", + "3) The boundary locations are filled with zeros after the shift of the signal
\n", + "\n", + "Notably, the perturbed signal below skips the first 25 to 50 seconds of the original audio below, and the remainder of the time is simply silence. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shift = perturb.ShiftPerturbation(min_shift_ms=25000.0, max_shift_ms=50000.0)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_audio()\n", + "shift.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=sr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Dependent Perturbations\n", + "\n", + "Some perturbations require an external data source in order to perturb the original sample. Noise Perturbation is a perfect example of one such augmentation that requires an external noise source dataset in order to pertur the original data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets prepare a manifest file using the baseline file itself, cut into 1 second segments\n", + "\n", + "def write_manifest(filepath, data_dir='./media/', manifest_name='noise_manifest', duration_max=None, duration_stride=1.0, filter_long=False, duration_limit=10.0):\n", + " if duration_max is None:\n", + " duration_max = 1e9\n", + " \n", + " with open(os.path.join(data_dir, manifest_name + '.json'), 'w') as fout:\n", + " \n", + " try:\n", + " x, _sr = librosa.load(filepath)\n", + " duration = librosa.get_duration(x, sr=_sr)\n", + "\n", + " except Exception:\n", + " print(f\"\\n>>>>>>>>> WARNING: Librosa failed to load file {filepath}. Skipping this file !\\n\")\n", + " return\n", + "\n", + " if filter_long and duration > duration_limit:\n", + " print(f\"Skipping sound sample {filepath}, exceeds duration limit of {duration_limit}\")\n", + " return\n", + "\n", + " offsets = []\n", + " durations = []\n", + "\n", + " if duration > duration_max:\n", + " current_offset = 0.0\n", + "\n", + " while current_offset < duration:\n", + " difference = duration - current_offset\n", + " segment_duration = min(duration_max, difference)\n", + "\n", + " offsets.append(current_offset)\n", + " durations.append(segment_duration)\n", + "\n", + " current_offset += duration_stride\n", + "\n", + " else:\n", + " offsets.append(0.0)\n", + " durations.append(duration)\n", + "\n", + "\n", + " for duration, offset in zip(durations, offsets):\n", + " metadata = {\n", + " 'audio_filepath': filepath,\n", + " 'duration': duration,\n", + " 'label': 'noise',\n", + " 'text': '_', # for compatibility with ASRAudioText collection\n", + " 'offset': offset,\n", + " }\n", + "\n", + " json.dump(metadata, fout)\n", + " fout.write('\\n')\n", + " fout.flush()\n", + "\n", + " print(f\"Wrote {len(durations)} segments for filename {filename}\")\n", + " \n", + " print(\"Finished preparing manifest !\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filename = 'Kevin_MacLeod_-_Vibe_Ace.wav'\n", + "filepath = os.path.join('media', filename)\n", + "\n", + "# Write a \"noise\" manifest file\n", + "write_manifest(filepath, manifest_name='noise_1s', duration_max=1.0, duration_stride=1.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets read this noise manifest file\n", + "noise_manifest_path = os.path.join('media', 'noise_1s.json')\n", + "\n", + "!head -n 5 {noise_manifest_path}" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets create a helper method to load the first file in the train dataset of AN4\n", + "# Load the first sample in the manifest\n", + "def load_gsc_sample() -> segment.AudioSegment:\n", + " with open(train_dataset, 'r') as f:\n", + " line = f.readline()\n", + " \n", + " line = json.loads(line)\n", + " gsc_filepath = line['audio_filepath']\n", + " sample_segment = segment.AudioSegment.from_file(gsc_filepath)\n", + " return sample_segment\n", + "\n", + "gsc_sample_segment = load_gsc_sample()\n", + "ipd.Audio(gsc_sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Noise Augmentation\n", + "\n", + "Noise perturbation is performed by the following steps :
\n", + "1) Randomly sample the amplitude scale of the noise sample from a uniformly distributed range (defined in dB)
\n", + "2) Randomly choose an audio clip from the set of noise audio samples available
\n", + "3) Compute the gain (in dB) required for the noise clip as compared to the original sample and scale the noise by this factor
\n", + "4) If the noise snippet is of shorter duration than the original audio, then randomly select an index in time from the original sample, where the noise snippet will be added
\n", + "5) If instead the noise snippet is longer than the duration of the original audio, then randomly subsegment the noise snippet and add the full snippet to the original audio
\n", + "\n", + "Notably, the noise perturbed sample should sound as if there are two sounds playing at the same time (overlapping audio) as compared to the original signal. The magnitude of the noise will be dependent on step (3) and the location where the noise is added will depend on steps (4) and (5)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rng = random.Random(0)\n", + "noise = perturb.NoisePerturbation(manifest_path=noise_manifest_path,\n", + " min_snr_db=-10, max_snr_db=-10,\n", + " max_gain_db=300.0, rng=rng)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_gsc_sample()\n", + "noise.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Speed Perturbation\n", + "\n", + "Speed perturbation changes the speed of the speech, but does not preserve pitch of the sound. Try a few random augmentations to see how the pitch changes with change in duration of the audio file.\n", + "\n", + "**Note**: This is a very slow augmentation and is not advised to perform online augmentation for large datasets as it can dramatically increase training time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resample_type = 'kaiser_best' # Can be ['kaiser_best', 'kaiser_fast', 'fft', 'scipy']\n", + "speed = perturb.SpeedPerturbation(sr, resample_type, min_speed_rate=0.5, max_speed_rate=2.0, num_rates=-1)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_gsc_sample()\n", + "speed.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time Stretch Perturbation\n", + "\n", + "Time Stretch perturbation changes the speed of the speech, and also preserve pitch of the sound. \n", + "Try a few random augmentations to see how the pitch remains close to the same with change in duration of the audio file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Note about speed optimizations\n", + "\n", + "Time stretch is a costly augmentation, and can easily cause training time to increase drastically. 
It is suggested that one installs the `numba` library using conda to use a more optimized augmentation kernel.\n", + "\n", + "```python\n", + "conda install numba\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "time_stretch = perturb.TimeStretchPerturbation(min_speed_rate=0.5, max_speed_rate=2.0, num_rates=-1)\n", + "\n", + "# Perturb the audio file \n", + "sample_segment = load_gsc_sample()\n", + "time_stretch.perturb(sample_segment)\n", + "\n", + "ipd.Audio(sample_segment.samples, rate=16000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Augmentation Pipeline\n", + "\n", + "The augmentation pipeline can be constructed in multiple ways, either explicitly by instantiating the objects of these perturbations or implicitly by providing the arguments to these augmentations as a nested dictionary.\n", + "\n", + "We will show both approaches in the following sections" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicit definition" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the data augmentation component from ASR collection\n", + "from nemo.collections.asr.parts import perturb, segment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Instantiate the perturbations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "perturbations = [\n", + " perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46),\n", + " perturb.GainPerturbation(min_gain_dbfs=0, max_gain_dbfs=50),\n", + " perturb.NoisePerturbation(manifest_path=noise_manifest_path,\n", + " min_snr_db=0, max_snr_db=50, max_gain_db=300.0)\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select chance of perturbations being applied" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "probas = [1.0, 1.0, 0.5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare the audio augmentation object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "augmentations = list(zip(probas, perturbations))\n", + "\n", + "audio_augmentations = perturb.AudioAugmentor(augmentations)\n", + "audio_augmentations._pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implicit definition" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "perturb.perturbation_types # Available perturbations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare the nested dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "audio_augmentations = dict(\n", + " white_noise = dict(\n", + " prob=1.0,\n", + " min_level=-90,\n", + " max_level=-46\n", + " ),\n", + " gain = dict(\n", + " prob=1.0,\n", + " min_gain_dbfs=0,\n", + " max_gain_dbfs=50\n", + " ),\n", + " noise = dict(\n", + " prob=0.5,\n", + " manifest_path=noise_manifest_path,\n", + " min_snr_db=0,\n", + " max_snr_db=50,\n", + " max_gain_db=300.0\n", + " )\n", + ")\n", + "\n", + "audio_augmentations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Supply `audio_augmentations` as an 
argument to AudioToTextDataLayer or AudioToSpeechLabelDataLayer\n", + "\n", + "Both of these data layers accept an optional keyword argument `augmentor`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections import asr as nemo_asr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "?nemo_asr.AudioToTextDataLayer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "?nemo_asr.AudioToSpeechLabelDataLayer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training - Application of augmentations\n", + "\n", + "We will describe the data loaders for a MatchboxNet model from the paper \"[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)\". The benefit of MatchboxNet over Jasper models is that it uses separable convolutions, which greatly reduce the number of parameters required to reach good model accuracy.\n", + "\n", + "Care must be taken not to apply augmentations to the test set.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's load the config file for the QuartzNet 3x1 model\n", + "# Here we will be using separable convolutions\n", + "# with 3 blocks (k=3 repeated once r=1 from the picture above)\n", + "yaml = YAML(typ=\"safe\")\n", + "with open(\"configs/quartznet_speech_commands_3x1_v1.yaml\") as f:\n", + " jasper_params = yaml.load(f)\n", + "\n", + "# Pre-define a set of labels that this model must learn to predict\n", + "labels = jasper_params['labels']\n", + "\n", + "# Get the sampling rate of the data\n", + "sample_rate = jasper_params['sample_rate']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import NeMo core functionality\n", + "# NeMo's \"core\" package\n", + "import nemo\n", + "# NeMo's ASR collection\n", + "import nemo.collections.asr as nemo_asr\n", + "# NeMo's learning rate policy\n", + "from nemo.utils.lr_policies import CosineAnnealing\n", + "from nemo.collections.asr.helpers import (\n", + " monitor_classification_training_progress,\n", + " process_classification_evaluation_batch,\n", + " process_classification_evaluation_epoch,\n", + ")\n", + "from nemo.collections.asr.metrics import classification_accuracy\n", + "\n", + "logging = nemo.logging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the NeMo components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a Neural Factory\n", + "# It creates log files and tensorboard writers for us, among other functions\n", + "neural_factory = nemo.core.NeuralModuleFactory(\n", + " log_dir='./{0}/quartznet-3x1-v1'.format(dataset_basedir),\n", + " create_tb_writer=True)\n", + "tb_writer = neural_factory.tb_writer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Provide the augmentations to the training AudioToSpeechLabelDataLayer or AudioToTextDataLayer\n", + "\n", + "That's it! Now your training samples will be augmented during training!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build the input data layer and the preprocessing layers for the train set\n", + "train_data_layer = nemo_asr.AudioToTextDataLayer(\n", + " manifest_filepath=train_dataset,\n", + " labels=labels,\n", + " sample_rate=sample_rate,\n", + " batch_size=32,\n", + " num_workers=os.cpu_count(),\n", + " shuffle=True,\n", + " augmentor=audio_augmentations, # Add your augmentations (implicit or explicit)\n", + ")\n", + "\n", + "# Build the input data layer and the preprocessing layers for the test set\n", + "eval_data_layer = nemo_asr.AudioToTextDataLayer(\n", + " manifest_filepath=test_dataset,\n", + " sample_rate=sample_rate,\n", + " labels=labels,\n", + " batch_size=32,\n", + " num_workers=os.cpu_count(),\n", + " shuffle=False, \n", + " augmentor=None # Make sure not to add augmentations to the test set!\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom Perturbations\n", + "\n", + "We can define and use custom perturbations as required simply by extending the `Perturbation` class. \n", + "\n", + "Let's look at how we can build a custom noise perturbation that we can use to evaluate the effect of noise at inference time, in order to analyse the model's robustness to noise.\n", + "\n", + "In evaluation mode, we want to set an explicit value for the `snr_db` parameter instead of uniformly sampling it from a range. This allows us to control the signal-to-noise ratio without relying on randomness from the training implementation of `NoisePerturbation`.\n", + "\n", + "Further, we force a random seed in order to produce reproducible results on the evaluation set.\n", + "\n", + "With this combination, we can easily evaluate each sample in the test set `S` times (`S` being the number of random seeds), and can evaluate each of these samples at `D` levels of Signal to Noise Ratio (in dB). 
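A rough sketch of such an evaluation sweep is shown below; it is only an illustration (the SNR values and the number of seeds are arbitrary assumptions), and it relies on the `NoisePerturbationEval` class defined in the next cell together with the `AudioAugmentor` pattern from earlier in this notebook:\n", + "\n", + "```python\n", + "# Sketch only: evaluate at D SNR levels, each with S random seeds\n", + "for snr_db in [0, 10, 20, 30]:  # D SNR levels in dB\n", + "    for seed in range(3):  # S random seeds\n", + "        pert = NoisePerturbationEval(manifest_path=noise_manifest_path, snr_db=snr_db, seed=seed)\n", + "        augmentor = perturb.AudioAugmentor([(1.0, pert)])\n", + "        # Build the evaluation data layer with augmentor=augmentor and run inference\n", + "```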
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We use a NeMo utility to parse the manifest file for us\n", + "from nemo.collections.asr.parts import collections, parsers\n", + "\n", + "class NoisePerturbationEval(perturb.Perturbation):\n", + " def __init__(\n", + " self, manifest_path=None, snr_db=40, max_gain_db=300.0, seed=None,\n", + " ):\n", + " seed = seed if seed is not None else 0\n", + " self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]))\n", + " self._snr_db = snr_db\n", + " self._max_gain_db = max_gain_db\n", + " self._rng = random.Random(seed)\n", + " \n", + " # This is mostly obtained from the original NoisePerturbation class itself\n", + " def perturb(self, data):\n", + " snr_db = self._snr_db\n", + " noise_record = self._rng.sample(self._manifest.data, 1)[0]\n", + " noise = AudioSegment.from_file(noise_record.audio_file, target_sr=data.sample_rate)\n", + " noise_gain_db = min(data.rms_db - noise.rms_db - snr_db, self._max_gain_db)\n", + "\n", + " # calculate noise segment to use\n", + " start_time = 0.0\n", + " if noise.duration > (start_time + data.duration):\n", + " noise.subsegment(start_time=start_time, end_time=start_time + data.duration)\n", + "\n", + " # adjust gain for snr purposes and superimpose\n", + " noise.gain_db(noise_gain_db)\n", + "\n", + " if noise._samples.shape[0] < data._samples.shape[0]:\n", + " noise_idx = data._samples.shape[0] // 2 # midpoint of audio\n", + " while (noise_idx + noise._samples.shape[0]) > data._samples.shape[0]:\n", + " noise_idx = noise_idx // 2 # half the initial starting point\n", + "\n", + " data._samples[noise_idx: noise_idx + noise._samples.shape[0]] += noise._samples\n", + "\n", + " else:\n", + " data._samples += noise._samples\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Registering augmentations\n", + "\n", + "We can use either approach to submit this test time augmentation to the Data Loaders.\n", + "\n", + "In order to obtain the convenience of the implicit method, we must register this augmentation into NeMo's directory of available augmentations. This can be done as follows -" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "perturb.register_perturbation(name='noise_eval', perturbation=NoisePerturbationEval)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets check the registry of allowed perturbations !\n", + "perturb.perturbation_types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Note\n", + "\n", + "It is not allowed to overwrite already registered perturbations using the `perturb.register_perturbation` method. 
It will raise a `ValueError` in order to prevent overwriting the pre-existing perturbation types" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.6 64-bit ('NeMo': conda)", + "language": "python", + "name": "python37664bitnemoconda43f94a748a2e4953b0129556ecdf4f62" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py index 9ec19b766ce8..83a153a632b5 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py +++ b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py @@ -83,7 +83,6 @@ parser = argparse.ArgumentParser(description="GLUE_with_pretrained_BERT") -# Parsing arguments parser.add_argument( "--data_dir", default="COLA", @@ -104,7 +103,7 @@ default="bert-base-uncased", type=str, help="Name of the pre-trained model", - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) parser.add_argument("--bert_checkpoint", default=None, type=str, help="Path to model checkpoint") parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") @@ -121,6 +120,13 @@ choices=["nemobert", "sentencepiece"], help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. 
" + + "Only applicable when tokenizer is build with vocab file", +) parser.add_argument( "--max_seq_length", default=128, @@ -147,6 +153,7 @@ type=str, help="The output directory where the model predictions and checkpoints will be written.", ) +parser.add_argument("--no_time_to_log_dir", action="store_true", help="whether to add time to work_dir or not") parser.add_argument( "--save_epoch_freq", default=1, @@ -164,6 +171,13 @@ "--no_data_cache", action="store_true", help="When specified do not load and store cache preprocessed data.", ) parser.add_argument("--no_shuffle_data", action="store_false", dest="shuffle_data") +parser.add_argument( + "--wandb_project", default=None, type=str, help='Project name for tracking with Weights and Biases' +) +parser.add_argument( + "--wandb_experiment", default=None, type=str, help='Experiment name for tracking with Weights and Biases' +) + args = parser.parse_args() if not os.path.exists(args.data_dir): @@ -195,22 +209,26 @@ log_dir=args.work_dir, create_tb_writer=True, files_to_copy=[__file__], - add_time_to_log_dir=True, + add_time_to_log_dir=not args.no_time_to_log_dir, ) logging.info(f"{args}") -model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, + config=args.bert_config, + vocab=args.vocab_file, + checkpoint=args.bert_checkpoint, ) tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) - hidden_size = model.hidden_size # uses [CLS] token for classification (the first token) @@ -221,10 +239,6 @@ pooler = SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) glue_loss = CrossEntropyLossNM() -if args.bert_checkpoint is not None: - model.restore_from(args.bert_checkpoint) - logging.info(f"model restored from {args.bert_checkpoint}") - def create_pipeline( max_seq_length=args.max_seq_length, @@ -281,6 +295,8 @@ def create_pipeline( user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[0]), tb_writer=nf.tb_writer, eval_step=steps_per_epoch, + wandb_name=args.wandb_experiment, + wandb_project=args.wandb_project, ) ] @@ -313,14 +329,27 @@ def create_pipeline( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) +callbacks = [callback_train, ckpt_callback] + callbacks_eval + +if args.wandb_project and args.wandb_experiment: + wand_callback = nemo.core.WandbCallback( + train_tensors=[train_loss], + wandb_name=args.wandb_experiment, + wandb_project=args.wandb_project, + update_freq=args.loss_step_freq if args.loss_step_freq > 0 else steps_per_epoch, + args=args, + ) + callbacks.append(wand_callback) + lr_policy_fn = get_lr_policy( args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) + nf.train( tensors_to_optimize=[train_loss], - callbacks=[callback_train, ckpt_callback] + callbacks_eval, + callbacks=callbacks, lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, "lr": args.lr}, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git 
a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py index e8110313de40..c2d38b8e3c0f 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py @@ -48,10 +48,35 @@ def get_preds(logits): parser.add_argument("--checkpoint_dir", required=True, help="your checkpoint folder", type=str) parser.add_argument("--data_dir", default='data/atis', type=str) parser.add_argument("--eval_file_prefix", default='test', type=str) -parser.add_argument("--pretrained_model_name", default="bert-base-uncased", type=str) -parser.add_argument("--bert_config", default=None, type=str) +parser.add_argument( + "--pretrained_model_name", + default="bert-base-uncased", + type=str, + help="Name of the pre-trained model", + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), +) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", +) +parser.add_argument( + "--tokenizer_model", + default=None, + type=str, + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", +) +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. " + + "Only applicable when tokenizer is build with vocab file", +) parser.add_argument("--batch_size", default=128, type=int) -parser.add_argument("--do_lower_case", action='store_false') parser.add_argument("--max_seq_length", default=64, type=int) parser.add_argument("--local_rank", default=None, type=int) @@ -62,10 +87,17 @@ def get_preds(logits): nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank) -pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +pretrained_bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, config=args.bert_config, vocab=args.vocab_file +) + +tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( + tokenizer_name=args.tokenizer, + pretrained_model_name=args.pretrained_model_name, + tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) -tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_model=args.pretrained_model_name) hidden_size = pretrained_bert_model.hidden_size diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py index b3b0bcc131ca..1b1ae2c423ae 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py @@ -32,19 +32,51 @@ parser.add_argument("--query", required=True, type=str) parser.add_argument("--data_dir", default='data/atis', type=str) parser.add_argument("--checkpoint_dir", required=True, help="path to your checkpoint folder", type=str) -parser.add_argument("--pretrained_model_name", default="bert-base-uncased", type=str) 
-parser.add_argument("--bert_config", default=None, type=str) -parser.add_argument("--do_lower_case", action='store_false') +parser.add_argument( + "--pretrained_model_name", + default="bert-base-uncased", + type=str, + help="Name of the pre-trained model", + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), +) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", +) +parser.add_argument( + "--tokenizer_model", + default=None, + type=str, + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", +) +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. " + + "Only applicable when tokenizer is build with vocab file", +) parser.add_argument("--max_seq_length", default=64, type=int) args = parser.parse_args() nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch) -pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +pretrained_bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, config=args.bert_config, vocab=args.vocab_file +) + +tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( + tokenizer_name=args.tokenizer, + pretrained_model_name=args.pretrained_model_name, + tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) -tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_model=args.pretrained_model_name) hidden_size = pretrained_bert_model.hidden_size diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py index 1635308b837c..dbb6b350eb21 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py @@ -48,10 +48,26 @@ default='bert-base-uncased', type=str, help='Name of the pre-trained model for the encoder', - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) -parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument("--bert_config", default=None, type=str) +parser.add_argument("--bert_checkpoint", default=None, type=str, help="Path to pretrained bert model") +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") + +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", +) +parser.add_argument( + "--tokenizer_model", + default=None, + type=str, + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", +) + parser.add_argument("--train_file_prefix", default='train', type=str) parser.add_argument("--eval_file_prefix", default='test', type=str) @@ -70,7 +86,12 @@ 
parser.add_argument("--intent_loss_weight", default=0.6, type=float) parser.add_argument("--class_balancing", default="regular", type=str, choices=["regular", "weighted_loss"]) -parser.add_argument("--do_lower_case", action='store_true') +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. " + + "For tokenizer only applicable when tokenizer is build with vocab file", +) parser.add_argument( "--no_shuffle_data", action='store_false', dest="shuffle_data", help="Shuffle is enabled by default." ) @@ -101,16 +122,22 @@ add_time_to_log_dir=True, ) -pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, + config=args.bert_config, + vocab=args.vocab_file, + checkpoint=args.bert_checkpoint, ) -tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_model=args.pretrained_model_name) -if args.bert_checkpoint: - pretrained_bert_model.restore_from(args.bert_checkpoint) - logging.info(f"Model restored from {args.bert_checkpoint}") +tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( + tokenizer_name=args.tokenizer, + pretrained_model_name=args.pretrained_model_name, + tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, +) -hidden_size = pretrained_bert_model.hidden_size +hidden_size = model.hidden_size data_desc = JointIntentSlotDataDesc( data_dir=args.data_dir, none_slot_label=args.none_slot_label, pad_label=args.pad_label @@ -165,7 +192,7 @@ def create_pipeline(num_samples=-1, batch_size=32, data_prefix='train', is_train steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) logging.info(f"Steps_per_epoch = {steps_per_epoch}") - hidden_states = pretrained_bert_model( + hidden_states = model( input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask ) diff --git a/examples/nlp/question_answering/question_answering_squad.py b/examples/nlp/question_answering/question_answering_squad.py index db7ab14968d6..74f0f6d09208 100755 --- a/examples/nlp/question_answering/question_answering_squad.py +++ b/examples/nlp/question_answering/question_answering_squad.py @@ -116,7 +116,7 @@ def parse_args(): default='roberta-base', type=str, help='Name of the pre-trained model', - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) parser.add_argument("--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.") parser.add_argument( @@ -140,6 +140,7 @@ def parse_args(): help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) parser.add_argument("--optimizer", default="adam_w", type=str, help="Optimizer kind") + parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--lr", default=3e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.0, type=float) @@ -338,24 +339,27 @@ def create_pipeline( add_time_to_log_dir=False, ) - model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name + model = 
nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, + config=args.bert_config, + vocab=args.vocab_file, + checkpoint=args.bert_checkpoint, ) - hidden_size = model.hidden_size - tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) + hidden_size = model.hidden_size + qa_head = nemo_nlp.nm.trainables.TokenClassifier( hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False ) squad_loss = nemo_nlp.nm.losses.SpanningLoss() - if args.bert_checkpoint is not None: - model.restore_from(args.bert_checkpoint) if args.head_checkpoint is not None: qa_head.restore_from(args.head_checkpoint) diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index 9ce1bceb7cc9..4f4aeaf58b6b 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -33,10 +33,10 @@ default='roberta-base', type=str, help='Name of the pre-trained model', - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument("--bert_config", default=None, type=str) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer", default="nemobert", @@ -51,6 +51,12 @@ type=str, help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. " + + "For tokenizer only applicable when tokenizer is build with vocab file.", +) parser.add_argument("--batch_size", default=32, type=int) parser.add_argument("--max_seq_length", default=36, type=int) parser.add_argument("--num_gpus", default=1, type=int) @@ -70,12 +76,10 @@ ) parser.add_argument("--train_file_prefix", default='train', type=str) parser.add_argument("--eval_file_prefix", default='dev', type=str) -parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--class_balancing", default="None", type=str, choices=["None", "weighted_loss"]) parser.add_argument( "--no_shuffle_data", action='store_false', dest="shuffle_data", help="Shuffle is enabled by default." 
) - parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=-1, type=int) parser.add_argument('--loss_step_freq', default=25, type=int, help='Frequency of printing loss') @@ -94,12 +98,13 @@ add_time_to_log_dir=True, ) -model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, + config=args.bert_config, + vocab=args.vocab_file, + checkpoint=args.bert_checkpoint, ) -hidden_size = model.hidden_size - tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, @@ -108,6 +113,8 @@ do_lower_case=args.do_lower_case, ) +hidden_size = model.hidden_size + data_desc = TextClassificationDataDesc(data_dir=args.data_dir, modes=[args.train_file_prefix, args.eval_file_prefix]) # Create sentence classification loss on top @@ -119,9 +126,6 @@ log_softmax=False, ) -if args.bert_checkpoint: - model.restore_from(args.bert_checkpoint) - logging.info(f"model restored from {args.bert_checkpoint}") if args.class_balancing == 'weighted_loss': # You may need to increase the number of epochs for convergence. diff --git a/examples/nlp/token_classification/NERWithBERT.ipynb b/examples/nlp/token_classification/NERWithBERT.ipynb index 96b58ed67338..e2da6723e760 100644 --- a/examples/nlp/token_classification/NERWithBERT.ipynb +++ b/examples/nlp/token_classification/NERWithBERT.ipynb @@ -80,10 +80,14 @@ "outputs": [], "source": [ "# If you're using a standard BERT model, you should do it like this. To see the full\n", - "# list of BERT/ALBERT/RoBERTa model names, call nemo_nlp.nm.trainables.get_bert_models_list()\n", + "# list of MegatronBERT/BERT/ALBERT/RoBERTa model names, call nemo_nlp.nm.trainables.get_pretrained_lm_models_list()\n", "\n", - "tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n", - "bert_model = nemo_nlp.nm.trainables.get_huggingface_model(pretrained_model_name=PRETRAINED_BERT_MODEL)" + "bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model(\n", + " pretrained_model_name=PRETRAINED_BERT_MODEL)\n", + "\n", + "tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(\n", + " tokenizer_name=\"nemobert\",\n", + " pretrained_model_name=PRETRAINED_BERT_MODEL)" ] }, { @@ -234,4 +238,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/nlp/token_classification/PunctuationWithBERT.ipynb b/examples/nlp/token_classification/PunctuationWithBERT.ipynb index 1a0734156a7d..77a62e563b70 100644 --- a/examples/nlp/token_classification/PunctuationWithBERT.ipynb +++ b/examples/nlp/token_classification/PunctuationWithBERT.ipynb @@ -120,10 +120,14 @@ "outputs": [], "source": [ "# If you're using a standard BERT model, you should do it like this. 
To see the full\n", - "# list of BERT/ALBERT/RoBERTa model names, call nemo_nlp.nm.trainables.get_bert_models_list()\n", + "# list of MegatronBERT/BERT/ALBERT/RoBERTa model names, call nemo_nlp.nm.trainables.get_pretrained_lm_models_list()\n", "\n", - "tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n", - "bert_model = nemo_nlp.nm.trainables.get_huggingface_model(pretrained_model_name=PRETRAINED_BERT_MODEL)" + "bert_model = nemo_nlp.nm.trainables.get_pretrained_lm_model(\n", + " pretrained_model_name=PRETRAINED_BERT_MODEL)\n", + "\n", + "tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(\n", + " tokenizer_name=\"nemobert\",\n", + " pretrained_model_name=PRETRAINED_BERT_MODEL)" ] }, { @@ -494,4 +498,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/nlp/token_classification/punctuation_capitalization.py b/examples/nlp/token_classification/punctuation_capitalization.py index 9557c6e070d3..549c60486cd9 100644 --- a/examples/nlp/token_classification/punctuation_capitalization.py +++ b/examples/nlp/token_classification/punctuation_capitalization.py @@ -58,7 +58,7 @@ default="bert-base-uncased", type=str, help="Name of the pre-trained model", - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) parser.add_argument("--bert_checkpoint", default=None, type=str) parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") @@ -79,6 +79,15 @@ help="tokenizer to use, \ only relevant when using custom pretrained checkpoint.", ) +parser.add_argument( + "--vocab_file", default=None, help="Path to the vocab file. Required for pretrained Megatron models" +) +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. 
" + + "Only applicable when tokenizer is build with vocab file", +) parser.add_argument( "--work_dir", default='output', @@ -128,20 +137,21 @@ output_file = f'{nf.work_dir}/output.txt' -model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, + config=args.bert_config, + vocab=args.vocab_file, + checkpoint=args.bert_checkpoint, ) tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) -if args.bert_checkpoint is not None: - model.restore_from(args.bert_checkpoint) - logging.info(f"model restored from {args.bert_checkpoint}") - hidden_size = model.hidden_size @@ -272,6 +282,7 @@ def create_pipeline( tensors=losses + train_logits, print_func=lambda x: logging.info("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], + step_freq=args.loss_step_freq, tb_writer=nf.tb_writer, ) @@ -298,5 +309,5 @@ def create_pipeline( callbacks=[train_callback, eval_callback, ckpt_callback], lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, "lr": args.lr}, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git a/examples/nlp/token_classification/punctuation_capitalization_infer.py b/examples/nlp/token_classification/punctuation_capitalization_infer.py index 734b91ce5413..01783ee22c60 100644 --- a/examples/nlp/token_classification/punctuation_capitalization_infer.py +++ b/examples/nlp/token_classification/punctuation_capitalization_infer.py @@ -34,7 +34,7 @@ default="bert-base-uncased", type=str, help="Name of the pre-trained model", - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( @@ -50,6 +50,13 @@ choices=["nemobert", "sentencepiece"], help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) +parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. 
" + + "Only applicable when tokenizer is build with vocab file", +) parser.add_argument("--none_label", default='O', type=str) parser.add_argument( "--queries", @@ -104,20 +111,19 @@ capit_labels_dict = get_vocab(args.capit_labels_dict) -""" Load the pretrained BERT parameters -See the list of pretrained models, call: -nemo.collections.nlp.BERT.list_pretrained_models() -""" -pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, config=args.bert_config, vocab=args.vocab_file ) tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) -hidden_size = pretrained_bert_model.hidden_size + +hidden_size = model.hidden_size data_layer = BertTokenClassificationInferDataLayer( queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 @@ -136,7 +142,7 @@ input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) +hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) punct_logits = punct_classifier(hidden_states=hidden_states) capit_logits = capit_classifier(hidden_states=hidden_states) diff --git a/examples/nlp/token_classification/token_classification.py b/examples/nlp/token_classification/token_classification.py index 15fd505b252e..2a04b11332f8 100644 --- a/examples/nlp/token_classification/token_classification.py +++ b/examples/nlp/token_classification/token_classification.py @@ -33,43 +33,59 @@ from nemo.utils.lr_policies import get_lr_policy # Parsing arguments +"""Provide extra arguments required for tasks.""" parser = argparse.ArgumentParser(description="Token classification with pretrained BERT") parser.add_argument("--local_rank", default=None, type=int) -parser.add_argument("--batch_size", default=8, type=int) -parser.add_argument("--max_seq_length", default=128, type=int) + +# training arguments +parser.add_argument( + "--work_dir", + default='output', + type=str, + help="The output directory where the model prediction and checkpoints will be written.", +) +parser.add_argument("--no_time_to_log_dir", action="store_true", help="whether to add time to work_dir or not") parser.add_argument("--num_gpus", default=1, type=int) parser.add_argument("--num_epochs", default=5, type=int) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) +parser.add_argument( + "--save_epoch_freq", + default=1, + type=int, + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", +) +parser.add_argument( + "--save_step_freq", + default=-1, + type=int, + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", +) +parser.add_argument("--eval_step_freq", default=100, type=int, help="Frequency of evaluation") +parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") +parser.add_argument("--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss") + +# learning rate arguments parser.add_argument("--lr_warmup_proportion", default=0.1, type=float) 
parser.add_argument("--lr", default=5e-5, type=float) parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--weight_decay", default=0.01, type=float) parser.add_argument("--optimizer_kind", default="adam", type=str) -parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) -parser.add_argument("--data_dir", default="/data", type=str) + +# task specific arguments parser.add_argument("--fc_dropout", default=0.5, type=float) parser.add_argument("--num_fc_layers", default=2, type=int) + +# data arguments +parser.add_argument("--data_dir", default="/data", type=str) +parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--ignore_start_end", action='store_false') parser.add_argument("--ignore_extra_tokens", action='store_false') parser.add_argument("--none_label", default='O', type=str) parser.add_argument("--mode", default='train_eval', choices=["train_eval", "train"], type=str) parser.add_argument("--no_shuffle_data", action='store_false', dest="shuffle_data") -parser.add_argument("--no_time_to_log_dir", action="store_true", help="whether to add time to work_dir or not") +parser.add_argument("--use_cache", action='store_true', help="Whether to cache preprocessed data") +parser.add_argument("--batch_size", default=8, type=int, help="Batch size") parser.add_argument("--batches_per_step", default=1, type=int, help="Number of iterations per step.") -parser.add_argument( - "--pretrained_model_name", - default="bert-base-uncased", - type=str, - help="Name of the pre-trained model", - choices=nemo_nlp.nm.trainables.get_bert_models_list(), -) -parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") -parser.add_argument( - "--tokenizer_model", - default=None, - type=str, - help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", -) parser.add_argument( "--tokenizer", default="nemobert", @@ -77,32 +93,36 @@ choices=["nemobert", "sentencepiece"], help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) -parser.add_argument("--vocab_file", default=None, help="Path to the vocab file.") -parser.add_argument("--do_lower_case", action='store_true') parser.add_argument( - "--work_dir", - default='output', + "--vocab_file", default=None, help="Path to the vocab file. Required for pretrained Megatron models" +) +parser.add_argument( + "--tokenizer_model", + default=None, type=str, - help="The output directory where the model prediction and checkpoints will be written.", + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) -parser.add_argument("--use_cache", action='store_true', help="Whether to cache preprocessed data") parser.add_argument( - "--save_epoch_freq", - default=1, - type=int, - help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. 
" + + "Only applicable when tokenizer is build with vocab file", ) + +# model arguments parser.add_argument( - "--save_step_freq", - default=-1, - type=int, - help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", + "--pretrained_model_name", + default="bert-base-uncased", + type=str, + help="Name of the pre-trained model", + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) -parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") -parser.add_argument("--eval_step_freq", default=100, type=int, help="Frequency of evaluation") -parser.add_argument("--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss") +parser.add_argument("--bert_checkpoint", default=None, type=str, help="Path to bert pretrained checkpoint") +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") + args = parser.parse_args() +logging.info(args) if not os.path.exists(args.data_dir): raise FileNotFoundError( @@ -122,14 +142,18 @@ add_time_to_log_dir=not args.no_time_to_log_dir, ) -logging.info(args) - output_file = f'{nf.work_dir}/output.txt' -model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name + +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, + config=args.bert_config, + vocab=args.vocab_file, + checkpoint=args.bert_checkpoint, ) +hidden_size = model.hidden_size + tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, @@ -138,12 +162,6 @@ do_lower_case=args.do_lower_case, ) -if args.bert_checkpoint is not None: - model.restore_from(args.bert_checkpoint) - logging.info(f"model restored from {args.bert_checkpoint}") - -hidden_size = model.hidden_size - def create_pipeline( pad_label=args.none_label, @@ -232,6 +250,7 @@ def create_pipeline( tensors=train_tensors, print_func=lambda x: logging.info("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], + step_freq=args.loss_step_freq, tb_writer=nf.tb_writer, ) callbacks.append(train_callback) diff --git a/examples/nlp/token_classification/token_classification_infer.py b/examples/nlp/token_classification/token_classification_infer.py index 06c948abe219..66bfa0f25f06 100644 --- a/examples/nlp/token_classification/token_classification_infer.py +++ b/examples/nlp/token_classification/token_classification_infer.py @@ -33,9 +33,16 @@ default="bert-base-uncased", type=str, help="Name of the pre-trained model", - choices=nemo_nlp.nm.trainables.get_bert_models_list(), + choices=nemo_nlp.nm.trainables.get_pretrained_lm_models_list(), ) parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") +parser.add_argument( + "--tokenizer", + default="nemobert", + type=str, + choices=["nemobert", "sentencepiece"], + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", +) parser.add_argument( "--tokenizer_model", default=None, @@ -43,11 +50,13 @@ help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) parser.add_argument( - "--tokenizer", - default="nemobert", - type=str, - choices=["nemobert", "sentencepiece"], - help="tokenizer to use, only relevant when using custom pretrained checkpoint.", + "--vocab_file", default=None, help="Path to the vocab file. 
Required for pretrained Megatron models" +) +parser.add_argument( + "--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models. " + + "Only applicable when tokenizer is build with vocab file", ) parser.add_argument("--none_label", default='O', type=str) parser.add_argument( @@ -83,20 +92,18 @@ labels_dict = get_vocab(args.labels_dict) -""" Load the pretrained BERT parameters -See the list of pretrained models, call: -nemo_nlp.huggingface.BERT.list_pretrained_models() -""" -pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model( - bert_config=args.bert_config, pretrained_model_name=args.pretrained_model_name +model = nemo_nlp.nm.trainables.get_pretrained_lm_model( + pretrained_model_name=args.pretrained_model_name, config=args.bert_config, vocab=args.vocab_file ) tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer( tokenizer_name=args.tokenizer, pretrained_model_name=args.pretrained_model_name, tokenizer_model=args.tokenizer_model, + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case, ) -hidden_size = pretrained_bert_model.hidden_size +hidden_size = model.hidden_size data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer( @@ -107,7 +114,7 @@ input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) +hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) ########################################################################### diff --git a/examples/speaker_recognition/configs/quartznet_spkr_3x1x512_xvector.yaml b/examples/speaker_recognition/configs/quartznet_spkr_3x1x512_xvector.yaml new file mode 100644 index 000000000000..868b9a0f01ff --- /dev/null +++ b/examples/speaker_recognition/configs/quartznet_spkr_3x1x512_xvector.yaml @@ -0,0 +1,81 @@ +model: "GramVoxNet" +sample_rate: &sample_rate 16000 +dropout: &drop 0.5 +repeat: &rep 1 +time_length: 8 +n_filters: &n_filters 512 + +AudioToSpeechLabelDataLayer: + sample_rate: *sample_rate + train: + min_duration: 0.1 + shuffle: true + eval: + min_duration: 0.01 + shuffle: false + +AudioToMelSpectrogramPreprocessor: + normalize: "per_feature" + window_size: 0.02 + window_stride: 0.01 + window: "hann" + features: &n_mels 64 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + stft_conv: false + +JasperEncoder: + feat_in: *n_mels + activation: "relu" + + jasper: + - filters: *n_filters + repeat: 1 + kernel: [3] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [5] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [7] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [9] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: &enc_feat_out 1500 + repeat: 1 + kernel: [1] + stride: [1] + dilation: [1] + dropout: 0.0 + residual: false + seperable: true + +JasperDecoderForSpkrClass: + feat_in: *enc_feat_out + pool_mode: 'xvector' + emb_sizes: 1024,1024 diff --git a/examples/speaker_recognition/configs/quartznet_spkr_3x2x512_xvector.yaml b/examples/speaker_recognition/configs/quartznet_spkr_3x2x512_xvector.yaml 
new file mode 100644 index 000000000000..ebe0f8a400a7 --- /dev/null +++ b/examples/speaker_recognition/configs/quartznet_spkr_3x2x512_xvector.yaml @@ -0,0 +1,81 @@ +model: "GramVoxNet" +sample_rate: &sample_rate 16000 +dropout: &drop 0.5 +repeat: &rep 2 +time_length: 8 +n_filters: &n_filters 512 + +AudioToSpeechLabelDataLayer: + sample_rate: *sample_rate + train: + min_duration: 0.1 + shuffle: true + eval: + min_duration: 0.01 + shuffle: false + +AudioToMelSpectrogramPreprocessor: + normalize: "per_feature" + window_size: 0.02 + window_stride: 0.01 + window: "hann" + features: &n_mels 64 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + stft_conv: false + +JasperEncoder: + feat_in: *n_mels + activation: "relu" + + jasper: + - filters: *n_filters + repeat: 1 + kernel: [3] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [5] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [7] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [9] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: &enc_feat_out 1500 + repeat: 1 + kernel: [1] + stride: [1] + dilation: [1] + dropout: 0.0 + residual: false + seperable: true + +JasperDecoderForSpkrClass: + feat_in: *enc_feat_out + pool_mode: 'xvector' + emb_sizes: 1024,1024 diff --git a/examples/speaker_recognition/configs/quartznet_spkr_5x1x512_xvector.yaml b/examples/speaker_recognition/configs/quartznet_spkr_5x1x512_xvector.yaml new file mode 100644 index 000000000000..aa3855ac1f1a --- /dev/null +++ b/examples/speaker_recognition/configs/quartznet_spkr_5x1x512_xvector.yaml @@ -0,0 +1,99 @@ +model: "GramVoxNet" +sample_rate: &sample_rate 16000 +dropout: &drop 0.5 +repeat: &rep 1 +time_length: 8 +n_filters: &n_filters 512 + +AudioToSpeechLabelDataLayer: + sample_rate: *sample_rate + train: + min_duration: 0.1 + shuffle: true + eval: + min_duration: 0.01 + shuffle: false + +AudioToMelSpectrogramPreprocessor: + normalize: "per_feature" + window_size: 0.02 + window_stride: 0.01 + window: "hann" + features: &n_mels 64 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + stft_conv: false + +JasperEncoder: + feat_in: *n_mels + activation: "relu" + + jasper: + - filters: *n_filters + repeat: 1 + kernel: [3] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [3] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [5] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [5] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [7] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [9] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: &enc_feat_out 1500 + repeat: 1 + kernel: [1] + stride: [1] + dilation: [1] + dropout: 0.0 + residual: false + seperable: true + +JasperDecoderForSpkrClass: + feat_in: *enc_feat_out + pool_mode: xvector + emb_sizes: 1024,1024 diff --git a/examples/speaker_recognition/hi-mia_eval.py 
b/examples/speaker_recognition/hi-mia_eval.py new file mode 100644 index 000000000000..9e35e2d64d05 --- /dev/null +++ b/examples/speaker_recognition/hi-mia_eval.py @@ -0,0 +1,126 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import numpy as np +from scipy.interpolate import interp1d +from scipy.optimize import brentq +from sklearn.metrics import roc_curve +from tqdm import tqdm + + +""" +This script faciliates to get EER % based on cosine-smilarity +for HI-MIA dataset. + +Args: + data_root str: Path to embeddings file and also make sure trails_1m file is also + placed in this path + emb : test embedding file path + emb_labels : embedding labels file path + emb_size :help="Embeddings size +""" + + +def get_acc(data_root='./myExps/hi-mia/', emb='', emb_labels='', emb_size=512): + basename = os.path.dirname(emb) + X_test = np.load(emb) + label_files = np.load(emb_labels) + + assert len(X_test) == len(label_files) + trail_file = root + 'trials_1m' + + test_list = {} + speaker_list = {} + + for idx, line in enumerate(label_files): + line = line.strip() + speaker = line.split('.')[0].split('_')[0] + test_list[line] = idx + + if speaker in speaker_list: + speaker_list[speaker].append(idx) + else: + speaker_list[speaker] = [idx] + + emb = int(emb_size) + + tmp_file = open(trail_file, 'r').readlines() + trail_score = open('trial_score.txt', 'w') + + trial_embs = [] + keys = [] + all_scores = [] + all_keys = [] + + # for each of trails in trial file + for line in tqdm(tmp_file): + line = line.strip() + x_speaker = line.split(' ')[0] + y_speaker = line.split(' ')[1] + + X = np.zeros(emb,) + for idx in speaker_list[x_speaker]: + X = X + X_test[idx] + + X = X / len(speaker_list[x_speaker]) + + if x_speaker not in keys: + keys.append(x_speaker) + trial_embs.extend([X]) + + Y = np.zeros(emb,) + for idx in speaker_list[y_speaker]: + Y = Y + X_test[idx] + + Y = Y / len(speaker_list[y_speaker]) + + if y_speaker not in keys: + keys.append(y_speaker) + trial_embs.extend([Y]) + + # X=Y + score = (X @ Y.T) / (((X @ X.T) * (Y @ Y.T)) ** 0.5) + score = (score + 1) / 2 + + all_scores.append(score) + truth = 0 if line.split(' ')[-1] == 'nontarget' else 1 + + all_keys.append(truth) + + trail_score.write(str(score) + "\t" + line.split(' ')[-1]) + trail_score.write('\n') + + np.save(basename + '/all_embs_himia.npy', np.asarray(trial_embs)) + np.save(basename + '/all_ids_himia.npy', np.asarray(keys)) + + return np.asarray(all_scores), np.asarray(all_keys) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_root", help="directory of embeddings location", type=str, required=True) + parser.add_argument("--emb", help="test embedding file path", type=str, required=True) + parser.add_argument("--emb_labels", help="embedding labels file path", type=str, required=True) + parser.add_argument("--emb_size", help="Embeddings size", type=int, required=True) + args = parser.parse_args() + root, emb, 
emb_labels, emb_size = args.data_root, args.emb, args.emb_labels, args.emb_size + + y_score, y = get_acc(data_root=root, emb=emb, emb_labels=emb_labels, emb_size=emb_size) + fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1) + + eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) + print("EER: {:.2f}%".format(eer * 100)) diff --git a/examples/speaker_recognition/kaldi_plda.py b/examples/speaker_recognition/kaldi_plda.py new file mode 100644 index 000000000000..92e1f397801a --- /dev/null +++ b/examples/speaker_recognition/kaldi_plda.py @@ -0,0 +1,56 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import subprocess + +import numpy as np +from kaldi_python_io import ArchiveWriter + + +def write_scp(root, filename, lines, train): + assert len(lines) == len(train) + filename = os.path.join(root, filename) + with ArchiveWriter(filename + '.ark', filename + '.scp') as writer: + for key, mat in zip(lines, train): + writer.write(key, mat) + print("wrote {}.ark".format(filename)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root", help="embeddings root path", type=str, required=True) + parser.add_argument("--train_embs", help="npy of train embs for PLDA training", type=str, required=True) + parser.add_argument("--train_labels", help="npy of train labels for PLDA training", type=str, required=True) + parser.add_argument("--eval_embs", help="npy of eval embb for PLDA testing", type=str, required=True) + parser.add_argument("--eval_labels", help="npy of eval labels for PLDA testing", type=str, required=True) + parser.add_argument("--stage", help="1 for test on already trained PLDA 2 otherwise", type=str, required=True) + args = parser.parse_args() + + root = args.root + + if int(args.stage) < 2: + train = np.load(args.train_embs) + labels = np.load(args.train_labels) + + write_scp(root, 'train', labels, train) + + eval = np.load(args.eval_embs) + labels = np.load(args.eval_labels) + + write_scp(root, 'dev', labels, eval) + + cmd = ['bash', 'train_plda.sh', root, args.stage] + subprocess.run(cmd) diff --git a/examples/speaker_recognition/notebooks/Speaker_Recognition_an4.ipynb b/examples/speaker_recognition/notebooks/Speaker_Recognition_an4.ipynb new file mode 100644 index 000000000000..75f62f1c050c --- /dev/null +++ b/examples/speaker_recognition/notebooks/Speaker_Recognition_an4.ipynb @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "kUlQMiPZxfS_", + "outputId": "cee17d53-c44c-4821-ebeb-4fa347c316b2" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. 
Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "import os\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "!pip install wget\n", + "!apt-get install sox\n", + "# !pip install nemo_toolkit[asr]==0.10.0b10\n", + "!git clone https://github.com/NVIDIA/NeMo.git\n", + "os.chdir('NeMo')\n", + "!bash reinstall.sh\n", + "!pip install unidecode" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VgTR8CMlxu3p" + }, + "source": [ + "# **SPEAKER RECOGNITION** \n", + "\n", + "Speaker Recognition (SR) is an broad research area which solves two major tasks: speaker identification (who is speaking?) and speaker verification (is the speaker who she claims to be?). In this work, we focmus on the far-field, text-independent speaker recognition when the identity of the speaker is based on how speech is spoken, not necessarily in what is being said. Typically such SR systems operate on unconstrained speech utterances, \n", + "which are converted into vectors of fixed length, called speaker embeddings. Speaker embeddings are also used in automatic speech recognition (ASR) and speech synthesis.\n", + "\n", + "As the goal of most speaker related systems is to get good speaker level embeddings that could help distinguish from other speakers, we shall first train these embeddings in end-to-end manner optimizing the [QuatzNet](https://arxiv.org/abs/1910.10261) based encoder model on cross-entropy loss. We modify the original quartznet based decoder to get these fixed size embeddings irrespective of the length of the input audio. We employ mean and variance based statistics pooling method to grab these embeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KzzOC5rpx9y6" + }, + "source": [ + "In this tutorial we shall first train these embeddings on speaker related datasets and then get speaker embeddings from a pretrained network for a new dataset. Since Google Colab has very slow read-write speeds I'll be demonstarting this tutorial using [an4](http://www.speech.cs.cmu.edu/databases/an4/). \n", + "\n", + "Instead if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_hi-mia_data.py) script to download the necessary files, extract them, also re-sample to 16Khz if any of these samples are not at 16Khz. We do also provide scripts to score these embeddings for a speaker-verification task like hi-mia dataset. To do that follow this detailed [tutorial](https://nvidia.github.io/NeMo/) or [notebook](https://github.com/NVIDIA/NeMo/blob/master/examples/speaker_recognition/notebooks/Speaker_Recognition_hi-mia.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "UO_hAhMx0rwv", + "outputId": "493bd23a-d07a-46db-e634-d38a09f70ef3" + }, + "outputs": [], + "source": [ + "print(os.getcwd())\n", + "data_dir = 'data'\n", + "!mkdir $data_dir\n", + "import glob\n", + "import subprocess\n", + "import tarfile\n", + "import wget\n", + "\n", + "# Download the dataset. 
This will take a few moments...\n", + "print(\"******\")\n", + "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", + " an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'\n", + " an4_path = wget.download(an4_url, data_dir)\n", + " print(f\"Dataset downloaded at: {an4_path}\")\n", + "else:\n", + " print(\"Tarfile already exists.\")\n", + " an4_path = data_dir + '/an4_sphere.tar.gz'\n", + "\n", + "# Untar and convert .sph to .wav (using sox)\n", + "tar = tarfile.open(an4_path)\n", + "tar.extractall(path=data_dir)\n", + "\n", + "print(\"Converting .sph to .wav...\")\n", + "sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)\n", + "for sph_path in sph_list:\n", + " wav_path = sph_path[:-4] + '.wav'\n", + " cmd = [\"sox\", sph_path, wav_path]\n", + " subprocess.run(cmd)\n", + "print(\"Finished conversion.\\n******\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LEKDkOSimsKr" + }, + "source": [ + "Since an4 is not designed for speaker recognition, this facilitates the oppurtunity to demostrate how you can generate manifest files that are necessary for training. These methods can be applied to any dataset to get similar training manifest files. \n", + "\n", + "First get a scp file(s) which has all the wav files with absolute path for each of train,dev and test set. This can be easily done by the `find` bash command" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "0e6nuOFN8Pfv", + "scrolled": true + }, + "outputs": [], + "source": [ + "!find $PWD/data/an4/wav/an4_clstk -iname \"*.wav\" > data/an4/wav/an4_clstk/train_all.scp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "7168Z9eXn4st" + }, + "source": [ + "Let's look at the first 3 lines of scp file for train. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "colab_type": "code", + "id": "SQupCVpZIvtL", + "outputId": "e45cf645-42fc-4f4f-bd94-964848e04145" + }, + "outputs": [], + "source": [ + "!head -n 3 $data_dir/an4/wav/an4_clstk/train_all.scp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "cN09z0XFoDjN" + }, + "source": [ + "Since we created the scp file for train, we use `scp_to_manifest.py` to convert this scp file to a manifest file and then optionally split the files to train \\& dev for evaluating the models while training by using the `--split` flag. So as you guessed we wouldn't be needing the `--split` option for test folder. \n", + "Accordingly please mention the `id` number, which is the field num seperated by `/` to be considered as speaker label " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "colab_type": "code", + "id": "fNXZwNexIkAo", + "outputId": "ca06c4be-c0f6-4ec7-8198-a26347ea4b1e" + }, + "outputs": [], + "source": [ + "!python scripts/scp_to_manifest.py --scp $data_dir/an4/wav/an4_clstk/train_all.scp --id -2 --out $data_dir/an4/wav/an4_clstk/all_manifest.json --split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dxUL_g77oned" + }, + "source": [ + "Generate the scp for test folder and then convert to a manifest type. 
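To make the conversion concrete, the sketch below shows roughly what an scp-to-manifest step produces. It is an illustrative re-implementation, not the actual `scripts/scp_to_manifest.py`, and it assumes the `soundfile` package is available for reading wav durations. The `--id -2` argument used above corresponds to `label_field=-2` here, i.e. the parent directory name becomes the speaker label.

```python
# Illustrative sketch only -- use scripts/scp_to_manifest.py for real runs.
import json
import soundfile as sf  # assumed available; any wav reader that yields duration works

def scp_to_manifest_sketch(scp_path, manifest_path, label_field=-2):
    """Write one JSON line per wav: audio_filepath, duration (s), speaker label."""
    with open(scp_path) as scp, open(manifest_path, "w") as out:
        for line in scp:
            wav = line.strip()
            if not wav:
                continue
            info = sf.info(wav)
            entry = {
                "audio_filepath": wav,
                "duration": round(info.frames / info.samplerate, 3),
                # e.g. .../an4_clstk/<speaker>/<utt>.wav -> field -2 is the speaker id
                "label": wav.split("/")[label_field],
            }
            out.write(json.dumps(entry) + "\n")

# scp_to_manifest_sketch("data/an4/wav/an4test_clstk/test_all.scp", "test_sketch.json")
```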
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "QShlVwEIO64D", + "outputId": "291d7dce-e202-4062-9eee-e43224084cb5" + }, + "outputs": [], + "source": [ + "!find $PWD/data/an4/wav/an4test_clstk -iname \"*.wav\" > data/an4/wav/an4test_clstk/test_all.scp\n", + "!python scripts/scp_to_manifest.py --scp data/an4/wav/an4test_clstk/test_all.scp --id -2 --out data/an4/wav/an4test_clstk/test.json" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "F4rBMntjpPph" + }, + "source": [ + "Import necessary packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 187 + }, + "colab_type": "code", + "id": "4mSWNvdZPIwR", + "outputId": "83455882-4924-4d18-afd3-d2c8ee8ed78d" + }, + "outputs": [], + "source": [ + "from ruamel.yaml import YAML\n", + "\n", + "import nemo\n", + "import nemo.collections.asr as nemo_asr\n", + "import copy\n", + "from functools import partial" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CeKfJQ-YpTOv" + }, + "source": [ + "# Building Training and Evaluation DAGs with NeMo\n", + "Building a model using NeMo consists of \n", + "\n", + "1. Instantiating the neural modules we need\n", + "2. specifying the DAG by linking them together.\n", + "\n", + "In NeMo, the training and inference pipelines are managed by a NeuralModuleFactory, which takes care of checkpointing, callbacks, and logs, along with other details in training and inference. We set its log_dir argument to specify where our model logs and outputs will be written, and can set other training and inference settings in its constructor. For instance, if we were resuming training from a checkpoint, we would set the argument checkpoint_dir=``.\n", + "\n", + "Along with logs in NeMo, you can optionally view the tensorboard logs with the create_tb_writer=True argument to the NeuralModuleFactory. By default all the tensorboard log files will be stored in {log_dir}/tensorboard, but you can change this with the tensorboard_dir argument. One can load tensorboard logs through tensorboard by running tensorboard --logdir=`` in the terminal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "uyn2xrR7R1K_" + }, + "outputs": [], + "source": [ + "exp_name = 'quartznet3x1_an4'\n", + "work_dir = './myExps/'\n", + "neural_factory = nemo.core.NeuralModuleFactory(\n", + " log_dir=work_dir+\"/as4_logdir/\",\n", + " checkpoint_dir=\"./myExps/checkpoints/\" + exp_name,\n", + " create_tb_writer=True,\n", + " random_seed=42,\n", + " tensorboard_dir=work_dir+'/tensorboard/',\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "k-juqc40p8KN" + }, + "source": [ + "Now that we have our neural module factory, we can specify our **neural modules and instantiate them**. Here, we load the parameters for each module from the configuration file. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "mC-KPOy-rpLA", + "outputId": "1d902505-6e35-4eb8-aebf-8401c1bdd39c" + }, + "outputs": [], + "source": [ + "logging = nemo.logging\n", + "yaml = YAML(typ=\"safe\")\n", + "with open('../configs/quartznet_spkr_3x1x512_xvector.yaml') as f:\n", + " spkr_params = yaml.load(f)\n", + "\n", + "sample_rate = spkr_params[\"sample_rate\"]\n", + "time_length = spkr_params.get(\"time_length\", 8)\n", + "logging.info(\"max time length considered for each file is {} sec\".format(time_length))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5VgzNS1lrrqS" + }, + "source": [ + "Instantiating train data_layer using config arguments. `labels = None` automatically creates output labels from manifest files, if you would like to pass those speaker names you can use the labels option. So while instantiating eval data_layer, we can use pass labels to the class in order to match same the speaker output labels as we have in the training data layer. This comes in handy while training on multiple datasets with more than one manifest file. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 153 + }, + "colab_type": "code", + "id": "dC9QOenNPoUs", + "outputId": "786aac99-57f6-4066-e9a3-4908dc6e3d7a" + }, + "outputs": [], + "source": [ + "train_dl_params = copy.deepcopy(spkr_params[\"AudioToSpeechLabelDataLayer\"])\n", + "train_dl_params.update(spkr_params[\"AudioToSpeechLabelDataLayer\"][\"train\"])\n", + "del train_dl_params[\"train\"]\n", + "del train_dl_params[\"eval\"]\n", + "\n", + "batch_size=64\n", + "data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath='../data/an4/wav/an4_clstk/train.json',\n", + " labels=None,\n", + " batch_size=batch_size,\n", + " time_length=time_length,\n", + " **train_dl_params,\n", + " )\n", + "\n", + "eval_dl_params = copy.deepcopy(spkr_params[\"AudioToSpeechLabelDataLayer\"])\n", + "eval_dl_params.update(spkr_params[\"AudioToSpeechLabelDataLayer\"][\"eval\"])\n", + "del eval_dl_params[\"train\"]\n", + "del eval_dl_params[\"eval\"]\n", + "\n", + "data_layer_eval = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath=\"../data/an4/wav/an4_clstk/dev.json\",\n", + " labels=data_layer_train.labels,\n", + " batch_size=batch_size,\n", + " time_length=time_length,\n", + " **eval_dl_params,\n", + ")\n", + "\n", + "data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(\n", + " sample_rate=sample_rate, **spkr_params[\"AudioToMelSpectrogramPreprocessor\"],\n", + " )\n", + "encoder = nemo_asr.JasperEncoder(**spkr_params[\"JasperEncoder\"],)\n", + "\n", + "decoder = nemo_asr.JasperDecoderForSpkrClass(\n", + " feat_in=spkr_params[\"JasperEncoder\"][\"jasper\"][-1][\"filters\"],\n", + " num_classes=data_layer_train.num_classes,\n", + " pool_mode=spkr_params[\"JasperDecoderForSpkrClass\"]['pool_mode'],\n", + " emb_sizes=spkr_params[\"JasperDecoderForSpkrClass\"][\"emb_sizes\"].split(\",\"),\n", + " )\n", + "\n", + "xent_loss = nemo_asr.CrossEntropyLossNM(weight=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9bAP70DqsXGY" + }, + "source": [ + "The next step is to assemble our training DAG by specifying the inputs to each neural module." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "colab_type": "code", + "id": "1raBGmd5Vshl", + "outputId": "33a128f4-193f-4913-9c82-fd27610dfb9a" + }, + "outputs": [], + "source": [ + "audio_signal, audio_signal_len, label, label_len = data_layer_train()\n", + "processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)\n", + "encoded, encoded_len = encoder(audio_signal=processed_signal, length=processed_signal_len)\n", + "logits, _ = decoder(encoder_output=encoded)\n", + "loss = xent_loss(logits=logits, labels=label)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uwnZT8ycsYMa" + }, + "source": [ + "We would like to be able to evaluate our model on the dev set, as well, so let's set up the evaluation DAG.\n", + "\n", + "Our evaluation DAG will reuse most of the parts of the training DAG with the exception of the data layer, since we are loading the evaluation data from a different file but evaluating on the same model. Note that if we were using data augmentation in training, we would also leave that out in the evaluation DAG." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "colab_type": "code", + "id": "sPPyiNtLWDyf", + "outputId": "3c37a7dd-b85c-4d29-edfe-cfd0cd16abe4" + }, + "outputs": [], + "source": [ + "audio_signal_test, audio_len_test, label_test, _ = data_layer_eval()\n", + "processed_signal_test, processed_len_test = data_preprocessor(\n", + " input_signal=audio_signal_test, length=audio_len_test\n", + " )\n", + "encoded_test, encoded_len_test = encoder(audio_signal=processed_signal_test, length=processed_len_test)\n", + "logits_test, _ = decoder(encoder_output=encoded_test)\n", + "loss_test = xent_loss(logits=logits_test, labels=label_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8m7dz1-usp1S" + }, + "source": [ + "# Creating CallBacks\n", + "\n", + "We would like to be able to monitor our model while it's training, so we use callbacks. In general, callbacks are functions that are called at specific intervals over the course of training or inference, such as at the start or end of every n iterations, epochs, etc. The callbacks we'll be using for this are the SimpleLossLoggerCallback, which reports the training loss (or another metric of your choosing, such as \\% accuracy for speaker recognition tasks), and the EvaluatorCallback, which regularly evaluates the model on the dev set. Both of these callbacks require you to pass in the tensors to be evaluated--these would be the final outputs of the training and eval DAGs above.\n", + "\n", + "Another useful callback is the CheckpointCallback, for saving checkpoints at set intervals. We create one here just to demonstrate how it works." 
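The training cell further below drives the learning rate with `CosineAnnealing` and a warmup proportional to the total number of steps. As a point of reference, here is a minimal sketch of that kind of schedule; the exact formula inside NeMo's `CosineAnnealing` policy may differ (for example in how a minimum learning rate is handled), so treat this as illustrative only.

```python
import math

def cosine_with_warmup(step, total_steps, warmup_steps, base_lr):
    """Linear warmup to base_lr, then cosine decay towards zero."""
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

total_steps = 100 * 15           # num_epochs * steps_per_epoch (illustrative numbers)
warmup = int(0.1 * total_steps)  # matches the warmup_steps = 0.1 * ... used below
for s in (0, warmup // 2, warmup, total_steps // 2, total_steps - 1):
    print(s, round(cosine_with_warmup(s, total_steps, warmup, base_lr=0.01), 5))
```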
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LFlXnbRaWTVl" + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.helpers import (\n", + " monitor_classification_training_progress,\n", + " process_classification_evaluation_batch,\n", + " process_classification_evaluation_epoch,\n", + ")\n", + "from nemo.utils.lr_policies import CosineAnnealing\n", + "\n", + "train_callback = nemo.core.SimpleLossLoggerCallback(\n", + " tensors=[loss, logits, label],\n", + " print_func=partial(monitor_classification_training_progress, eval_metric=[1]),\n", + " step_freq=40,\n", + " get_tb_values=lambda x: [(\"train_loss\", x[0])],\n", + " tb_writer=neural_factory.tb_writer,\n", + " )\n", + "\n", + "callbacks = [train_callback]\n", + "\n", + "chpt_callback = nemo.core.CheckpointCallback(\n", + " folder=\"./myExps/checkpoints/\" + exp_name,\n", + " load_from_folder=\"./myExps/checkpoints/\" + exp_name,\n", + " step_freq=100,\n", + " )\n", + "callbacks.append(chpt_callback)\n", + "\n", + "tagname = \"an4_dev\"\n", + "eval_callback = nemo.core.EvaluatorCallback(\n", + " eval_tensors=[loss_test, logits_test, label_test],\n", + " user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),\n", + " user_epochs_done_callback=partial(process_classification_evaluation_epoch, tag=tagname),\n", + " eval_step=100, # How often we evaluate the model on the test set\n", + " tb_writer=neural_factory.tb_writer,\n", + " )\n", + "\n", + "callbacks.append(eval_callback)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "a8EFjLsWs_jM" + }, + "source": [ + "Now that we have our model and callbacks set up, how do we run it?\n", + "\n", + "Once we create our neural factory and the callbacks for the information that we want to see, we can start training by simply calling the train function on the tensors we want to optimize and our callbacks! Since this notebook is for you to get started, by an4 as dataset is small it would quickly get higher accuracies. For better models use bigger datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "xHTEtz7yXVMK", + "outputId": "bd53ae06-cd0d-4291-da66-1af3079cbd86" + }, + "outputs": [], + "source": [ + "# train model\n", + "num_epochs=100\n", + "N = len(data_layer_train)\n", + "steps_per_epoch = N // batch_size\n", + "\n", + "logging.info(\"Number of steps per epoch {}\".format(steps_per_epoch))\n", + "\n", + "neural_factory.train(\n", + " tensors_to_optimize=[loss],\n", + " callbacks=callbacks,\n", + " lr_policy=CosineAnnealing(\n", + " num_epochs * steps_per_epoch, warmup_steps=0.1 * num_epochs * steps_per_epoch,\n", + " ),\n", + " optimizer=\"novograd\",\n", + " optimization_params={\n", + " \"num_epochs\": num_epochs,\n", + " \"lr\": 0.01,\n", + " \"betas\": (0.95, 0.5),\n", + " \"weight_decay\": 0.001,\n", + " \"grad_norm_clip\": None,\n", + " }\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BB6s19pmxGfX" + }, + "source": [ + "Now that we trained our embeddings, we shall extract these embeddings using our pretrained checkpoint present at `checkpoint_dir`. As we can see from the neural architecture, we extract the embeddings after the `emb1` layer. 
\n", + "![Speaker Recognition Layers](./speaker_reco.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oSIDu6jkym66" + }, + "source": [ + "Now use the test manifest to get the embeddings. As we saw before, let's create a new `data_layer` for test. Use previously instiated models and attach the DAGs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + }, + "colab_type": "code", + "id": "5JqUVbKDY32a", + "outputId": "dd835e02-8882-4287-9639-c249ac3dfc94" + }, + "outputs": [], + "source": [ + "eval_dl_params = copy.deepcopy(spkr_params[\"AudioToSpeechLabelDataLayer\"])\n", + "eval_dl_params.update(spkr_params[\"AudioToSpeechLabelDataLayer\"][\"eval\"])\n", + "del eval_dl_params[\"train\"]\n", + "del eval_dl_params[\"eval\"]\n", + "eval_dl_params['shuffle'] = False # To grab the file names without changing data_layer\n", + "\n", + "test_dataset = '../data/an4/wav/an4test_clstk/test.json'\n", + "data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath=test_dataset,\n", + " labels=None,\n", + " batch_size=batch_size,\n", + " **eval_dl_params,\n", + " )\n", + "\n", + "audio_signal_test, audio_len_test, label_test, _ = data_layer_test()\n", + "processed_signal_test, processed_len_test = data_preprocessor(\n", + " input_signal=audio_signal_test, length=audio_len_test)\n", + "encoded_test, _ = encoder(audio_signal=processed_signal_test, length=processed_len_test)\n", + "_, embeddings = decoder(encoder_output=encoded_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dwEifkD9zfpl" + }, + "source": [ + "Now get the embeddings using neural_factor infer command, that just does forward pass of all our modules. 
And save our embeddings in `/embeddings`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 153 + }, + "colab_type": "code", + "id": "wGxYiFpJze5h", + "outputId": "dbbc7204-28bc-43e9-b6aa-f3f757f5d4b5" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import json\n", + "eval_tensors = neural_factory.infer(tensors=[embeddings, label_test], checkpoint_dir=\"./myExps/checkpoints/\" + exp_name)\n", + " # inf_loss , inf_emb, inf_logits, inf_label = eval_tensors\n", + "inf_emb, inf_label = eval_tensors\n", + "whole_embs = []\n", + "whole_labels = []\n", + "manifest = open(test_dataset, 'r').readlines()\n", + "\n", + "for line in manifest:\n", + " line = line.strip()\n", + " dic = json.loads(line)\n", + " filename = dic['audio_filepath'].split('/')[-1]\n", + " whole_labels.append(filename)\n", + "\n", + "for idx in range(len(inf_label)):\n", + " whole_embs.extend(inf_emb[idx].numpy())\n", + "\n", + "embedding_dir = './myExps/embeddings/'\n", + "if not os.path.exists(embedding_dir):\n", + " os.mkdir(embedding_dir)\n", + "\n", + "filename = os.path.basename(test_dataset).split('.')[0]\n", + "name = embedding_dir + filename\n", + "\n", + "np.save(name + '.npy', np.asarray(whole_embs))\n", + "np.save(name + '_labels.npy', np.asarray(whole_labels))\n", + "logging.info(\"Saved embedding files to {}\".format(embedding_dir))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "SKKVIb7e6vel", + "outputId": "a3fa3703-da6c-4a07-c20c-c83df11a8f25" + }, + "outputs": [], + "source": [ + "ls myExps/embeddings/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "A_7S4Yja7A8V" + }, + "source": [ + "Since an4 doesn't have trails files to demonstrate cosine and PLDA scoring. 
Tutorial for that can be found at\n", + "[hi-mia notebook](https://github.com/NVIDIA/NeMo/blob/master/examples/speaker_recognition/notebooks/Speaker_Recognition_an4.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Speaker_Recognition_dataset.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/speaker_recognition/notebooks/Speaker_Recognition_hi-mia.ipynb b/examples/speaker_recognition/notebooks/Speaker_Recognition_hi-mia.ipynb new file mode 100644 index 000000000000..234bd53cbde0 --- /dev/null +++ b/examples/speaker_recognition/notebooks/Speaker_Recognition_hi-mia.ipynb @@ -0,0 +1,761 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "kUlQMiPZxfS_", + "outputId": "cee17d53-c44c-4821-ebeb-4fa347c316b2" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "import os\n", + "!pip install wget\n", + "!apt-get install sox\n", + "\n", + "!git clone https://github.com/NVIDIA/NeMo.git\n", + "os.chdir('NeMo')\n", + "!bash reinstall.sh\n", + "\n", + "!pip install unidecode" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VgTR8CMlxu3p" + }, + "source": [ + "# **SPEAKER RECOGNITION** \n", + "\n", + "Speaker Recognition (SR) is an broad research area which solves two major tasks: speaker identification (who is speaking?) and speaker verification (is the speaker who she claims to be?). In this work, we focmus on the far-field, text-independent speaker recognition when the identity of the speaker is based on how speech is spoken, not necessarily in what is being said. Typically such SR systems operate on unconstrained speech utterances, \n", + "which are converted into vectors of fixed length, called speaker embeddings. Speaker embeddings are also used in automatic speech recognition (ASR) and speech synthesis.\n", + "\n", + "As the goal of most speaker related systems is to get good speaker level embeddings that could help distinguish from other speakers, we shall first train these embeddings in end-to-end manner optimizing the [QuatzNet](https://arxiv.org/abs/1910.10261) based encoder model on cross-entropy loss. We modify the original quartznet based decoder to get these fixed size embeddings irrespective of the length of the input audio. 
We employ mean and variance based statistics pooling method to grab these embeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KzzOC5rpx9y6" + }, + "source": [ + "In this tutorial we shall first train these embeddings on speaker related datasets and then get speaker embeddings from a pretrained network for a new dataset. Since Google Colab has very slow read-write speeds, Please run this locally for training on [hi-mia](https://arxiv.org/abs/1912.01231). \n", + "\n", + "We use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_hi-mia_data.py) script to download the necessary files, extract them, also re-sample to 16Khz if any of these samples are not at 16Khz. We do also provide scripts to score these embeddings for a speaker-verification task like hi-mia dataset at the end. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "UO_hAhMx0rwv", + "outputId": "493bd23a-d07a-46db-e634-d38a09f70ef3" + }, + "outputs": [], + "source": [ + "data_dir = 'scripts/data/'\n", + "!mkdir $data_dir\n", + "\n", + "# Download and process dataset. This will take a few moments...\n", + "!python scripts/get_hi-mia_data.py --data_root=$data_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After download and conversion, your `data` folder should contain directories with manifest files as:\n", + "\n", + "* `data//train.json`\n", + "* `data//dev.json` \n", + "* `data//{set}_all.json` \n", + "\n", + "Also for each set we also create utt2spk files, these files later would be used in PLDA training.\n", + "\n", + "Each line in manifest file describes a training sample - `audio_filepath` contains path to the wav file, `duration` it's duration in seconds, and `label` is the speaker class label:\n", + "\n", + "`{\"audio_filepath\": \"/data/train/SPEECHDATA/wav/SV0184/SV0184_6_04_N3430.wav\", \"duration\": 1.22, \"label\": \"SV0184\"}` \n", + "\n", + "`{\"audio_filepath\": \"/data/train/SPEECHDATA/wav/SV0184/SV0184_5_03_F2037.wav\", duration\": 1.375, \"label\": \"SV0184\"}`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "F4rBMntjpPph" + }, + "source": [ + "Import necessary packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 187 + }, + "colab_type": "code", + "id": "4mSWNvdZPIwR", + "outputId": "83455882-4924-4d18-afd3-d2c8ee8ed78d" + }, + "outputs": [], + "source": [ + "from ruamel.yaml import YAML\n", + "\n", + "import nemo\n", + "import nemo.collections.asr as nemo_asr\n", + "import copy\n", + "from functools import partial" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CeKfJQ-YpTOv" + }, + "source": [ + "# Building Training and Evaluation DAGs with NeMo\n", + "Building a model using NeMo consists of \n", + "\n", + "1. Instantiating the neural modules we need\n", + "2. specifying the DAG by linking them together.\n", + "\n", + "In NeMo, the training and inference pipelines are managed by a NeuralModuleFactory, which takes care of checkpointing, callbacks, and logs, along with other details in training and inference. We set its log_dir argument to specify where our model logs and outputs will be written, and can set other training and inference settings in its constructor. 
For instance, if we were resuming training from a checkpoint, we would set the argument checkpoint_dir=``.\n", + "\n", + "Along with logs in NeMo, you can optionally view the tensorboard logs with the create_tb_writer=True argument to the NeuralModuleFactory. By default all the tensorboard log files will be stored in {log_dir}/tensorboard, but you can change this with the tensorboard_dir argument. One can load tensorboard logs through tensorboard by running tensorboard --logdir=`` in the terminal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "uyn2xrR7R1K_" + }, + "outputs": [], + "source": [ + "exp_name = 'quartznet3x2_hi-mia'\n", + "work_dir = './myExps/'\n", + "neural_factory = nemo.core.NeuralModuleFactory(\n", + " log_dir=work_dir+\"/hi-mia_logdir/\",\n", + " checkpoint_dir=\"./myExps/checkpoints/\" + exp_name,\n", + " create_tb_writer=True,\n", + " random_seed=42,\n", + " tensorboard_dir=work_dir+'/tensorboard/',\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "k-juqc40p8KN" + }, + "source": [ + "Now that we have our neural module factory, we can specify our **neural modules and instantiate them**. Here, we load the parameters for each module from the configuration file. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "mC-KPOy-rpLA", + "outputId": "1d902505-6e35-4eb8-aebf-8401c1bdd39c" + }, + "outputs": [], + "source": [ + "logging = nemo.logging\n", + "yaml = YAML(typ=\"safe\")\n", + "with open('examples/speaker_recognition/configs/quartznet_spkr_3x2x512_xvector.yaml') as f:\n", + " spkr_params = yaml.load(f)\n", + "\n", + "sample_rate = spkr_params[\"sample_rate\"]\n", + "time_length = spkr_params.get(\"time_length\", 8)\n", + "logging.info(\"max time length considered for each file is {} sec\".format(time_length))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5VgzNS1lrrqS" + }, + "source": [ + "Instantiating train data_layer using config arguments. `labels = None` automatically creates output labels from manifest files, if you would like to pass those speaker names you can use the labels option. So while instantiating eval data_layer, we can use pass labels to the class in order to match same the speaker output labels as we have in the training data layer. This comes in handy while training on multiple datasets with more than one manifest file. 
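To see why the eval layer is given `labels=data_layer_train.labels`, here is a small illustrative sketch (not the data layer's internal code) of building the label set from the training manifest and reusing it elsewhere so that every speaker keeps the same class index across manifests; it assumes the `data_dir` variable defined earlier in this notebook.

```python
import json

def labels_from_manifest(path):
    """Collect the sorted set of speaker labels found in a manifest file."""
    labels = set()
    with open(path) as f:
        for line in f:
            labels.add(json.loads(line)["label"])
    return sorted(labels)

train_labels = labels_from_manifest(data_dir + '/train/train.json')
label2idx = {lab: i for i, lab in enumerate(train_labels)}
# Building the map once from the training manifest and reusing it for dev.json
# keeps class indices consistent; labels=None would rebuild it from dev alone.
```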
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 153 + }, + "colab_type": "code", + "id": "dC9QOenNPoUs", + "outputId": "786aac99-57f6-4066-e9a3-4908dc6e3d7a" + }, + "outputs": [], + "source": [ + "train_dl_params = copy.deepcopy(spkr_params[\"AudioToSpeechLabelDataLayer\"])\n", + "train_dl_params.update(spkr_params[\"AudioToSpeechLabelDataLayer\"][\"train\"])\n", + "del train_dl_params[\"train\"]\n", + "del train_dl_params[\"eval\"]\n", + "\n", + "batch_size=64\n", + "data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath=data_dir+'/train/train.json',\n", + " labels=None,\n", + " batch_size=batch_size,\n", + " time_length=time_length,\n", + " **train_dl_params,\n", + " )\n", + "\n", + "eval_dl_params = copy.deepcopy(spkr_params[\"AudioToSpeechLabelDataLayer\"])\n", + "eval_dl_params.update(spkr_params[\"AudioToSpeechLabelDataLayer\"][\"eval\"])\n", + "del eval_dl_params[\"train\"]\n", + "del eval_dl_params[\"eval\"]\n", + "\n", + "data_layer_eval = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath=data_dir+'/train/dev.json\",\n", + " labels=data_layer_train.labels,\n", + " batch_size=batch_size,\n", + " time_length=time_length,\n", + " **eval_dl_params,\n", + ")\n", + "\n", + "data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(\n", + " sample_rate=sample_rate, **spkr_params[\"AudioToMelSpectrogramPreprocessor\"],\n", + " )\n", + "encoder = nemo_asr.JasperEncoder(**spkr_params[\"JasperEncoder\"],)\n", + "\n", + "decoder = nemo_asr.JasperDecoderForSpkrClass(\n", + " feat_in=spkr_params[\"JasperEncoder\"][\"jasper\"][-1][\"filters\"],\n", + " num_classes=data_layer_train.num_classes,\n", + " pool_mode=spkr_params[\"JasperDecoderForSpkrClass\"]['pool_mode'],\n", + " emb_sizes=spkr_params[\"JasperDecoderForSpkrClass\"][\"emb_sizes\"].split(\",\"),\n", + " )\n", + "\n", + "xent_loss = nemo_asr.CrossEntropyLossNM(weight=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9bAP70DqsXGY" + }, + "source": [ + "The next step is to assemble our training DAG by specifying the inputs to each neural module." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "colab_type": "code", + "id": "1raBGmd5Vshl", + "outputId": "33a128f4-193f-4913-9c82-fd27610dfb9a" + }, + "outputs": [], + "source": [ + "audio_signal, audio_signal_len, label, label_len = data_layer_train()\n", + "processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)\n", + "encoded, encoded_len = encoder(audio_signal=processed_signal, length=processed_signal_len)\n", + "logits, _ = decoder(encoder_output=encoded)\n", + "loss = xent_loss(logits=logits, labels=label)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uwnZT8ycsYMa" + }, + "source": [ + "We would like to be able to evaluate our model on the dev set, as well, so let's set up the evaluation DAG.\n", + "\n", + "Our evaluation DAG will reuse most of the parts of the training DAG with the exception of the data layer, since we are loading the evaluation data from a different file but evaluating on the same model. Note that if we were using data augmentation in training, we would also leave that out in the evaluation DAG." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "colab_type": "code", + "id": "sPPyiNtLWDyf", + "outputId": "3c37a7dd-b85c-4d29-edfe-cfd0cd16abe4" + }, + "outputs": [], + "source": [ + "audio_signal_test, audio_len_test, label_test, _ = data_layer_eval()\n", + "processed_signal_test, processed_len_test = data_preprocessor(\n", + " input_signal=audio_signal_test, length=audio_len_test\n", + " )\n", + "encoded_test, encoded_len_test = encoder(audio_signal=processed_signal_test, length=processed_len_test)\n", + "logits_test, _ = decoder(encoder_output=encoded_test)\n", + "loss_test = xent_loss(logits=logits_test, labels=label_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8m7dz1-usp1S" + }, + "source": [ + "# Creating CallBacks\n", + "\n", + "We would like to be able to monitor our model while it's training, so we use callbacks. In general, callbacks are functions that are called at specific intervals over the course of training or inference, such as at the start or end of every n iterations, epochs, etc. The callbacks we'll be using for this are the SimpleLossLoggerCallback, which reports the training loss (or another metric of your choosing, such as \\% accuracy for speaker recognition tasks), and the EvaluatorCallback, which regularly evaluates the model on the dev set. Both of these callbacks require you to pass in the tensors to be evaluated--these would be the final outputs of the training and eval DAGs above.\n", + "\n", + "Another useful callback is the CheckpointCallback, for saving checkpoints at set intervals. We create one here just to demonstrate how it works." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LFlXnbRaWTVl" + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.helpers import (\n", + " monitor_classification_training_progress,\n", + " process_classification_evaluation_batch,\n", + " process_classification_evaluation_epoch,\n", + ")\n", + "from nemo.utils.lr_policies import CosineAnnealing\n", + "\n", + "train_callback = nemo.core.SimpleLossLoggerCallback(\n", + " tensors=[loss, logits, label],\n", + " print_func=partial(monitor_classification_training_progress, eval_metric=[1]),\n", + " step_freq=1000,\n", + " get_tb_values=lambda x: [(\"train_loss\", x[0])],\n", + " tb_writer=neural_factory.tb_writer,\n", + " )\n", + "\n", + "callbacks = [train_callback]\n", + "\n", + "chpt_callback = nemo.core.CheckpointCallback(\n", + " folder=\"./myExps/checkpoints/\" + exp_name,\n", + " load_from_folder=\"./myExps/checkpoints/\" + exp_name,\n", + " step_freq=1000,\n", + " )\n", + "callbacks.append(chpt_callback)\n", + "\n", + "tagname = \"hi-mia_dev\"\n", + "eval_callback = nemo.core.EvaluatorCallback(\n", + " eval_tensors=[loss_test, logits_test, label_test],\n", + " user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),\n", + " user_epochs_done_callback=partial(process_classification_evaluation_epoch, tag=tagname),\n", + " eval_step=1000, # How often we evaluate the model on the test set\n", + " tb_writer=neural_factory.tb_writer,\n", + " )\n", + "\n", + "callbacks.append(eval_callback)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "a8EFjLsWs_jM" + }, + "source": [ + "Now that we have our model and callbacks set up, how do we run it?\n", + "\n", + "Once we create our 
neural factory and the callbacks for the information that we want to see, we can start training by simply calling the train function on the tensors we want to optimize and our callbacks! Since this notebook is for you to get started, by an4 as dataset is small it would quickly get higher accuracies. For better models use bigger datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "xHTEtz7yXVMK", + "outputId": "bd53ae06-cd0d-4291-da66-1af3079cbd86" + }, + "outputs": [], + "source": [ + "# train model\n", + "num_epochs=25\n", + "N = len(data_layer_train)\n", + "steps_per_epoch = N // batch_size\n", + "\n", + "logging.info(\"Number of steps per epoch {}\".format(steps_per_epoch))\n", + "\n", + "neural_factory.train(\n", + " tensors_to_optimize=[loss],\n", + " callbacks=callbacks,\n", + " lr_policy=CosineAnnealing(\n", + " num_epochs * steps_per_epoch, warmup_steps=0.1 * num_epochs * steps_per_epoch,\n", + " ),\n", + " optimizer=\"novograd\",\n", + " optimization_params={\n", + " \"num_epochs\": num_epochs,\n", + " \"lr\": 0.02,\n", + " \"betas\": (0.95, 0.5),\n", + " \"weight_decay\": 0.001,\n", + " \"grad_norm_clip\": None,\n", + " }\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BB6s19pmxGfX" + }, + "source": [ + "Now that we trained our embeddings, we shall extract these embeddings using our pretrained checkpoint present at `checkpoint_dir`. As we can see from the neural architecture, we extract the embeddings after the `emb1` layer. \n", + "![Speaker Recognition Layers](./speaker_reco.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oSIDu6jkym66" + }, + "source": [ + "Now use the test manifest to get the embeddings. As we saw before, let's create a new `data_layer` for test. Use previously instiated models and attach the DAGs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + }, + "colab_type": "code", + "id": "5JqUVbKDY32a", + "outputId": "dd835e02-8882-4287-9639-c249ac3dfc94" + }, + "outputs": [], + "source": [ + "eval_dl_params = copy.deepcopy(spkr_params[\"AudioToSpeechLabelDataLayer\"])\n", + "eval_dl_params.update(spkr_params[\"AudioToSpeechLabelDataLayer\"][\"eval\"])\n", + "del eval_dl_params[\"train\"]\n", + "del eval_dl_params[\"eval\"]\n", + "eval_dl_params['shuffle'] = False # To grab the file names without changing data_layer\n", + "\n", + "test_dataset = data_dir+'/test/test_all.json',\n", + "data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath=test_dataset,\n", + " labels=None,\n", + " batch_size=batch_size,\n", + " **eval_dl_params,\n", + " )\n", + "\n", + "audio_signal_test, audio_len_test, label_test, _ = data_layer_test()\n", + "processed_signal_test, processed_len_test = data_preprocessor(\n", + " input_signal=audio_signal_test, length=audio_len_test)\n", + "encoded_test, _ = encoder(audio_signal=processed_signal_test, length=processed_len_test)\n", + "_, embeddings = decoder(encoder_output=encoded_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dwEifkD9zfpl" + }, + "source": [ + "Now get the embeddings using neural_factor infer command, that just does forward pass of all our modules. 
And save our embeddings in `/embeddings`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 153 + }, + "colab_type": "code", + "id": "wGxYiFpJze5h", + "outputId": "dbbc7204-28bc-43e9-b6aa-f3f757f5d4b5" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import json\n", + "eval_tensors = neural_factory.infer(tensors=[embeddings, label_test], checkpoint_dir=\"./myExps/checkpoints/\" + exp_name)\n", + "\n", + "inf_emb, inf_label = eval_tensors\n", + "whole_embs = []\n", + "whole_labels = []\n", + "manifest = open(test_dataset, 'r').readlines()\n", + "\n", + "for line in manifest:\n", + " line = line.strip()\n", + " dic = json.loads(line)\n", + " filename = dic['audio_filepath'].split('/')[-1]\n", + " whole_labels.append(filename)\n", + "\n", + "for idx in range(len(inf_label)):\n", + " whole_embs.extend(inf_emb[idx].numpy())\n", + "\n", + "embedding_dir = './myExps/embeddings/'\n", + "if not os.path.exists(embedding_dir):\n", + " os.mkdir(embedding_dir)\n", + "\n", + "filename = os.path.basename(test_dataset).split('.')[0]\n", + "name = embedding_dir + filename\n", + "\n", + "np.save(name + '.npy', np.asarray(whole_embs))\n", + "np.save(name + '_labels.npy', np.asarray(whole_labels))\n", + "logging.info(\"Saved embedding files to {}\".format(embedding_dir))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "SKKVIb7e6vel", + "outputId": "a3fa3703-da6c-4a07-c20c-c83df11a8f25" + }, + "outputs": [], + "source": [ + "!ls $embedding_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "A_7S4Yja7A8V" + }, + "source": [ + "# Cosine Similarity Scoring\n", + "\n", + "Here we provide a script scoring on hi-mia whose trial file has structure ` ` . First copy the `trails_1m` file present in test folder to our embeddings directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cp $data_dir/test/trails_1m $embedding_dir/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "the below command would output the EER% based on cosine similarity score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python examples/speaker_recognition/hi-mia_eval.py --data_root $embedding_dir --emb $embedding_dir/test_all.npy --emb_labels $embedding_dir/test_all_labels.npy --emb_size 1024\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PLDA Backend\n", + "To finetune our speaker embeddings further, we used kaldi PLDA scripts to train PLDA and evaluate as well. so from this point going forward, please make sure you installed kaldi and was added to your path as KALDI_ROOT.\n", + "\n", + "To train PLDA, we can either use dev set or training set. Let's use the training set embeddings to train PLDA and further use this trained PLDA model to score in test embeddings. in order to do that we should get embeddings for our training data as well. 
As similar to above steps, generate the train embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_dataset = data_dir+'/train/train.json',\n", + "\n", + "data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(\n", + " manifest_filepath=test_dataset,\n", + " labels=None,\n", + " batch_size=batch_size,\n", + " **eval_dl_params,\n", + " )\n", + "\n", + "audio_signal_test, audio_len_test, label_test, _ = data_layer_test()\n", + "processed_signal_test, processed_len_test = data_preprocessor(\n", + " input_signal=audio_signal_test, length=audio_len_test)\n", + "encoded_test, _ = encoder(audio_signal=processed_signal_test, length=processed_len_test)\n", + "_, embeddings = decoder(encoder_output=encoded_test)\n", + "\n", + "eval_tensors = neural_factory.infer(tensors=[embeddings, label_test], checkpoint_dir=\"./myExps/checkpoints/\" + exp_name)\n", + "\n", + "inf_emb, inf_label = eval_tensors\n", + "whole_embs = []\n", + "whole_labels = []\n", + "manifest = open(test_dataset, 'r').readlines()\n", + "\n", + "for line in manifest:\n", + " line = line.strip()\n", + " dic = json.loads(line)\n", + " filename = dic['audio_filepath'].split('/')[-1]\n", + " whole_labels.append(filename)\n", + "\n", + "for idx in range(len(inf_label)):\n", + " whole_embs.extend(inf_emb[idx].numpy())\n", + "\n", + "if not os.path.exists(embedding_dir):\n", + " os.mkdir(embedding_dir)\n", + "\n", + "filename = os.path.basename(test_dataset).split('.')[0]\n", + "name = embedding_dir + filename\n", + "\n", + "np.save(name + '.npy', np.asarray(whole_embs))\n", + "np.save(name + '_labels.npy', np.asarray(whole_labels))\n", + "logging.info(\"Saved embedding files to {}\".format(embedding_dir))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As part of kaldi necessary files we need `utt2spk` \\& `spk2utt` file to get ark file for PLDA training. to do that, copy the generated utt2spk file from `data_dir` train folder to create spk2utt file using \n", + "\n", + "`utt2spk_to_spk2utt.pl $data_dir/train/utt2spk > $embedding_dir/spk2utt`\n", + "\n", + "Then run the below python script to get EER score using PLDA backend scoring. This script does both data preparation for kaldi followed by PLDA scoring. 
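If Kaldi's `utt2spk_to_spk2utt.pl` is not on your path, the regrouping it performs can be sketched in a few lines of Python; this assumes the standard Kaldi format of one `<utt-id> <spk-id>` pair per line in `utt2spk` and is offered only as an illustrative equivalent.

```python
from collections import defaultdict

def utt2spk_to_spk2utt(utt2spk_path, spk2utt_path):
    """Group utterance ids by speaker: one '<spk> <utt1> <utt2> ...' line each."""
    spk2utt = defaultdict(list)
    with open(utt2spk_path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                utt, spk = parts[0], parts[1]
                spk2utt[spk].append(utt)
    with open(spk2utt_path, "w") as out:
        for spk in sorted(spk2utt):
            out.write(spk + " " + " ".join(sorted(spk2utt[spk])) + "\n")

# utt2spk_to_spk2utt(data_dir + '/train/utt2spk', embedding_dir + '/spk2utt')
```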
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python examples/speaker_recognition/kaldi_plda.py --root $embedding_dir --train_embs $embedding_dir/train.npy --train_labels $embedding_dir/train_labels.npy \n", + "--eval_embs $embedding_dir/all_embs_himia.npy --eval_labels $embedding_dir/all_ids_himia.npy --stage=1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here `--stage = 1` trains PLDA model but if you already have a trained PLDA then you can directly evaluate on it by `--stage=2` option.\n", + "\n", + "This should output an EER of 6.32% with minDCF: 0.455" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Performance Improvement\n", + "\n", + "To improve your embeddings performance:\n", + " \n", + "* Add more data and Train longer (100 epochs)\n", + "\n", + "* Try adding the augmentation –see config file\n", + "\n", + "* Use larger model\n", + "\n", + "* Train on several GPUs and use mixed precision (on NVIDIA Volta and Turing GPUs)\n", + "\n", + "* Start with pre-trained checkpoints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Speaker_Recognition_dataset.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/speaker_recognition/notebooks/speaker_reco.jpg b/examples/speaker_recognition/notebooks/speaker_reco.jpg new file mode 100644 index 000000000000..71350e0c4c5c Binary files /dev/null and b/examples/speaker_recognition/notebooks/speaker_reco.jpg differ diff --git a/examples/speaker_recognition/speaker_reco.py b/examples/speaker_recognition/speaker_reco.py new file mode 100644 index 000000000000..3c6cf1b84985 --- /dev/null +++ b/examples/speaker_recognition/speaker_reco.py @@ -0,0 +1,295 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import copy +import os +from functools import partial + +from ruamel.yaml import YAML + +import nemo +import nemo.collections.asr as nemo_asr +import nemo.utils.argparse as nm_argparse +from nemo.collections.asr.helpers import ( + monitor_classification_training_progress, + process_classification_evaluation_batch, + process_classification_evaluation_epoch, +) +from nemo.utils.lr_policies import CosineAnnealing + +logging = nemo.logging + + +def parse_args(): + parser = argparse.ArgumentParser( + parents=[nm_argparse.NemoArgParser()], description="SpeakerRecognition", conflict_handler="resolve", + ) + parser.set_defaults( + checkpoint_dir=None, + optimizer="novograd", + batch_size=32, + eval_batch_size=64, + lr=0.01, + weight_decay=0.001, + amp_opt_level="O1", + create_tb_writer=True, + ) + + # Overwrite default args + parser.add_argument( + "--num_epochs", + type=int, + default=None, + required=True, + help="number of epochs to train. You should specify either num_epochs or max_steps", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) + + # Create new args + parser.add_argument("--exp_name", default="SpkrReco_GramMatrix", type=str) + parser.add_argument("--beta1", default=0.95, type=float) + parser.add_argument("--beta2", default=0.5, type=float) + parser.add_argument("--warmup_steps", default=1000, type=int) + parser.add_argument("--load_dir", default=None, type=str) + parser.add_argument("--synced_bn", action="store_true", help="Use synchronized batch norm") + parser.add_argument("--emb_size", default=256, type=int) + parser.add_argument("--synced_bn_groupsize", default=0, type=int) + parser.add_argument("--print_freq", default=256, type=int) + + args = parser.parse_args() + if args.max_steps is not None: + raise ValueError("QuartzNet uses num_epochs instead of max_steps") + + return args + + +def construct_name(name, lr, batch_size, num_epochs, wd, optimizer, emb_size): + return "{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-embsize_{6}".format( + name, lr, batch_size, num_epochs, wd, optimizer, emb_size + ) + + +def create_all_dags(args, neural_factory): + """ + creates train and eval dags as well as their callbacks + returns train loss tensor and callbacks""" + + # parse the config files + yaml = YAML(typ="safe") + with open(args.model_config) as f: + spkr_params = yaml.load(f) + + sample_rate = spkr_params["sample_rate"] + time_length = spkr_params.get("time_length", 8) + logging.info("max time length considered is {} sec".format(time_length)) + + # Calculate num_workers for dataloader + total_cpus = os.cpu_count() + cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) // 2 + + # create data layer for training + train_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"]) + train_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["train"]) + del train_dl_params["train"] + del train_dl_params["eval"] + audio_augmentor = spkr_params.get("AudioAugmentor", None) + # del train_dl_params["normalize_transcripts"] + + data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer( + manifest_filepath=args.train_dataset, + labels=None, + batch_size=args.batch_size, + num_workers=cpu_per_traindl, + augmentor=audio_augmentor, + time_length=time_length, + **train_dl_params, + # normalize_transcripts=False + ) + + N = len(data_layer_train) + steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus)) + + logging.info("Number of steps per epoch 
{}".format(steps_per_epoch)) + # create separate data layers for eval + # we need separate eval dags for separate eval datasets + # but all other modules in these dags will be shared + + eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"]) + eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"]) + del eval_dl_params["train"] + del eval_dl_params["eval"] + + data_layers_test = [] + for test_set in args.eval_datasets: + + data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer( + manifest_filepath=test_set, + labels=data_layer_train.labels, + batch_size=args.batch_size, + num_workers=cpu_per_traindl, + time_length=time_length, + **eval_dl_params, + # normalize_transcripts=False + ) + data_layers_test.append(data_layer_test) + # create shared modules + + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( + sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"], + ) + + spectr_augment_config = spkr_params.get("SpectrogramAugmentation", None) + if spectr_augment_config: + data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config) + # (QuartzNet uses the Jasper baseline encoder and decoder) + encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],) + + decoder = nemo_asr.JasperDecoderForSpkrClass( + feat_in=spkr_params["JasperEncoder"]["jasper"][-1]["filters"], + num_classes=data_layer_train.num_classes, + pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'], + emb_sizes=spkr_params["JasperDecoderForSpkrClass"]["emb_sizes"].split(","), + ) + if os.path.exists(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt"): + encoder.restore_from(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt") + logging.info("Pretrained Encoder loaded") + + weight = None + xent_loss = nemo_asr.CrossEntropyLossNM(weight=weight) + + # assemble train DAG + + audio_signal, audio_signal_len, label, label_len = data_layer_train() + + processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len) + + if spectr_augment_config: + processed_signal = data_spectr_augmentation(input_spec=processed_signal) + + encoded, encoded_len = encoder(audio_signal=processed_signal, length=processed_signal_len) + + logits, _ = decoder(encoder_output=encoded) + loss = xent_loss(logits=logits, labels=label) + + # create train callbacks + train_callback = nemo.core.SimpleLossLoggerCallback( + tensors=[loss, logits, label], + print_func=partial(monitor_classification_training_progress, eval_metric=[1]), + step_freq=args.print_freq, + get_tb_values=lambda x: [("train_loss", x[0])], + tb_writer=neural_factory.tb_writer, + ) + + callbacks = [train_callback] + + if args.checkpoint_dir or args.load_dir: + chpt_callback = nemo.core.CheckpointCallback( + folder=args.checkpoint_dir, + load_from_folder=args.checkpoint_dir, # load dir + step_freq=args.checkpoint_save_freq, + checkpoints_to_keep=125, + ) + + callbacks.append(chpt_callback) + + # --- Assemble Validation DAG --- # + + for i, eval_layer in enumerate(data_layers_test): + + audio_signal_test, audio_len_test, label_test, _ = eval_layer() + processed_signal_test, processed_len_test = data_preprocessor( + input_signal=audio_signal_test, length=audio_len_test + ) + encoded_test, encoded_len_test = encoder(audio_signal=processed_signal_test, length=processed_len_test) + logits_test, _ = decoder(encoder_output=encoded_test) + loss_test = xent_loss(logits=logits_test, labels=label_test) + + tagname = 
os.path.dirname(args.eval_datasets[i]).split("/")[-1] + "_" + str(i) + print(tagname) + eval_callback = nemo.core.EvaluatorCallback( + eval_tensors=[loss_test, logits_test, label_test], + user_iter_callback=partial(process_classification_evaluation_batch, top_k=1), + user_epochs_done_callback=partial(process_classification_evaluation_epoch, tag=tagname), + eval_step=args.eval_freq, # How often we evaluate the model on the test set + tb_writer=neural_factory.tb_writer, + ) + + callbacks.append(eval_callback) + + return loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test + + +def main(): + args = parse_args() + + print(args) + emb_size = 1024 + name = construct_name( + args.exp_name, args.lr, args.batch_size, args.num_epochs, args.weight_decay, args.optimizer, emb_size=emb_size, + ) + work_dir = name + if args.work_dir: + work_dir = os.path.join(args.work_dir, name) + + # instantiate Neural Factory with supported backend + neural_factory = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=work_dir, + checkpoint_dir=args.checkpoint_dir + "/" + args.exp_name, + create_tb_writer=args.create_tb_writer, + files_to_copy=[args.model_config, __file__], + random_seed=42, + cudnn_benchmark=args.cudnn_benchmark, + tensorboard_dir=args.tensorboard_dir + "/" + name, + ) + args.num_gpus = neural_factory.world_size + + args.checkpoint_dir = neural_factory.checkpoint_dir + + if args.local_rank is not None: + logging.info("Doing ALL GPU") + + # build dags + (train_loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test,) = create_all_dags( + args, neural_factory + ) + + # train model + neural_factory.train( + tensors_to_optimize=[train_loss], + callbacks=callbacks, + lr_policy=CosineAnnealing( + args.num_epochs * steps_per_epoch, warmup_steps=0.1 * args.num_epochs * steps_per_epoch, + ), + optimizer=args.optimizer, + optimization_params={ + "num_epochs": args.num_epochs, + "lr": args.lr, + "betas": (args.beta1, args.beta2), + "weight_decay": args.weight_decay, + "grad_norm_clip": None, + }, + batches_per_step=args.iter_per_step, + synced_batchnorm=args.synced_bn, + synced_batchnorm_groupsize=args.synced_bn_groupsize, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/speaker_recognition/spkr_get_emb.py b/examples/speaker_recognition/spkr_get_emb.py new file mode 100644 index 000000000000..db93f638979f --- /dev/null +++ b/examples/speaker_recognition/spkr_get_emb.py @@ -0,0 +1,208 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
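
The ``neural_factory.train`` call in ``speaker_reco.py`` above schedules the learning rate with ``CosineAnnealing`` over ``num_epochs * steps_per_epoch`` steps, using 10% of those steps as warmup. As a rough illustration of the resulting schedule shape only (not NeMo's ``CosineAnnealing`` implementation, and with placeholder epoch/step counts), the per-step learning rate can be sketched as:

.. code-block:: python

    import math

    def cosine_annealing_lr(step, total_steps, warmup_steps, base_lr, min_lr=0.0):
        """Illustrative schedule: linear warmup followed by cosine decay."""
        if step < warmup_steps:
            return base_lr * (step + 1) / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    # e.g. 25 epochs at 1000 steps per epoch with the 10% warmup requested above
    total = 25 * 1000
    lrs = [cosine_annealing_lr(s, total, warmup_steps=int(0.1 * total), base_lr=0.01) for s in range(total)]
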
+ +import argparse +import copy +import json +import os + +import numpy as np +from ruamel.yaml import YAML + +import nemo +import nemo.collections.asr as nemo_asr +import nemo.utils.argparse as nm_argparse + +logging = nemo.logging + + +def parse_args(): + parser = argparse.ArgumentParser( + parents=[nm_argparse.NemoArgParser()], description='SpeakerRecognition', conflict_handler='resolve', + ) + parser.set_defaults( + checkpoint_dir=None, + optimizer="novograd", + batch_size=32, + eval_batch_size=64, + lr=0.01, + weight_decay=0.001, + amp_opt_level="O0", + create_tb_writer=True, + ) + + # Overwrite default args + parser.add_argument( + "--num_epochs", + type=int, + default=None, + required=True, + help="number of epochs to train. You should specify either num_epochs or max_steps", + ) + parser.add_argument( + "--model_config", type=str, required=True, help="model configuration file: model.yaml", + ) + + # Create new args + parser.add_argument("--exp_name", default="SpkrReco_GramMatrix", type=str) + parser.add_argument("--beta1", default=0.95, type=float) + parser.add_argument("--beta2", default=0.5, type=float) + parser.add_argument("--warmup_steps", default=1000, type=int) + parser.add_argument("--load_dir", default=None, type=str) + parser.add_argument("--synced_bn", action='store_true', help="Use synchronized batch norm") + parser.add_argument("--synced_bn_groupsize", default=0, type=int) + parser.add_argument("--emb_size", default=256, type=int) + parser.add_argument("--print_freq", default=256, type=int) + + args = parser.parse_args() + if args.max_steps is not None: + raise ValueError("QuartzNet uses num_epochs instead of max_steps") + + return args + + +def construct_name(name, lr, batch_size, num_epochs, wd, optimizer, emb_size): + return "{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-embsize_{6}".format( + name, lr, batch_size, num_epochs, wd, optimizer, emb_size + ) + + +def create_all_dags(args, neural_factory): + ''' + creates train and eval dags as well as their callbacks + returns train loss tensor and callbacks''' + + # parse the config files + yaml = YAML(typ="safe") + with open(args.model_config) as f: + spkr_params = yaml.load(f) + + sample_rate = spkr_params['sample_rate'] + + # Calculate num_workers for dataloader + total_cpus = os.cpu_count() + cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) + + # create separate data layers for eval + # we need separate eval dags for separate eval datasets + # but all other modules in these dags will be shared + + eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"]) + eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"]) + del eval_dl_params["train"] + del eval_dl_params["eval"] + eval_dl_params['shuffle'] = False # To grab the file names without changing data_layer + + data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer( + manifest_filepath=args.eval_datasets[0], + labels=None, + batch_size=args.batch_size, + num_workers=cpu_per_traindl, + **eval_dl_params, + # normalize_transcripts=False + ) + # create shared modules + + data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( + sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"], + ) + + # (QuartzNet uses the Jasper baseline encoder and decoder) + encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],) + + decoder = nemo_asr.JasperDecoderForSpkrClass( + feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'], + num_classes=254, + 
emb_sizes=spkr_params['JasperDecoderForSpkrClass']['emb_sizes'].split(','), + pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'], + ) + + # --- Assemble Validation DAG --- # + audio_signal_test, audio_len_test, label_test, _ = data_layer_test() + + processed_signal_test, processed_len_test = data_preprocessor( + input_signal=audio_signal_test, length=audio_len_test + ) + + encoded_test, _ = encoder(audio_signal=processed_signal_test, length=processed_len_test) + + _, embeddings = decoder(encoder_output=encoded_test) + + return embeddings, label_test + + +def main(): + args = parse_args() + + print(args) + + name = construct_name( + args.exp_name, args.lr, args.batch_size, args.num_epochs, args.weight_decay, args.optimizer, args.emb_size + ) + work_dir = name + if args.work_dir: + work_dir = os.path.join(args.work_dir, name) + + # instantiate Neural Factory with supported backend + neural_factory = nemo.core.NeuralModuleFactory( + backend=nemo.core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=work_dir, + checkpoint_dir=args.checkpoint_dir + "/" + args.exp_name, + create_tb_writer=False, + files_to_copy=[args.model_config, __file__], + random_seed=42, + cudnn_benchmark=args.cudnn_benchmark, + ) + args.num_gpus = neural_factory.world_size + + args.checkpoint_dir = neural_factory.checkpoint_dir + + if args.local_rank is not None: + logging.info('Doing ALL GPU') + + # build dags + embeddings, label_test = create_all_dags(args, neural_factory) + + eval_tensors = neural_factory.infer(tensors=[embeddings, label_test], checkpoint_dir=args.checkpoint_dir) + # inf_loss , inf_emb, inf_logits, inf_label = eval_tensors + inf_emb, inf_label = eval_tensors + whole_embs = [] + whole_labels = [] + manifest = open(args.eval_datasets[0], 'r').readlines() + + for line in manifest: + line = line.strip() + dic = json.loads(line) + filename = dic['audio_filepath'].split('/')[-1] + whole_labels.append(filename) + + for idx in range(len(inf_label)): + whole_embs.extend(inf_emb[idx].numpy()) + + embedding_dir = args.work_dir + './embeddings/' + if not os.path.exists(embedding_dir): + os.mkdir(embedding_dir) + + filename = os.path.basename(args.eval_datasets[0]).split('.')[0] + name = embedding_dir + filename + + np.save(name + '.npy', np.asarray(whole_embs)) + np.save(name + '_labels.npy', np.asarray(whole_labels)) + logging.info("Saved embedding files to {}".format(embedding_dir)) + + +if __name__ == '__main__': + main() diff --git a/examples/speaker_recognition/train_plda.sh b/examples/speaker_recognition/train_plda.sh new file mode 100755 index 000000000000..6466dfc81dd5 --- /dev/null +++ b/examples/speaker_recognition/train_plda.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +lda_dim=200 +DIR=$1 +stage=$2 +train_scp=$DIR/train.scp +dev_scp=$DIR/dev.scp + +trail_file=$DIR/trials_1m +cd $KALDI_ROOT/egs/voxceleb/v2 +. path.sh +. 
cmd.sh +cd - + +if [ $stage -le 1 ]; then + ivector-mean scp:$train_scp $DIR/mean.vec + + echo "Training LDA" + $train_cmd $DIR/log ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:$train_scp ark:- |" \ + ark:$DIR/utt2spk $DIR/transform.mat || exit 1; + + echo "TRAINING PLDA" + $train_cmd $DIR/log ivector-compute-plda ark:$DIR/spk2utt \ + "ark:ivector-subtract-global-mean scp:$train_scp ark:- | transform-vec $DIR/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" $DIR/plda || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "SCORING" + sed 's/{}/average/' $trail_file > $DIR/temp_trail + $train_cmd $DIR/log ivector-plda-scoring --normalize-length=true \ + "ivector-copy-plda --smoothing=0.0 $DIR/plda - |" \ + "ark:ivector-subtract-global-mean $DIR/mean.vec scp:$dev_scp ark:- | transform-vec $DIR/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean $DIR/mean.vec scp:$dev_scp ark:- | transform-vec $DIR/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$DIR/temp_trail' | cut -d\ --fields=1,2 |" $DIR/scores || exit 1; + + paste -d' ' <(awk '{print $3}' $DIR/scores) <(awk '{print $3}' $trail_file) > $DIR/final_score + + eer=`compute-eer <($KALDI_ROOT/egs/voxceleb/v2/local/prepare_for_eer.py $DIR/temp_trail $DIR/scores) 2> /dev/null` + # eer=`compute-eer $DIR/final_score 2> /dev/null` + mindcf=`$KALDI_ROOT/egs/voxceleb/v2/sid/compute_min_dcf.py $DIR/scores $DIR/temp_trail 2> /dev/null` + echo "EER: $eer%" + echo "minDCF: $mindcf" + +fi diff --git a/nemo/backends/__init__.py b/nemo/backends/__init__.py index d4611fa8af89..dfb329d0d74e 100644 --- a/nemo/backends/__init__.py +++ b/nemo/backends/__init__.py @@ -1 +1,22 @@ -from . import pytorch +# ============================================================================= +# Copyright (c) 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from .load_backend import backend + +# Load backend specific classes, functions etc. +if backend() == 'pytorch': + from .torch_backend import save, load, get_state_dict, set_state_dict + from . import pytorch diff --git a/nemo/backends/load_backend.py b/nemo/backends/load_backend.py new file mode 100644 index 000000000000..8b1c2db802af --- /dev/null +++ b/nemo/backends/load_backend.py @@ -0,0 +1,26 @@ +# ============================================================================= +# Copyright (c) 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
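
As a quick sanity check before running the Kaldi PLDA recipe in ``train_plda.sh`` above, the embedding files written by ``spkr_get_emb.py`` can be scored with plain cosine similarity. The sketch below assumes the ``<name>.npy`` / ``<name>_labels.npy`` naming produced above; the directory and the trial-file format (two utterance names per line) are placeholders, not part of this PR.

.. code-block:: python

    import numpy as np

    embs = np.load("embeddings/dev.npy")          # embeddings written by spkr_get_emb.py
    names = np.load("embeddings/dev_labels.npy")  # one WAV basename per embedding
    emb_by_name = {n: e / np.linalg.norm(e) for n, e in zip(names, embs)}

    with open("trials.txt") as f:                 # hypothetical: "utt1.wav utt2.wav" per line
        for line in f:
            utt1, utt2 = line.split()[:2]
            print(utt1, utt2, float(np.dot(emb_by_name[utt1], emb_by_name[utt2])))
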
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# Set the default backend to PyTorch. +_BACKEND = 'pytorch' + + +def backend() -> str: + """ + Returns: + Name of the currently used backend. + """ + return _BACKEND diff --git a/nemo/backends/pytorch/actions.py b/nemo/backends/pytorch/actions.py index cae2f6740da9..47240c22f47a 100644 --- a/nemo/backends/pytorch/actions.py +++ b/nemo/backends/pytorch/actions.py @@ -1003,22 +1003,6 @@ def deployment_export(module, output: str, d_format: DeploymentFormat, input_exa output_example=output_example, ) - def _check_nan_or_inf(self, placement_gpu, nan_or_inf, steps_per_nan_check=None): - # Note that nan_or_inf only gets set if stop_on_nan loss is True, or if using O0/not using apex.amp. - if not placement_gpu: - return - if steps_per_nan_check is None or self.step % steps_per_nan_check == 0: - world_size = dist.get_world_size() - # We use dtype=int because nccl backend doesn't support torch.bool - nan_inf_tensor = torch.tensor(nan_or_inf, dtype=int).cuda() - nan_inf_results = [] - for _ in range(world_size): - nan_inf_results.append(torch.empty_like(nan_inf_tensor)) - dist.all_gather(nan_inf_results, nan_inf_tensor) - for nan_inf in nan_inf_results: - if nan_inf: - raise ValueError('Terminating due to previous NaN or inf.') - def train( self, tensors_to_optimize=None, @@ -1029,7 +1013,6 @@ def train( lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, - steps_per_nan_check=100, synced_batchnorm=False, synced_batchnorm_groupsize=0, gradient_predivide=False, @@ -1424,8 +1407,6 @@ def save_state_to(self, path): # Do action start callbacks _perform_on_action_start(callbacks, get_state(self)) - nan_or_inf = False - # MAIN TRAINING LOOP # iteration over epochs while num_epochs is None or self.epoch_num < num_epochs: @@ -1489,26 +1470,22 @@ def save_state_to(self, path): curr_tensors_to_optimize = training_loop[self.step % len(training_loop)][1] final_loss = 0 for tensor in curr_tensors_to_optimize: - if ( - torch.isnan(self._training_state.tensor_dict[tensor.unique_name]).any() - or torch.isinf(self._training_state.tensor_dict[tensor.unique_name]).any() - ): - if ( - (stop_on_nan_loss) - or (self._optim_level not in AmpOptimizations) - or (self._optim_level == Optimization.mxprO0) - ): - # Set flag here and terminate at next all_gather check. 
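
The hunk continuing below replaces this per-worker flag with a check that synchronizes a copy of the loss across workers and then raises, warns, or skips the step. A standalone sketch of that all-reduce pattern (assuming ``torch.distributed`` is already initialized and, with NCCL, that the loss tensor lives on the local GPU):

.. code-block:: python

    import torch
    import torch.distributed as dist

    def loss_is_nan_or_inf_anywhere(loss: torch.Tensor) -> bool:
        """Return True on every rank if any rank produced a NaN/inf loss."""
        checker = loss.detach().clone()
        dist.all_reduce(checker, op=dist.ReduceOp.MAX)  # same reduction the new check uses
        return bool(torch.isnan(checker).any() or torch.isinf(checker).any())
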
- nan_or_inf = True - logging.warning( - 'Loss is NaN or inf at step %d, will terminate within the' - ' next steps_per_nan_check steps', - self.step, - ) - else: - logging.warning('Loss is NaN or inf, continuing training') final_loss += self._training_state.tensor_dict[tensor.unique_name] + # Check for NaN/inf loss (across workers if applicable) + loss_nan_inf_checker = final_loss.clone() + if placement_gpu: + dist.all_reduce(loss_nan_inf_checker, torch.distributed.ReduceOp.MAX) + if torch.isnan(loss_nan_inf_checker).any() or torch.isinf(loss_nan_inf_checker).any(): + if stop_on_nan_loss: + raise ValueError('Loss is NaN or inf - exiting') + if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: + logging.warning('Loss is NaN or inf.') + else: + # Skip this step across workers if loss is NaN/inf and using fp32 + logging.warning('Loss is NaN or inf. Skipping update.') + continue + if self._optim_level in AmpOptimizations and self._optim_level != Optimization.mxprO0: with amp.scale_loss(final_loss, curr_optimizer, delay_unscale=disable_allreduce) as scaled_loss: if disable_allreduce: @@ -1531,15 +1508,12 @@ def save_state_to(self, path): final_loss.backward(bps_scale.to(final_loss.get_device())) # single device (CPU or GPU) else: - # Fix (workaround?) enabling to backpropagate gradiens on CPUs. + # Fix (workaround?) enabling to backpropagate gradients on CPUs. if final_loss.get_device() < 0: final_loss.backward(bps_scale) else: final_loss.backward(bps_scale.to(final_loss.get_device())) - # Check if we should terminate due to NaN/inf on any workers. - self._check_nan_or_inf(placement_gpu, nan_or_inf, steps_per_nan_check=steps_per_nan_check) - batch_counter += 1 if batch_counter == batches_per_step: @@ -1559,10 +1533,6 @@ def save_state_to(self, path): # Register epochs end with callbacks _perform_on_epoch_end(callbacks, get_state(self)) self.epoch_num += 1 - - # Check again if we should stop on NaN/inf - self._check_nan_or_inf(placement_gpu, nan_or_inf) - _perform_on_action_end(callbacks, get_state(self)) def infer( diff --git a/nemo/backends/pytorch/nm.py b/nemo/backends/pytorch/nm.py index 734ab9d60b50..7b06fb1104ab 100644 --- a/nemo/backends/pytorch/nm.py +++ b/nemo/backends/pytorch/nm.py @@ -6,7 +6,7 @@ import torch as t import torch.nn as nn -from nemo.core import DeviceType, NeuralModule, WeightShareTransform +from nemo.core import DeviceType, ModuleType, NeuralModule, WeightShareTransform from nemo.utils.helpers import get_cuda_device, rgetattr, rsetattr @@ -35,8 +35,12 @@ def __init__(self): """ def __init__(self, pretrained_model_name=None, name=None): - NeuralModule.__init__(self, name) # For NeuralModule API + # Initialize nn.Module first - important for the inspect during the init_params collection. nn.Module.__init__(self) # For PyTorch API + NeuralModule.__init__(self, name) # For NeuralModule API + + # Set module type. + self._type = ModuleType.trainable self._device = get_cuda_device(self.placement) @@ -132,6 +136,8 @@ class NonTrainableNM(NeuralModule): def __init__(self, name=None): NeuralModule.__init__(self, name) # For NeuralModule API self._device = get_cuda_device(self.placement) + # Set module type. + self._type = ModuleType.nontrainable def __call__(self, force_pt=False, *input, **kwargs): pt_call = len(input) > 0 or force_pt @@ -191,6 +197,10 @@ class DataLayerNM(NeuralModule): def __init__(self, name=None): NeuralModule.__init__(self, name) # For NeuralModule API + + # Set module type. 
+ self._type = ModuleType.datalayer + self._device = get_cuda_device(self.placement) # if 'batch_size' not in kwargs: @@ -326,6 +336,10 @@ class LossNM(NeuralModule): def __init__(self, name=None): NeuralModule.__init__(self, name) # For NeuralModule API + + # Set module type. + self._type = ModuleType.loss + self._device = get_cuda_device(self.placement) def get_weights(self): diff --git a/nemo/backends/torch_backend.py b/nemo/backends/torch_backend.py new file mode 100644 index 000000000000..cbc1730a2275 --- /dev/null +++ b/nemo/backends/torch_backend.py @@ -0,0 +1,71 @@ +# ============================================================================= +# Copyright (c) 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from os.path import expanduser +from typing import Any, Dict + +import torch + + +def save(checkpoint: Dict[str, Any], filename: str) -> None: + """ + A proxy function that saves the checkpoint to a given file. + + Args: + checkpoint: Checkpoint to be stored. + filename: Name of the file containing checkpoint. + """ + # Get the absolute path and save. + abs_filename = expanduser(filename) + torch.save(checkpoint, abs_filename) + + +def load(filename: str) -> Dict[str, Any]: + """ + A proxy function that loads checkpoint from a given file. + + Args: + filename: Name of the file containing checkpoint. + Returns: + Loaded checkpoint. + """ + # Get the absolute path and save. + abs_filename = expanduser(filename) + # Use map location to be able to load CUDA-trained modules on CPU. + return torch.load(abs_filename, map_location=lambda storage, loc: storage) + + +def get_state_dict(model: torch.nn.Module) -> Dict[str, Any]: + """ + A proxy function that gets the state dictionary. + + Args: + model: Torch model. + Returns: + State dictionary containing model weights. + """ + return model.state_dict() + + +def set_state_dict(model: torch.nn.Module, state_dict: Dict[str, Any]) -> None: + """ + A proxy function that sets the state dictionary. + + Args: + model: Torch model. + state_dict: State dictionary containing model weights. 
+ """ + model.load_state_dict(state_dict) diff --git a/nemo/collections/asr/__init__.py b/nemo/collections/asr/__init__.py index afc4d7ecbf31..b1fe49531777 100644 --- a/nemo/collections/asr/__init__.py +++ b/nemo/collections/asr/__init__.py @@ -14,6 +14,7 @@ # ============================================================================= from .audio_preprocessing import * from .beam_search_decoder import BeamSearchDecoderWithLM +from .contextnet import ContextNetDecoderForCTC, ContextNetEncoder from .data_layer import ( AudioToSpeechLabelDataLayer, AudioToTextDataLayer, @@ -22,7 +23,7 @@ TranscriptDataLayer, ) from .greedy_ctc_decoder import GreedyCTCDecoder -from .jasper import JasperDecoderForClassification, JasperDecoderForCTC, JasperEncoder +from .jasper import JasperDecoderForClassification, JasperDecoderForCTC, JasperDecoderForSpkrClass, JasperEncoder from .las.misc import JasperRNNConnector from .losses import CTCLossNM from nemo.backends.pytorch.common.losses import CrossEntropyLossNM @@ -48,7 +49,10 @@ 'JasperEncoder', 'JasperDecoderForCTC', 'JasperDecoderForClassification', + 'JasperDecoderForSpkrClass', 'JasperRNNConnector', + 'ContextNetEncoder', + 'ContextNetDecoderForCTC', 'CTCLossNM', 'CrossEntropyLossNM', ] diff --git a/nemo/collections/asr/contextnet.py b/nemo/collections/asr/contextnet.py new file mode 100644 index 000000000000..c09be485d67a --- /dev/null +++ b/nemo/collections/asr/contextnet.py @@ -0,0 +1,213 @@ +# Copyright (c) 2019 NVIDIA Corporation +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import nemo +from .jasper import JasperEncoder +from .parts.jasper import init_weights +from nemo.backends.pytorch.nm import TrainableNM +from nemo.core.neural_types import * +from nemo.utils.decorators import add_port_docs + +logging = nemo.logging + + +class ContextNetEncoder(JasperEncoder): + """ + ContextNet Encoder creates the pre-processing (prologue), QuartzNet convolution + block, and the additional pre and post processing layers as described in + ContextNet (https://arxiv.org/abs/2005.03191) + + Args: + jasper (list): A list of dictionaries. Each element in the list + represents the configuration of one Jasper Block. Each element + should contain:: + + { + # Required parameters + 'filters' (int) # Number of output channels, + 'repeat' (int) # Number of sub-blocks, + 'kernel' (int) # Size of conv kernel, + 'stride' (int) # Conv stride + 'dilation' (int) # Conv dilation + 'dropout' (float) # Dropout probability + 'residual' (bool) # Whether to use residual or not. + # Optional parameters + 'residual_dense' (bool) # Whether to use Dense Residuals + # or not. 'residual' must be True for 'residual_dense' + # to be enabled. + # Defaults to False. + 'separable' (bool) # Whether to use separable convolutions. + # Defaults to False + 'groups' (int) # Number of groups in each conv layer. + # Defaults to 1 + 'heads' (int) # Sharing of separable filters + # Defaults to -1 + 'tied' (bool) # Whether to use the same weights for all + # sub-blocks. + # Defaults to False + 'se' (bool) # Whether to add Squeeze and Excitation + # sub-blocks. + # Defaults to False + 'se_reduction_ratio' (int) # The reduction ratio of the Squeeze + # sub-module. + # Must be an integer > 1. + # Defaults to 8. + 'se_context_window' (int) # The size of the temporal context + # provided to SE sub-module. + # Must be an integer. If value <= 0, will perform global + # temporal pooling (global context). 
+ # If value >= 1, will perform stride 1 average pooling to + # compute context window. + 'se_interpolation_mode' (str) # Interpolation mode of timestep dimension. + # Used only if context window is > 1. + # The modes available for resizing are: `nearest`, `linear` (3D-only), + # `bilinear`, `area` + 'kernel_size_factor' (float) # Conv kernel size multiplier + # Can be either an int or float + # Kernel size is recomputed as below: + # new_kernel_size = int(max(1, (kernel_size * kernel_width))) + # to prevent kernel sizes than 1. + # Note: If rescaled kernel size is an even integer, + # adds 1 to the rescaled kernel size to allow "same" + # padding. + 'stride_last' (bool) # Bool flag to determine whether each + # of the the repeated sub-blockss will perform a stride, + # or only the last sub-block will perform a strided convolution. + } + + activation (str): Activation function used for each sub-blocks. Can be + one of ["hardtanh", "relu", "selu", "swish"]. + feat_in (int): Number of channels being input to this module + normalization_mode (str): Normalization to be used in each sub-block. + Can be one of ["batch", "layer", "instance", "group"] + Defaults to "batch". + residual_mode (str): Type of residual connection. + Can be "add", "stride_add" or "max". + "stride_add" mode performs strided convolution prior to residual + addition. + Defaults to "add". + norm_groups (int): Number of groups for "group" normalization type. + If set to -1, number of channels is used. + Defaults to -1. + conv_mask (bool): Controls the use of sequence length masking prior + to convolutions. + Defaults to True. + frame_splicing (int): Defaults to 1. + init_mode (str): Describes how neural network parameters are + initialized. Options are ['xavier_uniform', 'xavier_normal', + 'kaiming_uniform','kaiming_normal']. + Defaults to "xavier_uniform". + """ + + length: Optional[torch.Tensor] + + @property + @add_port_docs() + def input_ports(self): + """Returns definitions of module input ports. + """ + return { + # "audio_signal": NeuralType( + # {0: AxisType(BatchTag), 1: AxisType(SpectrogramSignalTag), 2: AxisType(ProcessedTimeTag),} + # ), + # "length": NeuralType({0: AxisType(BatchTag)}), + "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), + "length": NeuralType(tuple('B'), LengthsType()), + } + + @property + @add_port_docs() + def output_ports(self): + """Returns definitions of module output ports. + """ + return { + # "outputs": NeuralType( + # {0: AxisType(BatchTag), 1: AxisType(EncodedRepresentationTag), 2: AxisType(ProcessedTimeTag),} + # ), + # "encoded_lengths": NeuralType({0: AxisType(BatchTag)}), + "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + } + + def __init__( + self, + jasper: List[Dict[str, Any]], + activation: str, + feat_in: int, + normalization_mode: str = "batch", + residual_mode: str = "add", + norm_groups: int = -1, + conv_mask: bool = False, + frame_splicing: int = 1, + init_mode: str = 'xavier_uniform', + ): + super().__init__( + jasper=jasper, + activation=activation, + feat_in=feat_in, + normalization_mode=normalization_mode, + residual_mode=residual_mode, + norm_groups=norm_groups, + conv_mask=conv_mask, + frame_splicing=frame_splicing, + init_mode=init_mode, + ) + + +class ContextNetDecoderForCTC(TrainableNM): + """ + ContextNet Decoder creates the final layer in ContextNet that maps from the outputs + of ContextNet Encoder to the vocabulary of interest. 
+ + Args: + feat_in (int): Number of channels being input to this module + num_classes (int): Number of characters in ASR model's vocab/labels. + This count should not include the CTC blank symbol. + hidden_size (int): Number of units in the hidden state of the LSTM RNN. + init_mode (str): Describes how neural network parameters are + initialized. Options are ['xavier_uniform', 'xavier_normal', + 'kaiming_uniform','kaiming_normal']. + Defaults to "xavier_uniform". + """ + + @property + @add_port_docs() + def input_ports(self): + """Returns definitions of module input ports. + """ + return { + # "encoder_output": NeuralType( + # {0: AxisType(BatchTag), 1: AxisType(EncodedRepresentationTag), 2: AxisType(ProcessedTimeTag),} + # ) + "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()) + } + + @property + @add_port_docs() + def output_ports(self): + """Returns definitions of module output ports. + """ + # return {"output": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"output": NeuralType(('B', 'T', 'D'), LogprobsType())} + + def __init__(self, feat_in: int, num_classes: int, hidden_size: int = 640, init_mode: str = "xavier_uniform"): + super().__init__() + + self._feat_in = feat_in + # Add 1 for blank char + self._num_classes = num_classes + 1 + + self.rnn = nn.LSTM(feat_in, hidden_size, bias=True, batch_first=True) + self.clf = nn.Linear(hidden_size, self._num_classes) + self.clf.apply(lambda x: init_weights(x, mode=init_mode)) + self.to(self._device) + + def forward(self, encoder_output): + encoder_output = encoder_output.transpose(1, 2) # [B, T, D] + output, states = self.rnn(encoder_output) + logits = self.clf(output) + return F.log_softmax(logits, dim=-1) diff --git a/nemo/collections/asr/data_layer.py b/nemo/collections/asr/data_layer.py index f80c92aa3476..ab94c70d53e4 100644 --- a/nemo/collections/asr/data_layer.py +++ b/nemo/collections/asr/data_layer.py @@ -26,7 +26,14 @@ import nemo from .parts.collections import ASRAudioText -from .parts.dataset import AudioDataset, AudioLabelDataset, KaldiFeatureDataset, TranscriptDataset, seq_collate_fn +from .parts.dataset import ( + AudioDataset, + AudioLabelDataset, + KaldiFeatureDataset, + TranscriptDataset, + fixed_seq_collate_fn, + seq_collate_fn, +) from .parts.features import WaveformFeaturizer from .parts.parsers import make_parser from .parts.perturb import AudioAugmentor, perturbation_types @@ -897,6 +904,7 @@ class AudioToSpeechLabelDataLayer(DataLayerNM): the range [0, 1] of this augmentation being applied. If this keyword is not present, then the augmentation is disabled and a warning is logged. 
+ time_length (int): max seconds to consider in a batch # Pass this only for speaker recognition task """ @property @@ -926,6 +934,7 @@ def __init__( drop_last: bool = False, load_audio: bool = True, augmentor: Optional[Union[AudioAugmentor, Dict[str, Dict[str, Any]]]] = None, + time_length: int = 0, ): super(AudioToSpeechLabelDataLayer, self).__init__() @@ -949,6 +958,9 @@ def __init__( } self._dataset = AudioLabelDataset(**dataset_params) + self.num_classes = self._dataset.num_commands + logging.info("# of classes :{}".format(self.num_classes)) + self.labels = self._dataset.labels # Set up data loader if self._placement == DeviceType.AllGpu: logging.info("Parallelizing Datalayer.") @@ -956,10 +968,15 @@ def __init__( else: sampler = None + if time_length: + collate_func = partial(fixed_seq_collate_fn, fixed_length=time_length * self._sample_rate) + else: + collate_func = partial(seq_collate_fn, token_pad_value=0) + self._dataloader = torch.utils.data.DataLoader( dataset=self._dataset, batch_size=batch_size, - collate_fn=partial(seq_collate_fn, token_pad_value=0), + collate_fn=collate_func, drop_last=drop_last, shuffle=shuffle if sampler is None else False, sampler=sampler, diff --git a/nemo/collections/asr/helpers.py b/nemo/collections/asr/helpers.py index a2c7d0210470..7734b48b9ee7 100644 --- a/nemo/collections/asr/helpers.py +++ b/nemo/collections/asr/helpers.py @@ -270,7 +270,7 @@ def process_classification_evaluation_epoch(global_vars: dict, eval_metric=None, logs = {f"Evaluation_Loss {tag}": eloss} - logging.info(f"==========>>>>>>Evaluation Loss {tag}: {eloss}") + logging.info(f"==========>>>>>>Evaluation Loss {tag}: {eloss:.3f}") for k, acc in zip(top_k, topk_accs): logging.info(f"==========>>>>>>Evaluation Accuracy Top@{k} {tag}: {acc * 100.:3.4f}") logs[f'Evaluation_Accuracy_Top@{k} {tag}'] = acc * 100.0 diff --git a/nemo/collections/asr/jasper.py b/nemo/collections/asr/jasper.py index deab3945e812..b5de8f6b7af4 100644 --- a/nemo/collections/asr/jasper.py +++ b/nemo/collections/asr/jasper.py @@ -6,7 +6,7 @@ import torch.nn.functional as F import nemo -from .parts.jasper import JasperBlock, init_weights, jasper_activations +from .parts.jasper import JasperBlock, StatsPoolLayer, init_weights, jasper_activations from nemo.backends.pytorch.nm import TrainableNM from nemo.core.neural_types import * from nemo.utils.decorators import add_port_docs @@ -54,7 +54,17 @@ class JasperEncoder(TrainableNM): 'se_reduction_ratio' (int) # The reduction ratio of the Squeeze # sub-module. # Must be an integer > 1. - # Defaults to 16 + # Defaults to 8. + 'se_context_window' (int) # The size of the temporal context + # provided to SE sub-module. + # Must be an integer. If value <= 0, will perform global + # temporal pooling (global context). + # If value >= 1, will perform stride 1 average pooling to + # compute context window. + 'se_interpolation_mode' (str) # Interpolation mode of timestep dimension. + # Used only if context window is > 1. + # The modes available for resizing are: `nearest`, `linear` (3D-only), + # `bilinear`, `area` 'kernel_size_factor' (float) # Conv kernel size multiplier # Can be either an int or float # Kernel size is recomputed as below: @@ -63,16 +73,21 @@ class JasperEncoder(TrainableNM): # Note: If rescaled kernel size is an even integer, # adds 1 to the rescaled kernel size to allow "same" # padding. 
+                'stride_last' (bool) # Bool flag to determine whether each
+                    # of the repeated sub-blocks will perform a stride,
+                    # or only the last sub-block will perform a strided convolution.
            }

    activation (str): Activation function used for each sub-blocks. Can be
-        one of ["hardtanh", "relu", "selu"].
+        one of ["hardtanh", "relu", "selu", "swish"].
    feat_in (int): Number of channels being input to this module
    normalization_mode (str): Normalization to be used in each sub-block.
        Can be one of ["batch", "layer", "instance", "group"]
        Defaults to "batch".
    residual_mode (str): Type of residual connection.
-        Can be "add" or "max".
+        Can be "add", "stride_add" or "max".
+        "stride_add" mode performs strided convolution prior to residual
+        addition.
        Defaults to "add".
    norm_groups (int): Number of groups for "group" normalization type.
        If set to -1, number of channels is used.
@@ -162,9 +177,13 @@ def __init__(
            groups = lcfg.get('groups', 1)
            separable = lcfg.get('separable', False)
            heads = lcfg.get('heads', -1)
+            residual_mode = lcfg.get('residual_mode', residual_mode)
            se = lcfg.get('se', False)
-            se_reduction_ratio = lcfg.get('se_reduction_ratio', 16)
+            se_reduction_ratio = lcfg.get('se_reduction_ratio', 8)
+            se_context_window = lcfg.get('se_context_window', -1)
+            se_interpolation_mode = lcfg.get('se_interpolation_mode', 'nearest')
            kernel_size_factor = lcfg.get('kernel_size_factor', 1.0)
+            stride_last = lcfg.get('stride_last', False)
            encoder_layers.append(
                JasperBlock(
                    feat_in,
@@ -186,7 +205,10 @@ def __init__(
                    conv_mask=conv_mask,
                    se=se,
                    se_reduction_ratio=se_reduction_ratio,
+                    se_context_window=se_context_window,
+                    se_interpolation_mode=se_interpolation_mode,
                    kernel_size_factor=kernel_size_factor,
+                    stride_last=stride_last,
                )
            )
            feat_in = lcfg['filters']
@@ -317,3 +339,188 @@ def forward(self, encoder_output):
            return logits

        return F.softmax(logits, dim=-1)
+
+
+class JasperDecoderForSpkrClass(TrainableNM):
+    """
+    Jasper Decoder creates the final layer in Jasper that maps from the outputs
+    of Jasper Encoder to the embedding layer followed by a speaker-based softmax loss.
+
+    Args:
+        feat_in (int): Number of channels being input to this module
+        num_classes (int): Number of unique speakers in dataset
+        emb_sizes (list): shapes of intermediate embedding layers (speaker embeddings are taken from the first of these layers)
+            Defaults to [1024, 1024]
+        pool_mode (str): Pooling strategy type. Options are 'gram', 'xvector', 'superVector'.
+            Defaults to 'xvector'
+        init_mode (str): Describes how neural network parameters are
+            initialized. Options are ['xavier_uniform', 'xavier_normal',
+            'kaiming_uniform','kaiming_normal'].
+            Defaults to "xavier_uniform".
+    """
+
+    @property
+    def input_ports(self):
+        """Returns definitions of module input ports.
+
+        encoder_output:
+            0: AxisType(BatchTag)
+
+            1: AxisType(EncodedRepresentationTag)
+
+            2: AxisType(ProcessedTimeTag)
+        """
+
+        return {"encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())}
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+ + logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + embs: + 0: AxisType(BatchTag) + 1: AxisType(EncodedRepresentationTah) + """ + return { + "logits": NeuralType(('B', 'D'), LogitsType()), + "embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), + } + + def __init__(self, feat_in, num_classes, emb_sizes=[1024, 1024], pool_mode='xvector', init_mode="xavier_uniform"): + TrainableNM.__init__(self) + self._feat_in = 0 + if pool_mode == 'gram': + gram = True + super_vector = False + elif pool_mode == 'superVector': + gram = True + super_vector = True + else: + gram = False + super_vector = False + + if gram: + self._feat_in += feat_in ** 2 + else: + self._feat_in += 2 * feat_in + + if super_vector and gram: + self._feat_in += 2 * feat_in + + self._midEmbd1 = int(emb_sizes[0]) # Spkr Vector Embedding Shape + self._midEmbd2 = int(emb_sizes[1]) if len(emb_sizes) > 1 else 0 # Spkr Vector Embedding Shape + + self._num_classes = num_classes + self._pooling = StatsPoolLayer(gram=gram, super_vector=super_vector) + + self.mid1 = self.affineLayer(self._feat_in, self._midEmbd1, learn_mean=False) + self.mid2 = self.affineLayer(self._midEmbd1, self._midEmbd2, learn_mean=False) + self.final = nn.Linear(self._midEmbd2, self._num_classes) + + self.apply(lambda x: init_weights(x, mode=init_mode)) + self.to(self._device) + + def affineLayer(self, inp_shape, out_shape, learn_mean=True): + layer = nn.Sequential( + nn.Linear(inp_shape, out_shape), + nn.BatchNorm1d(out_shape, affine=learn_mean, track_running_stats=True), + nn.ReLU(), + ) + + return layer # layer, embs + + def forward(self, encoder_output): + # encoder_output = self.norm(encoder_output) + pool = self._pooling(encoder_output) + mid1, emb1 = self.mid1(pool), self.mid1[:2](pool) + mid2, embs = self.mid2(mid1), self.mid2[:2](mid1) + out = self.final(mid2) + + return out, emb1 + + +# Siamese Network, support to be added in future releases +# class SiameseDecoderForSpeakerClass(TrainableNM): +# """ +# Jasper Decoder creates the final layer in Jasper that maps from the outputs +# of Jasper Encoder to the vocabulary of interest. + +# Args: +# feat_in (int): Number of channels being input to this module +# num_classes (int): Number of characters in ASR model's vocab/labels. +# This count should not include the CTC blank symbol. +# init_mode (str): Describes how neural network parameters are +# initialized. Options are ['xavier_uniform', 'xavier_normal', +# 'kaiming_uniform','kaiming_normal']. +# Defaults to "xavier_uniform". +# """ + +# @property +# def input_ports(self): +# """Returns definitions of module input ports. + +# encoder_output: +# 0: AxisType(BatchTag) + +# 1: AxisType(EncodedRepresentationTag) + +# 2: AxisType(ProcessedTimeTag) +# """ +# return { +# "embs1": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), +# "embs2": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), +# } + +# @property +# def output_ports(self): +# """Returns definitions of module output ports. 
+ +# output: +# 0: AxisType(BatchTag) + +# 1: AxisType(ChannelTag) +# """ +# return { +# "logits": NeuralType(('B', 'D'), LogitsType()), +# } + +# def __init__(self, emb_size, mid_dim, init_mode="xavier_uniform"): +# super().__init__() +# self._feat_in = emb_size +# self._mid_dim = mid_dim + +# self.connect = self.affineLayer(self._feat_in, self._mid_dim, learn_mean=True) + +# self.S = nn.Parameter(torch.randn(self._mid_dim, self._mid_dim), requires_grad=True) +# self.b = nn.Parameter(torch.randn(1), requires_grad=True) + +# self.apply(lambda x: init_weights(x, mode=init_mode)) +# self.to(self._device) + +# def affineLayer(self, inp_shape, out_shape, learn_mean=True): +# layer = nn.Sequential( +# nn.Linear(inp_shape, out_shape), +# nn.BatchNorm1d(out_shape, affine=learn_mean, track_running_stats=True), +# nn.ReLU(), +# ) + +# return layer # layer, embs + +# def forward(self, inp_emb1, inp_emb2): + +# x = self.connect(inp_emb1) +# y = self.connect(inp_emb2) + +# out = ( +# torch.matmul(x, y.T).diag() +# - torch.matmul(torch.matmul(x, self.S), x.T).diag() +# - torch.matmul(torch.matmul(y, self.S), y.T).diag() +# + self.b +# ) + +# return out diff --git a/nemo/collections/asr/losses.py b/nemo/collections/asr/losses.py index d8714187cd2e..f3ca8f5a4d25 100644 --- a/nemo/collections/asr/losses.py +++ b/nemo/collections/asr/losses.py @@ -13,6 +13,9 @@ class CTCLossNM(LossNM): Args: num_classes (int): Number of characters in ASR model's vocab/labels. This count should not include the CTC blank symbol. + zero_infinity (bool): Whether to zero infinite losses and the associated gradients. + By default, it is False. Infinite losses mainly occur when the inputs are too + short to be aligned to the targets. """ @property @@ -41,11 +44,11 @@ def output_ports(self): # return {"loss": NeuralType(None)} return {"loss": NeuralType(elements_type=LossType())} - def __init__(self, num_classes): + def __init__(self, num_classes, zero_infinity=False): super().__init__() self._blank = num_classes - self._criterion = nn.CTCLoss(blank=self._blank, reduction='none') + self._criterion = nn.CTCLoss(blank=self._blank, reduction='none', zero_infinity=zero_infinity) def _loss(self, log_probs, targets, input_length, target_length): input_length = input_length.long() diff --git a/nemo/collections/asr/parts/collections.py b/nemo/collections/asr/parts/collections.py index a71fc396d53f..c28f2d9f3e30 100644 --- a/nemo/collections/asr/parts/collections.py +++ b/nemo/collections/asr/parts/collections.py @@ -233,6 +233,8 @@ def __init__( logging.info( "Filtered duration for loading collection is %f.", duration_filtered, ) + self.uniq_labels = sorted(set(map(lambda x: x.label, data))) + logging.info("# {} files loaded accounting to # {} labels".format(len(data), len(self.uniq_labels))) super().__init__(data) diff --git a/nemo/collections/asr/parts/dataset.py b/nemo/collections/asr/parts/dataset.py index b47c61ffc70f..6042a8bd3b0d 100644 --- a/nemo/collections/asr/parts/dataset.py +++ b/nemo/collections/asr/parts/dataset.py @@ -53,6 +53,55 @@ def seq_collate_fn(batch, token_pad_value=0): return audio_signal, audio_lengths, tokens, tokens_lengths +def fixed_seq_collate_fn(batch, fixed_length=16000): + """collate batch of audio sig, audio len, tokens, tokens len + + Args: + batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, + LongTensor): A tuple of tuples of signal, signal lengths, + encoded tokens, and encoded tokens length. This collate func + assumes the signals are 1d torch tensors (i.e. mono audio). 
+ fixed_length (Optional[int]): length of input signal to be considered + + """ + _, audio_lengths, _, tokens_lengths = zip(*batch) + + has_audio = audio_lengths[0] is not None + fixed_length = min(fixed_length, max(audio_lengths)) + + audio_signal, tokens = [], [] + for sig, sig_len, tokens_i, _ in batch: + if has_audio: + sig_len = sig_len.item() + chunck_len = sig_len - fixed_length + if chunck_len < 0: + # pad = (0,fixed_length-sig_len) + # signal = torch.nn.functional.pad(sig,pad) + repeat = fixed_length // sig_len + rem = fixed_length % sig_len + sub = sig[-rem:] if rem > 0 else torch.tensor([]) + rep_sig = torch.cat(repeat * [sig]) + signal = torch.cat((rep_sig, sub)) + # print(sig_len,repeat,rem,len(sub),len(rep_sig),len(signal)) + else: + start_idx = torch.randint(0, chunck_len, (1,)) if chunck_len else torch.tensor(0) + end_idx = start_idx + fixed_length + signal = sig[start_idx:end_idx] + + audio_signal.append(signal) + tokens.append(tokens_i) + + if has_audio: + audio_signal = torch.stack(audio_signal) + audio_lengths = torch.stack(audio_lengths) + else: + audio_signal, audio_lengths = None, None + tokens = torch.stack(tokens) + tokens_lengths = torch.stack(tokens_lengths) + + return audio_signal, audio_lengths, tokens, tokens_lengths + + def audio_seq_collate_fn(batch): """ Collate a batch (iterable of (sample tensor, label tensor) tuples) into @@ -355,7 +404,8 @@ class AudioLabelDataset(Dataset): Args: manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. - labels: String containing all the possible labels to map to + labels (Optional[list]): String containing all the possible labels to map to + if None then automatically picks from ASRSpeechLabel collection. featurizer: Initialized featurizer class that converts paths of audio to feature tensors max_duration: If audio exceeds this length, do not include in dataset @@ -366,7 +416,14 @@ class AudioLabelDataset(Dataset): """ def __init__( - self, manifest_filepath, labels, featurizer, max_duration=None, min_duration=None, trim=False, load_audio=True + self, + manifest_filepath, + featurizer, + labels=None, + max_duration=None, + min_duration=None, + trim=False, + load_audio=True, ): self.collection = collections.ASRSpeechLabel( manifests_files=manifest_filepath.split(','), min_duration=min_duration, max_duration=max_duration, @@ -376,14 +433,17 @@ def __init__( self.trim = trim self.load_audio = load_audio - self.labels = labels - self.num_commands = len(labels) + self.labels = labels if labels else self.collection.uniq_labels + self.num_commands = len(self.labels) self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(labels): + for label_id, label in enumerate(self.labels): self.label2id[label] = label_id self.id2label[label_id] = label + for idx in range(len(self.labels[:5])): + logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) + def __getitem__(self, index): sample = self.collection[index] if self.load_audio: diff --git a/nemo/collections/asr/parts/jasper.py b/nemo/collections/asr/parts/jasper.py index 1ec0fb75d0b9..a07fd1fb3b50 100644 --- a/nemo/collections/asr/parts/jasper.py +++ b/nemo/collections/asr/parts/jasper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
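
The new ``fixed_seq_collate_fn`` above forces every utterance in a batch to the same number of samples by tiling short clips and randomly cropping long ones. A self-contained sketch of that per-signal logic (the 8-second window and 16 kHz rate are just example values):

.. code-block:: python

    import torch

    def fit_to_fixed_length(sig: torch.Tensor, fixed_length: int) -> torch.Tensor:
        """Repeat-or-crop a 1-D signal, mirroring the logic in fixed_seq_collate_fn."""
        chunk = sig.numel() - fixed_length
        if chunk < 0:
            # Short clip: tile it, then append the tail needed to reach fixed_length.
            repeat, rem = divmod(fixed_length, sig.numel())
            tail = sig[-rem:] if rem > 0 else sig.new_empty(0)
            return torch.cat([sig.repeat(repeat), tail])
        # Long clip: take a random window of fixed_length samples.
        start = torch.randint(0, chunk, (1,)).item() if chunk > 0 else 0
        return sig[start:start + fixed_length]

    clip = torch.randn(3 * 16000)                          # 3 s of audio at 16 kHz
    assert fit_to_fixed_length(clip, 8 * 16000).numel() == 8 * 16000
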
-from typing import List, Optional, Tuple +from typing import Callable, List, Optional, Tuple import torch import torch.nn as nn @@ -65,6 +65,34 @@ def get_same_padding(kernel_size, stride, dilation): return kernel_size // 2 +class StatsPoolLayer(nn.Module): + def __init__(self, gram=False, super_vector=False): + super().__init__() + self.gram = gram + self.super = super_vector + + def forward(self, encoder_output): + + mean = encoder_output.mean(dim=-1) # Time Axis + std = encoder_output.std(dim=-1) + + pooled = torch.cat([mean, std], dim=-1) + + if self.gram: + time_len = encoder_output.shape[-1] + # encoder_output = encoder_output + cov = encoder_output.bmm(encoder_output.transpose(2, 1)) # cov matrix + cov = cov.view(cov.shape[0], -1) / (time_len) + + if self.gram and not self.super: + return cov + + if self.super and self.gram: + pooled = torch.cat([pooled, cov], dim=-1) + + return pooled + + class MaskedConv1d(nn.Module): __constants__ = ["use_conv_mask", "real_out_channels", "heads"] @@ -151,21 +179,66 @@ def forward(self, x): class SqueezeExcite(nn.Module): - def __init__(self, channels, reduction_ratio): + def __init__( + self, + channels: int, + reduction_ratio: int, + context_window: int = -1, + interpolation_mode: str = 'nearest', + activation: Optional[Callable] = None, + ): + """ + Squeeze-and-Excitation sub-module. + + Args: + channels: Input number of channels. + reduction_ratio: Reduction ratio for "squeeze" layer. + context_window: Integer number of timesteps that the context + should be computed over, using stride 1 average pooling. + If value < 1, then global context is computed. + interpolation_mode: Interpolation mode of timestep dimension. + Used only if context window is > 1. + The modes available for resizing are: `nearest`, `linear` (3D-only), + `bilinear`, `area` + activation: Intermediate activation function used. Must be a + callable activation function. 
+ """ super(SqueezeExcite, self).__init__() - self.pool = nn.AdaptiveAvgPool1d(1) + self.context_window = int(context_window) + self.interpolation_mode = interpolation_mode + + if self.context_window <= 0: + self.pool = nn.AdaptiveAvgPool1d(1) # context window = T + else: + self.pool = nn.AvgPool1d(self.context_window, stride=1) + + if activation is None: + activation = nn.ReLU(inplace=True) + self.fc = nn.Sequential( nn.Linear(channels, channels // reduction_ratio, bias=False), - nn.ReLU(inplace=True), + activation, nn.Linear(channels // reduction_ratio, channels, bias=False), - nn.Sigmoid(), ) def forward(self, x): - batch, channels, _ = x.size() - y = self.pool(x).view(batch, channels) - y = self.fc(y).view(batch, channels, 1) - return x * y.expand_as(x) + batch, channels, timesteps = x.size() + y = self.pool(x) # [B, C, T - context_window + 1] + y = y.transpose(1, 2) # [B, T - context_window + 1, C] + y = self.fc(y) # [B, T - context_window + 1, C] + y = y.transpose(1, 2) # [B, C, T - context_window + 1] + + if self.context_window > 0: + y = torch.nn.functional.interpolate(y, size=timesteps, mode=self.interpolation_mode) + + y = torch.sigmoid(y) + + return x * y + + +class Swish(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) class JasperBlock(nn.Module): @@ -194,6 +267,9 @@ def __init__( conv_mask=False, se=False, se_reduction_ratio=16, + se_context_window=None, + se_interpolation_mode='nearest', + stride_last=False, ): super(JasperBlock, self).__init__() @@ -216,12 +292,18 @@ def __init__( conv = nn.ModuleList() for _ in range(repeat - 1): + # Stride last means only the last convolution in block will have stride + if stride_last: + stride_val = [1] + else: + stride_val = stride + conv.extend( self._get_conv_bn_layer( inplanes_loop, planes, kernel_size=kernel_size, - stride=stride, + stride=stride_val, dilation=dilation, padding=padding_val, groups=groups, @@ -234,9 +316,6 @@ def __init__( conv.extend(self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) - if se and not residual: - conv.append(SqueezeExcite(planes, reduction_ratio=se_reduction_ratio)) - inplanes_loop = planes conv.extend( @@ -255,8 +334,16 @@ def __init__( ) ) - if se and not residual: - conv.append(SqueezeExcite(planes, reduction_ratio=se_reduction_ratio)) + if se: + conv.append( + SqueezeExcite( + planes, + reduction_ratio=se_reduction_ratio, + context_window=se_context_window, + interpolation_mode=se_interpolation_mode, + activation=activation, + ) + ) self.mconv = conv @@ -265,19 +352,27 @@ def __init__( if residual: res_list = nn.ModuleList() + + if residual_mode == 'stride_add': + stride_val = stride + else: + stride_val = [1] + if len(residual_panes) == 0: res_panes = [inplanes] self.dense_residual = False for ip in res_panes: res = nn.ModuleList( self._get_conv_bn_layer( - ip, planes, kernel_size=1, normalization=normalization, norm_groups=norm_groups, + ip, + planes, + kernel_size=1, + normalization=normalization, + norm_groups=norm_groups, + stride=stride_val, ) ) - if se: - res.append(SqueezeExcite(planes, reduction_ratio=se_reduction_ratio)) - res_list.append(res) self.res = res_list @@ -434,7 +529,7 @@ def forward(self, input_: Tuple[List[Tensor], Optional[Tensor]]): else: res_out = res_layer(res_out) - if self.residual_mode == 'add': + if self.residual_mode == 'add' or self.residual_mode == 'stride_add': out = out + res_out else: out = torch.max(out, res_out) @@ -445,3 +540,7 @@ def forward(self, input_: Tuple[List[Tensor], Optional[Tensor]]): return xs + [out], lens 
return [out], lens + + +# Register swish activation function +jasper_activations['swish'] = Swish diff --git a/nemo/collections/asr/parts/spectr_augment.py b/nemo/collections/asr/parts/spectr_augment.py index ff733cc2f352..a2f4bd2f587a 100755 --- a/nemo/collections/asr/parts/spectr_augment.py +++ b/nemo/collections/asr/parts/spectr_augment.py @@ -14,7 +14,12 @@ class SpecAugment(nn.Module): freq_masks - how many frequency segments should be cut time_masks - how many time segments should be cut freq_width - maximum number of frequencies to be cut in one segment - time_width - maximum number of time steps to be cut in one segment + time_width - maximum number of time steps to be cut in one segment. + Can be a positive integer or a float value in the range [0, 1]. + If positive integer value, defines maximum number of time steps + to be cut in one segment. + If a float value, defines maximum percentage of timesteps that + are cut adaptively. """ def __init__( @@ -30,10 +35,23 @@ def __init__( self.freq_width = freq_width self.time_width = time_width + if isinstance(time_width, int): + self.adaptive_temporal_width = False + else: + if time_width > 1.0 or time_width < 0.0: + raise ValueError('If `time_width` is a float value, must be in range [0, 1]') + + self.adaptive_temporal_width = True + @torch.no_grad() def forward(self, x): sh = x.shape + if self.adaptive_temporal_width: + time_width = max(1, int(sh[2] * self.time_width)) + else: + time_width = self.time_width + mask = torch.zeros(x.shape).byte() for idx in range(sh[0]): @@ -45,9 +63,9 @@ def forward(self, x): mask[idx, x_left : x_left + w, :] = 1 for i in range(self.time_masks): - y_left = int(self._rng.uniform(0, sh[2] - self.time_width)) + y_left = int(self._rng.uniform(0, sh[2] - time_width)) - w = int(self._rng.uniform(0, self.time_width)) + w = int(self._rng.uniform(0, time_width)) mask[idx, :, y_left : y_left + w] = 1 diff --git a/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py b/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py index 836e748c3370..7ace6eeea053 100644 --- a/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py +++ b/nemo/collections/nlp/data/tokenizers/tokenizer_utils.py @@ -18,6 +18,10 @@ from transformers import AlbertTokenizer, BertTokenizer, RobertaTokenizer import nemo +from nemo.collections.nlp.nm.trainables.common.megatron.megatron_utils import ( + get_megatron_vocab_file, + is_lower_cased_megatron, +) __all__ = ['MODEL_SPECIAL_TOKENS', 'TOKENIZERS', 'get_tokenizer', 'get_bert_special_tokens'] @@ -72,7 +76,7 @@ def get_tokenizer( ''' Args: tokenizer_name: sentencepiece or nemobert - pretrained_mode_name ('str'): name of the pretrained model from the hugging face list, + pretrained_mode_name ('str'): name of the pretrained model from the hugging face list or 'megatron', for example: bert-base-cased To see the list of pretrained models, use: nemo_nlp.nm.trainables.get_bert_models_list() tokenizer_model (path): only used for sentencepiece tokenizer @@ -80,6 +84,12 @@ def get_tokenizer( vocab_file (str): path to vocab file do_lower_case (bool): (whether to apply lower cased) - only applicable when tokenizer is build with vocab file ''' + if 'megatron' in pretrained_model_name: + do_lower_case = is_lower_cased_megatron(pretrained_model_name) + vocab_file = get_megatron_vocab_file(pretrained_model_name) + return nemo.collections.nlp.data.tokenizers.NemoBertTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case + ) if tokenizer_name == 'nemobert': tokenizer = 
nemo.collections.nlp.data.tokenizers.NemoBertTokenizer( diff --git a/nemo/collections/nlp/nm/trainables/common/__init__.py b/nemo/collections/nlp/nm/trainables/common/__init__.py index f04ffdbdfa8f..7ac5338dfe4a 100644 --- a/nemo/collections/nlp/nm/trainables/common/__init__.py +++ b/nemo/collections/nlp/nm/trainables/common/__init__.py @@ -14,7 +14,9 @@ # limitations under the License. # ============================================================================= +from nemo.collections.nlp.nm.trainables.common.common_utils import * from nemo.collections.nlp.nm.trainables.common.huggingface import * +from nemo.collections.nlp.nm.trainables.common.megatron import * from nemo.collections.nlp.nm.trainables.common.sequence_classification_nm import * from nemo.collections.nlp.nm.trainables.common.sequence_regression_nm import * from nemo.collections.nlp.nm.trainables.common.token_classification_nm import * diff --git a/nemo/collections/nlp/nm/trainables/common/common_utils.py b/nemo/collections/nlp/nm/trainables/common/common_utils.py new file mode 100644 index 000000000000..318038c50aa4 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/common_utils.py @@ -0,0 +1,75 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import json + +from nemo import logging +from nemo.collections.nlp.nm.trainables.common.huggingface.huggingface_utils import * +from nemo.collections.nlp.nm.trainables.common.megatron.megatron_bert_nm import MegatronBERT +from nemo.collections.nlp.nm.trainables.common.megatron.megatron_utils import * + +__all__ = ['get_pretrained_lm_models_list', 'get_pretrained_lm_model'] + + +def get_pretrained_lm_models_list(): + ''' + Returns the list of support pretrained models + ''' + return get_megatron_lm_models_list() + get_huggingface_lm_models_list() + + +def get_pretrained_lm_model(pretrained_model_name, config=None, vocab=None, checkpoint=None): + ''' + Returns pretrained model + Args: + pretrained_model_name (str): pretrained model name, for example, bert-base-uncased. 
+ See the full list by calling get_pretrained_lm_models_list() + config (str): path to the model configuration file + vocab (str): path to the vocabulary file used during model training + checkpoint (str): path to the pretrained model checkpoint + Returns: + Pretrained model (NM) + ''' + if pretrained_model_name in get_huggingface_lm_models_list(): + model = get_huggingface_lm_model(bert_config=config, pretrained_model_name=pretrained_model_name) + elif pretrained_model_name in get_megatron_lm_models_list(): + if pretrained_model_name == 'megatron-bert-cased' or pretrained_model_name == 'megatron-bert-uncased': + if not (config and checkpoint): + raise ValueError(f'Config file and pretrained checkpoint required for {pretrained_model_name}') + if not config: + config = get_megatron_config_file(pretrained_model_name) + if isinstance(config, str): + with open(config) as f: + config = json.load(f) + if not vocab: + vocab = get_megatron_vocab_file(pretrained_model_name) + if not checkpoint: + checkpoint = get_megatron_checkpoint(pretrained_model_name) + model = MegatronBERT( + model_name=pretrained_model_name, + vocab_file=vocab, + hidden_size=config['hidden-size'], + num_attention_heads=config['num-attention-heads'], + num_layers=config['num-layers'], + max_seq_length=config['max-seq-length'], + ) + else: + raise ValueError(f'{pretrained_model_name} is not supported') + + if checkpoint: + model.restore_from(checkpoint) + logging.info(f"{pretrained_model_name} model restored from {checkpoint}") + return model diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/huggingface_utils.py b/nemo/collections/nlp/nm/trainables/common/huggingface/huggingface_utils.py index ec66b1d55aef..05c36e171f93 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/huggingface_utils.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/huggingface_utils.py @@ -18,12 +18,12 @@ from nemo.collections.nlp.nm.trainables.common.huggingface.bert_nm import BERT from nemo.collections.nlp.nm.trainables.common.huggingface.roberta_nm import Roberta -__all__ = ['MODELS', 'get_huggingface_model', 'get_bert_models_list'] +__all__ = ['MODELS', 'get_huggingface_lm_model', 'get_huggingface_lm_models_list'] -def get_huggingface_model(pretrained_model_name, bert_config=None): +def get_huggingface_lm_model(pretrained_model_name, bert_config=None): ''' - Return the dict of special tokens associated with the model. + Returns the dict of special tokens associated with the model. 
Args: pretrained_mode_name ('str'): name of the pretrained model from the hugging face list, for example: bert-base-cased @@ -36,7 +36,7 @@ def get_huggingface_model(pretrained_model_name, bert_config=None): else: return MODELS[model_type]['class'](pretrained_model_name=pretrained_model_name) else: - raise ValueError(f'Choose pretrained model from the following list: {get_bert_models_list()}.') + raise ValueError(f'{pretrained_model_name} is not supported') MODELS = { @@ -46,7 +46,10 @@ def get_huggingface_model(pretrained_model_name, bert_config=None): } -def get_bert_models_list(): +def get_huggingface_lm_models_list(): + ''' + Returns the list of supported HuggingFace models + ''' huggingface_models = [] for model in MODELS: model_names = [x.pretrained_model_name for x in MODELS[model]['class'].list_pretrained_models()] diff --git a/nemo/collections/nlp/nm/trainables/common/megatron/__init__.py b/nemo/collections/nlp/nm/trainables/common/megatron/__init__.py new file mode 100644 index 000000000000..d82f20067425 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/megatron/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common.megatron.megatron_bert_nm import * diff --git a/nemo/collections/nlp/nm/trainables/common/megatron/megatron_bert_nm.py b/nemo/collections/nlp/nm/trainables/common/megatron/megatron_bert_nm.py new file mode 100644 index 000000000000..5a7f76854c66 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/megatron/megatron_bert_nm.py @@ -0,0 +1,140 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
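# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the patch): how the unified helpers
# added in common_utils.py above are meant to be called. The model names are
# examples only; the Megatron variant downloads its config, vocab and
# checkpoint into the local Megatron cache on first use, so it needs network
# access and the megatron-lm dependency installed.
from nemo.collections.nlp.nm.trainables.common.common_utils import (
    get_pretrained_lm_model,
    get_pretrained_lm_models_list,
)

print(get_pretrained_lm_models_list())   # Megatron + HuggingFace model names
bert = get_pretrained_lm_model(pretrained_model_name='bert-base-uncased')
megatron_bert = get_pretrained_lm_model(pretrained_model_name='megatron-bert-345m-uncased')
# ---------------------------------------------------------------------------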
+# ============================================================================= + +import os + +import torch +from megatron.initialize import initialize_megatron +from megatron.model.bert_model import bert_attention_mask_func, bert_extended_attention_mask, bert_position_ids +from megatron.model.language_model import get_language_model +from megatron.model.utils import init_method_normal, scaled_init_method_normal + +from nemo.backends.pytorch.nm import TrainableNM +from nemo.core import DeviceType +from nemo.core.neural_types import ChannelType, NeuralType +from nemo.utils.decorators import add_port_docs + +__all__ = ['MegatronBERT'] + + +class MegatronBERT(TrainableNM): + """ + MegatronBERT wraps around the Megatron Language model + from https://github.com/NVIDIA/Megatron-LM + + Args: + config_file (str): path to model configuration file. + vocab_file (str): path to vocabulary file. + tokenizer_type (str): tokenizer type, currently only 'BertWordPieceLowerCase' supported. + """ + + @property + @add_port_docs() + def input_ports(self): + """Returns definitions of module input ports. + input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask + """ + return { + "input_ids": NeuralType(('B', 'T'), ChannelType()), + "attention_mask": NeuralType(('B', 'T'), ChannelType()), + "token_type_ids": NeuralType(('B', 'T'), ChannelType(), optional=True), + } + + @property + @add_port_docs() + def output_ports(self): + """Returns definitions of module output ports. + hidden_states: output embedding + """ + return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} + + def __init__( + self, + model_name, + vocab_file, + hidden_size=1024, + num_attention_heads=16, + num_layers=24, + max_seq_length=512, + tokenizer_type='BertWordPieceLowerCase', + init_method_std=0.02, + num_tokentypes=2, + ): + + super().__init__() + + if not os.path.exists(vocab_file): + raise ValueError(f'Vocab file not found at {vocab_file}') + + megatron_args = { + "num_layers": num_layers, + "hidden_size": hidden_size, + "num_attention_heads": num_attention_heads, + "max_position_embeddings": max_seq_length, + "tokenizer_type": tokenizer_type, + "vocab_file": vocab_file, + } + + initialize_megatron(None, megatron_args, ignore_unknown_args=True) + init_method = init_method_normal(init_method_std) + + self.language_model, self._language_model_key = get_language_model( + attention_mask_func=bert_attention_mask_func, + num_tokentypes=num_tokentypes, + add_pooler=False, + init_method=init_method, + scaled_init_method=scaled_init_method_normal(init_method_std, num_layers), + ) + + self.language_model.to(self._device) + self._hidden_size = self.language_model.hidden_size + + @property + def hidden_size(self): + """ + Property returning hidden size. + + Returns: + Hidden size. 
+ """ + return self._hidden_size + + def forward(self, input_ids, attention_mask, token_type_ids): + extended_attention_mask = bert_extended_attention_mask( + attention_mask, next(self.language_model.parameters()).dtype + ) + position_ids = bert_position_ids(input_ids) + + sequence_output = self.language_model( + input_ids, position_ids, extended_attention_mask, tokentype_ids=token_type_ids + ) + return sequence_output + + def restore_from(self, path, local_rank=0): + if self.placement == DeviceType.AllGpu: + load_device = f"cuda:{local_rank}" + else: + load_device = self._device + + state_dict = torch.load(path, map_location=load_device) + + # to load from Megatron pretrained checkpoint + if 'model' in state_dict: + self.language_model.load_state_dict(state_dict['model'][self._language_model_key]) + else: + self.load_state_dict(state_dict) diff --git a/nemo/collections/nlp/nm/trainables/common/megatron/megatron_utils.py b/nemo/collections/nlp/nm/trainables/common/megatron/megatron_utils.py new file mode 100644 index 000000000000..13ec1894eb05 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/megatron/megatron_utils.py @@ -0,0 +1,113 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import os + +import wget +from transformers import TRANSFORMERS_CACHE, cached_path + +__all__ = [ + 'MEGATRON_CACHE', + 'MEGATRON_CONFIG_MAP', + 'CONFIGS', + 'get_megatron_lm_models_list', + 'get_megatron_config_file', + 'get_megatron_vocab_file', + 'get_megatron_checkpoint', +] + +MEGATRON_CACHE = os.path.join(os.path.dirname(TRANSFORMERS_CACHE), 'megatron') + +CONFIGS = {'345m': {"hidden-size": 1024, "num-attention-heads": 16, "num-layers": 24, "max-seq-length": 512}} + +MEGATRON_CONFIG_MAP = { + 'megatron-bert-345m-uncased': { + 'config': CONFIGS['345m'], + 'checkpoint': 'https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.0/files/release/mp_rank_00/model_optim_rng.pt', + 'vocab': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt', + 'do_lower_case': True, + }, + 'megatron-bert-uncased': { + 'config': None, + 'checkpoint': None, + 'vocab': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt', + 'do_lower_case': True, + }, + 'megatron-bert-cased': { + 'config': None, + 'vocab': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt', + 'do_lower_case': False, + }, +} + + +def get_megatron_lm_models_list(): + ''' + Return the list of support Megatron models + ''' + return list(MEGATRON_CONFIG_MAP.keys()) + + +def get_megatron_config_file(pretrained_model_name): + ''' + Returns model config file + Args: + pretrained_model_name (str): pretrained model name + Returns: + config (dict): contains model configuration: number of hidden layers, number of attention heads, etc + ''' + return MEGATRON_CONFIG_MAP[pretrained_model_name]['config'] + + +def get_megatron_vocab_file(pretrained_model_name): + ''' + Gets vocabulary file from cache or downloads it + Args: + pretrained_model_name (str): pretrained model name + Returns: + path (str): path to the vocab file + ''' + url = MEGATRON_CONFIG_MAP[pretrained_model_name]['vocab'] + path = cached_path(url, cache_dir=MEGATRON_CACHE) + return path + + +def get_megatron_checkpoint(pretrained_model_name): + ''' + Gets checkpoint file from cache or downloads it + Args: + pretrained_model_name (str): pretrained model name + Returns: + path (str): path to model checkpoint + ''' + url = MEGATRON_CONFIG_MAP[pretrained_model_name]['checkpoint'] + path = os.path.join(MEGATRON_CACHE, pretrained_model_name) + + if not os.path.exists(path): + wget.download(url, path) + return path + + +def is_lower_cased_megatron(pretrained_model_name): + ''' + Returns if the megatron is cased or uncased + Args: + pretrained_model_name (str): pretrained model name + Returns: + do_lower_cased (bool): whether the model uses lower cased data + ''' + return MEGATRON_CONFIG_MAP[pretrained_model_name]['do_lower_case'] diff --git a/nemo/collections/tts/parts/waveglow.py b/nemo/collections/tts/parts/waveglow.py index 0f6d74e05501..abf81f165e60 100644 --- a/nemo/collections/tts/parts/waveglow.py +++ b/nemo/collections/tts/parts/waveglow.py @@ -3,6 +3,7 @@ import torch import torch.nn.functional as F +from torch.autograd import Variable @torch.jit.script @@ -45,15 +46,12 @@ def forward(self, z, reverse: bool = False): if not hasattr(self, 'W_inverse'): # Reverse computation W_inverse = W.float().inverse() + W_inverse = Variable(W_inverse[..., None]) if z.dtype == torch.half: W_inverse = W_inverse.half() - z = F.conv1d(z, W_inverse, bias=None, stride=1, padding=0) - # Tracer demands uniform output, i.e 
two tensors: - dummy = torch.zeros([1]) - return ( - z, - dummy, - ) + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z else: # Forward computation log_det_W = batch_size * n_of_groups * torch.logdet(W.float()) @@ -185,32 +183,29 @@ def forward(self, forward_input: Tuple[torch.Tensor, torch.Tensor]): log_s_list = [] log_det_W_list = [] - k = 0 - for convinvk, wnk in zip(self.convinv, self.WN): + for k in range(self.n_flows): if k % self.n_early_every == 0 and k > 0: output_audio.append(audio[:, : self.n_early_size, :]) audio = audio[:, self.n_early_size :, :] - audio, log_det_W = convinvk(audio) + audio, log_det_W = self.convinv[k](audio) log_det_W_list.append(log_det_W) n_half = int(audio.size(1) / 2) audio_0 = audio[:, :n_half, :] audio_1 = audio[:, n_half:, :] - output = wnk((audio_0, spect)) + output = self.WN[k]((audio_0, spect)) log_s = output[:, n_half:, :] b = output[:, :n_half, :] audio_1 = torch.exp(log_s) * audio_1 + b log_s_list.append(log_s) - audio = torch.cat((audio_0, audio_1), 1) - k += 1 + audio = torch.cat([audio_0, audio_1], 1) output_audio.append(audio) return torch.cat(output_audio, 1), log_s_list, log_det_W_list - @torch.jit.ignore def infer(self, spect, sigma: float = 1.0): spect = self.upsample(spect) # trim conv artifacts. maybe pad spec to kernel multiple @@ -224,10 +219,10 @@ def infer(self, spect, sigma: float = 1.0): audio = sigma * torch.randn(spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device).to( spect.dtype ) + # audio=sigma * torch.ones(spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device).to( + # spect.dtype + # ) - # k = int(self.n_flows - 1) - # TODO - when ModuleList will support reversed iterator, make it traceable - # for convinvk, wnk in zip(self.convinv, self.WN): for k in reversed(range(self.n_flows)): n_half = int(audio.size(1) / 2) audio_0 = audio[:, :n_half, :] @@ -240,17 +235,13 @@ def infer(self, spect, sigma: float = 1.0): audio = torch.cat((audio_0, audio_1), 1) audio = self.convinv[k](audio, reverse=True) - audio = audio[0] - if k % self.n_early_every == 0 and k > 0: z = sigma * torch.randn(spect.size(0), self.n_early_size, spect.size(2), device=spect.device).to( spect.dtype ) + # z = sigma * torch.ones(spect.size(0), self.n_early_size, spect.size(2), device=spect.device).to(spect.dtype) audio = torch.cat((z, audio), 1) - # k -= 1 - - audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data - return audio + return audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1) def remove_weightnorm(model): diff --git a/nemo/collections/tts/waveglow_modules.py b/nemo/collections/tts/waveglow_modules.py index aa648d3aebaf..b316b720d484 100644 --- a/nemo/collections/tts/waveglow_modules.py +++ b/nemo/collections/tts/waveglow_modules.py @@ -207,7 +207,7 @@ def denoise(self, audio, strength=0.1): audio_denoised = librosa.core.istft(audio_spec_denoised * audio_angles) return audio_denoised, audio_spec_denoised - def forward(self, mel_spectrogram, z): + def forward(self, mel_spectrogram): if not self._removed_weight_norm: logging.info("remove WN") self.waveglow = remove_weightnorm(self.waveglow) @@ -215,7 +215,8 @@ def forward(self, mel_spectrogram, z): if self.training: raise ValueError("You are using the WaveGlow Infer Neural Module in training mode.") with torch.no_grad(): - return self.waveglow.forward((mel_spectrogram, z,))[0] + audio = self.waveglow.infer(mel_spectrogram, sigma=self._sigma) + return audio class 
WaveGlowLoss(LossNM): diff --git a/nemo/core/neural_factory.py b/nemo/core/neural_factory.py index f80df8c39b23..77e86227f81d 100644 --- a/nemo/core/neural_factory.py +++ b/nemo/core/neural_factory.py @@ -243,8 +243,7 @@ def train( batch_size stop_on_nan_loss: (default: False) If set to True, the training will stop if loss=nan or inf. If set to False, the training - will continue. Note that if apex.amp is not used, or if - optimization level is O0, training will stop regardless. + will continue. Returns: None @@ -669,7 +668,6 @@ def train( lr_policy=None, batches_per_step=None, stop_on_nan_loss=False, - steps_per_nan_check=100, synced_batchnorm=False, synced_batchnorm_groupsize=0, gradient_predivide=False, @@ -687,7 +685,6 @@ def train( lr_policy=lr_policy, batches_per_step=batches_per_step, stop_on_nan_loss=stop_on_nan_loss, - steps_per_nan_check=steps_per_nan_check, synced_batchnorm=synced_batchnorm, synced_batchnorm_groupsize=synced_batchnorm_groupsize, gradient_predivide=gradient_predivide, diff --git a/nemo/core/neural_graph.py b/nemo/core/neural_graph.py index 850f2879eefa..917af83b117c 100644 --- a/nemo/core/neural_graph.py +++ b/nemo/core/neural_graph.py @@ -26,9 +26,10 @@ from ruamel.yaml import YAML +from nemo.backends import get_state_dict, load, save, set_state_dict from nemo.core import OperationMode from nemo.core.neural_interface import NeuralInterface -from nemo.core.neural_modules import NeuralModule +from nemo.core.neural_modules import ModuleType, NeuralModule from nemo.core.neural_types import NeuralPortNameMismatchError, NeuralType, NmTensor from nemo.package_info import __version__ as nemo_version from nemo.utils import logging @@ -46,12 +47,12 @@ class NeuralGraph(NeuralInterface): def __init__(self, operation_mode: OperationMode = OperationMode.both, name: Optional[str] = None): """ - Constructor. Initializes graph variables. + Constructor. Initializes graph variables. - Args: - operation_mode: Graph operation mode, that will be propagated along modules during graph creation. - [training | eval | both] (DEFAULT: both) - name: Name of the graph (optional) + Args: + operation_mode: Graph operation mode, that will be propagated along modules during graph creation. + [training | eval | both] (DEFAULT: both) + name: Name of the graph (optional) """ # Initialize the inferface. super().__init__() @@ -84,12 +85,11 @@ def __init__(self, operation_mode: OperationMode = OperationMode.both, name: Opt def __call__(self, **kwargs): """ - This method "nests" one existing neural graph into another one. - Also checks if all inputs were provided and properly connects them. - - Args: - kwargs: keyword arguments containing dictionary of (input_port_name, port_content). + This method "nests" one existing neural graph into another one. + Also checks if all inputs were provided and properly connects them. + Args: + kwargs: keyword arguments containing dictionary of (input_port_name, port_content). """ # Test operation modes of the nested graphs. outer_mode = self._app_state.active_graph.operation_mode @@ -121,11 +121,11 @@ def __call__(self, **kwargs): def __nest(self, inner_graph: 'NeuralGraph', inner_graph_args): """ - Method nests (copies) a graph: modules, steps, topology (tensors). + Method nests (copies) a graph: modules, steps, topology (tensors). - Args: - inner_graph: Graph to be copied (will be "nested" in this (self) graph). - inner_graph_args: inputs passed to the graph call. + Args: + inner_graph: Graph to be copied (will be "nested" in this (self) graph). 
+ inner_graph_args: inputs passed to the graph call. """ # Remember the number of "already present steps". step_bump = len(self.steps) @@ -248,13 +248,13 @@ def __nest(self, inner_graph: 'NeuralGraph', inner_graph_args): def record_step(self, module: NeuralModule): """ - Records the operation (module plus passed inputs) on a list. + Records the operation (the module to be executed) on a list. - Args: - module: Neural modules added to a given graph. + Args: + module: Neural modules added to a given graph. - Returns: - Step number. + Returns: + Step number. """ # The solution allows loops in the graph. # This also means that module with that name can already be present in the graph. @@ -277,17 +277,17 @@ def record_step(self, module: NeuralModule): @property def step_number(self) -> int: """ - Returns: - The current step number. + Returns: + The current step number. """ return len(self._steps) - 1 def bind_outputs(self, tensors_list: Union[NmTensor, List[NmTensor]]): """ - Binds the output tensors. + Binds the output tensors. - Args: - tensors_list: A single tensor OR a List of tensors to be bound. + Args: + tensors_list: A single tensor OR a List of tensors to be bound. """ # Handle both single port and lists of ports to be bound. if type(tensors_list) is not list: @@ -311,48 +311,46 @@ def bind_outputs(self, tensors_list: Union[NmTensor, List[NmTensor]]): @property def inputs(self) -> GraphInputs: """ - Returns graph inputs. - Returns: - A graph input. + Graph input. """ return self._inputs @property def input_ports(self) -> Dict[str, NeuralType]: """ - Returns definitions of graph input ports (dict of Neural Types). + Returns definitions of graph input ports (dict of Neural Types). .. note:: This method actually returns an immutable dictionary with port types (like Neural Modules). In order to get access to actual graph inputs please call the inputs() method. Returns: - A graph input ports definitions. + Graph input ports definitions. """ return self._inputs.definitions @property def outputs(self) -> GraphOutputs: """ - Returns graph outputs. + Returns graph outputs. Returns: - A graph outputs. + Graph outputs. """ return self._outputs @property def output_ports(self) -> Dict[str, NeuralType]: """ - Returns definitions of module output ports (dict of Neural Types). + Returns definitions of module output ports (dict of Neural Types). .. note:: This method actually returns an immutable dictionary with port types (like Neural Modules). In order to get access to actual graph outpus please call the outputs() method. Returns: - A graph output ports definitions. + Graph output ports definitions. """ return self._outputs.definitions @@ -360,10 +358,8 @@ def output_ports(self) -> Dict[str, NeuralType]: @property def output_tensors(self) -> Dict[str, NmTensor]: """ - Returns graph output tensors. - Returns: - A graph output tensors. + Fraph output tensors. """ return self._outputs.tensors @@ -387,35 +383,36 @@ def __getitem__(self, key) -> NeuralModule: def __len__(self) -> int: """ - Returns: - The number of modules (vertices) in a given graph. + Returns: + The number of modules (vertices) in a given graph. """ return len(self._modules) @property def steps(self) -> Dict[int, str]: - """ Returns steps. """ + """ + Returns: + Dictionary [steps_number, module_name] + """ return self._steps @property def tensors(self): """ - Property returning a (double) dictionary of all output tensors. + Property returning a (double) dictionary of all output tensors. 
- Returns: - Dictionary of tensors in the format [module_name][output_port_name]. - + Returns: + Dictionary of tensors in the format [module_name][output_port_name]. """ return self._all_tensors @property def tensor_list(self) -> List[NmTensor]: """ - Property returning output tensors by extracting them on the fly from the bound outputs. - - Returns: - List of tensors. - + Property returning output tensors by extracting them on the fly from the bound outputs. + + Returns: + List of tensors. """ tensor_list = [] # Get tensors by acessing the producer-ports. @@ -429,45 +426,45 @@ def tensor_list(self) -> List[NmTensor]: @property def operation_mode(self) -> OperationMode: """ - Returns: - Operation mode. + Returns: + Operation mode. """ return self._operation_mode def __enter__(self) -> 'NeuralGraph': """ - Activates this graph. - - Returns: - The graph object. + Activates this graph. + + Returns: + The graph object. """ self._app_state.active_graph = self return self def __exit__(self, exc_type, exc_value, exc_traceback): """ - Deactivates the current graph. + Deactivates the current graph. """ self._app_state.active_graph = None def activate(self): """ - Activates this graph. + Activates this graph. """ self._app_state.active_graph = self def deactivate(self): """ - Deactivates the current graph. + Deactivates the current graph. """ self._app_state.active_graph = None def export_to_config(self, config_file: str): """ - Exports the neural graph to a file. - - Args: - config_file: Name (and path) of the config file (YML) to be written to. + Exports the neural graph to a file. + + Args: + config_file: Name (and path) of the config file (YML) to be written to. """ # Greate an absolute path. abs_path_file = path.expanduser(config_file) @@ -484,10 +481,11 @@ def export_to_config(self, config_file: str): ) def serialize(self) -> Dict[str, Any]: - """ Method serializes the whole graph. + """ + Method serializes the whole graph. - Returns: - Dictionary containing description of the whole graph. + Returns: + Dictionary containing description of the whole graph. """ # Create a dictionary representing the serialized object. serialized_graph = {} @@ -514,10 +512,11 @@ def serialize(self) -> Dict[str, Any]: return serialized_graph def __serialize_header(self) -> Dict[str, Any]: - """ Private method responsible for serializing the graph header. + """ + Private method responsible for serializing the graph header. - Returns: - Dictionary containing description of the whole graph. + Returns: + Dictionary containing description of the whole graph. """ # Generate full_spec of the class. full_spec = str(self.__module__) + "." + str(self.__class__.__qualname__) @@ -533,10 +532,11 @@ def __serialize_header(self) -> Dict[str, Any]: return header def __serialize_modules(self) -> Dict[str, Any]: - """ Private method responsible for serializing the modules present in the graph. + """ + Private method responsible for serializing the modules present in the graph. - Returns: - Dictionary containing description of all graph modules. + Returns: + Dictionary containing description of all graph modules. """ serialized_modules = {} for name, module in self._modules.items(): @@ -544,10 +544,11 @@ def __serialize_modules(self) -> Dict[str, Any]: return serialized_modules def __serialize_steps(self): - """ Private method responsible for serializing the steps (order of module executions). + """ + Private method responsible for serializing the steps (order of module executions). 
- Returns: - Dictionary containing description of the steps. + Returns: + Dictionary containing description of the steps. """ serialized_steps = {} for no, module_name in self._steps.items(): @@ -555,10 +556,11 @@ def __serialize_steps(self): return serialized_steps def __serialize_connections(self) -> Dict[str, Any]: - """ Private method responsible for serializing the connections in the graph. + """ + Private method responsible for serializing the connections in the graph. - Returns: - List containing "connections" between modules. + Returns: + List containing "connections" between modules. """ serialized_connections = [] # Iterate through "tensor modules". @@ -583,22 +585,18 @@ def import_from_config( name: Optional[str] = None, ) -> 'NeuralGraph': """ - Class method importing the neural graph from the configuration file. - Raises an ImportError exception when config file is invalid. - - Args: - config_file: path (absolute or relative) and name of the config file (YML) - - reuse_existing_modules: If the modules with (name, type, init_params) are already created, import will - connect to them instead of creating new instances. - - overwrite_params: Dictionary containing parameters that will be added to or overwrite (!) the default - parameters loaded from the configuration file - - name: Name of the new graph (optional, DEFAULT: NONE) - - Returns: - Instance of the created NeuralGraph object. + Class method importing the neural graph from the configuration file. + Raises an ImportError exception when config file is invalid. + + Args: + config_file: path (absolute or relative) and name of the config file (YML) + reuse_existing_modules: If the modules with (name, type, init_params) are already created, import will + connect to them instead of creating new instances. + overwrite_params: Dictionary containing parameters that will be added to or overwrite (!) the default + parameters loaded from the configuration file + name: Name of the new graph (optional, DEFAULT: NONE) + Returns: + Instance of the created NeuralGraph object. """ logging.info("Loading configuration of a new Neural Graph from the `{}` file".format(config_file)) @@ -615,15 +613,14 @@ def import_from_config( @classmethod def __validate_config_file(cls, config_file: str): """ - Class method validating whether the config file has a proper content (sections, specification etc.). - Raises an ImportError exception when config file is invalid or - incompatible (when called from a particular class). - - Args: - config_file: path (absolute or relative) and name of the config file (YML) + Class method validating whether the config file has a proper content (sections, specification etc.). + Raises an ImportError exception when config file is invalid or + incompatible (when called from a particular class). - Returns: - A loaded configuration file (dictionary). + Args: + config_file: path (absolute or relative) and name of the config file (YML) + Returns: + A loaded configuration file (dictionary). """ # Greate an absolute path. abs_path_file = path.expanduser(config_file) @@ -658,16 +655,14 @@ def deserialize( cls, configuration: Dict[str, Any], reuse_existing_modules: bool = False, name: Optional[str] = None ) -> 'NeuralGraph': """ - Class method creating a graph instance by deserializing the provided configuratino. - - Args: - configuration: Dictionary containing serialized graph. 
- - reuse_existing_modules: If the modules with (name, type, init_params) are already created, import will - connect to them instead of creating new instances. + Class method creating a graph instance by deserializing the provided configuratino. - Returns: - Instance of the created NeuralGraph object. + Args: + configuration: Dictionary containing serialized graph. + reuse_existing_modules: If the modules with (name, type, init_params) are already created, import will + connect to them instead of creating new instances. + Returns: + Instance of the created NeuralGraph object. """ # Deserialize header and get object class. operation_mode = cls.__deserialize_header(configuration["header"]) @@ -702,13 +697,13 @@ def deserialize( @classmethod def __deserialize_header(cls, serialized_header: Dict[str, Any]): - """ Private class method deserializing the header and extracts the general information. - - Args: - serialized_header: Dictionary containing graph header. - - Returns: - Operation mode. + """ + Private class method deserializing the header and extracts the general information. + + Args: + serialized_header: Dictionary containing graph header. + Returns: + Operation mode. """ # Parse the "full specification" - do not need that now. # spec_list = serialized_header["full_spec"].split(".") @@ -725,17 +720,18 @@ def __deserialize_header(cls, serialized_header: Dict[str, Any]): return operation_mode def __deserialize_modules(self, serialized_modules: Dict[str, Any], reuse_existing_modules: bool): - """ Private method deserializing the modules present in the graph. + """ + Private method deserializing the modules present in the graph. - Args: - serialized_modules: Dictionary containing graph modules. - reuse_existing_modules: If True, won create a new module when a module with a given name exists. + Args: + serialized_modules: Dictionary containing graph modules. + reuse_existing_modules: If True, won create a new module when a module with a given name exists. - Returns: - Dictionary of modules. + Returns: + Dictionary of modules. - Raises: - KeyError: A module with name already exists (if reuse_existing_modules is set to False). + Raises: + KeyError: A module with name already exists (if reuse_existing_modules is set to False). """ modules = {} for name, module_params in serialized_modules.items(): @@ -753,13 +749,13 @@ def __deserialize_modules(self, serialized_modules: Dict[str, Any], reuse_existi return modules def __deserialize_steps(self, serialized_steps: Dict[str, Any]): - """ Private method deserializing the steps (order of module executions). - - Args: - serialized_steps: Dictionary containing serialized steps. + """ + Private method deserializing the steps (order of module executions). - Returns: - Odered dict with steps. + Args: + serialized_steps: Dictionary containing serialized steps. + Returns: + Odered dict with steps. """ steps = OrderedDict() for i in range(len(serialized_steps)): @@ -768,14 +764,14 @@ def __deserialize_steps(self, serialized_steps: Dict[str, Any]): return steps def __deserialize_connections(self, serialized_connections: Dict[str, Any], modules: Dict[str, NeuralModule]): - """ Private method deserializing the connections in the graph. - - Args: - serialized_steps: Dictionary containing serialized connections. - modules: List of modules. + """ + Private method deserializing the connections in the graph. - Returns: - List of connections, in a format enabling graph traversing. + Args: + serialized_steps: Dictionary containing serialized connections. 
+ modules: List of modules. + Returns: + List of connections, in a format enabling graph traversing. """ connections = [] # Deserialize connections one by one. @@ -798,15 +794,15 @@ def __deserialize_connections(self, serialized_connections: Dict[str, Any], modu return connections def __execute_and_create_tensors(self, steps, modules, connections, inputs): - """ Method creates (internal) tensors of the graph by executing it following the order and using - the provided connections and inputs. - - Args: - steps: List of steps to be executed. - modules: List of modules. - connections: List of connections. - inputs: List of "bound inputs" - + """ + Method creates (internal) tensors of the graph by executing it following the order and using + the provided connections and inputs. + + Args: + steps: List of steps to be executed. + modules: List of modules. + connections: List of connections. + inputs: List of "bound inputs" """ # Activate this graph, so all the tensors will be added to this ! self.activate() @@ -874,8 +870,8 @@ def __execute_and_create_tensors(self, steps, modules, connections, inputs): def summary(self) -> str: """ - Returns: - A nice, full graph summary. + Returns: + A nice, full graph summary. """ # Line "decorator". desc = "\n" + 120 * '=' + "\n" @@ -920,3 +916,129 @@ def summary(self) -> str: # Return the result. return desc + + def freeze(self, module_names: Optional[List[str]] = None): + """ + A method that freezes the weights of the trainable modules in a graph. + + Args: + module_names: List of modules to be frozen (Optional). If passed, all modules will be unfrozen. + Raises: + KeyError: If name of the module won't be recognized. + """ + # Work on all modules. + if module_names is None: + module_names = self._modules.keys() + + # Iterate through modules one by one. + for name in module_names: + if name not in self._modules.keys(): + raise KeyError("Module `{}` not present in the `{}` graph".format(name, self.name)) + # Check module type. + module = self._modules[name] + if module.type == ModuleType.trainable: + # Freeze weights of the module. + module.freeze() + else: + logging.debug("Module `{}` is not trainable so cannot be frozen".format(name)) + + def unfreeze(self, module_names: Optional[List[str]] = None): + """ + Unfreezes weights of the trainable modules in a graph. + + Args: + module_names: List of modules to be unfrozen (Optional). If not passed, all modules will be unfrozen. + Raises: + KeyError: If name of the module won't be recognized. + """ + # Work on all modules. + if module_names is None: + module_names = self._modules.keys() + + # Iterate through modules one by one. + for name in module_names: + if name not in self._modules.keys(): + raise KeyError("Module `{}` not present in the `{}` graph".format(name, self.name)) + # Check module type. + module = self._modules[name] + if module.type == ModuleType.trainable: + # Unfreeze weights of the module. + module.unfreeze() + else: + logging.debug("Module `{}` is not trainable so cannot be unfrozen".format(name)) + + def save_to(self, filename: str, module_names: Optional[List[str]] = None): + """ + Saves the state of trainable modules in the graph to a checkpoint file. + + Args: + filename (string): Name of the file where the checkpoint will be saved. + module_names: List of modules to be frozen (Optional). If passed, all modules will be saved. + Raises: + KeyError: If name of the module won't be recognized. + """ + # Work on all modules. 
+ if module_names is None: + module_names = self._modules.keys() + + # Prepare the "graph checkpoint". + chkpt = {"header": {"nemo_core_version": nemo_version, "name": self.name}, "modules": {}} + + log_str = '' + # Iterate through the modules one by one. + for name in module_names: + if name not in self._modules.keys(): + raise KeyError("Module `{}` not present in the `{}` graph".format(name, self.name)) + # Check module type. + module = self._modules[name] + if module.type == ModuleType.trainable: + # Get module state_dict(). + chkpt["modules"][name] = get_state_dict(module) + log_str += " * Module '{}' ({}) params saved \n".format(module.name, type(module).__name__) + else: + logging.debug("Module `{}` is not trainable so cannot be saved".format(name)) + + # Save checkpoint. + save(chkpt, filename) + log_str = "Saved the '{}' graph to a checkpoint `{}`:\n".format(self.name, filename) + log_str + logging.info(log_str) + + def restore_from(self, filename: str, module_names: Optional[List[str]] = None): + """ + Restores the state of trainable modules in the graph from a checkpoint file. + + Args: + filename (string): Name of the checkpoint to be restored from. + module_names: List of modules to be frozen (Optional). If passed, all modules will be restored. + Raises: + KeyError: If name of the module won't be recognized. + """ + # Work on all modules. + if module_names is None: + module_names = self._modules.keys() + + # Load the checkpoint. + chkpt = load(filename) + + log_str = "Loading modules constituting the '{}' graph from the `{}` checkpoint :\n".format( + chkpt["header"]["name"], filename + ) + + warning = False + # Iterate through the modules one by one. + for name in module_names: + try: + # Get module. + module = self._modules[name] + # Restore module weights + set_state_dict(module, chkpt["modules"][name]) + log_str += " * Module '{}' ({}) params loaded\n".format(module.name, type(module).__name__) + except KeyError: + log_str += " ! Module '{}' params not found in checkpoint\n".format(name) + warning = True + + # Log results. + if warning: + logging.warning(log_str) + else: + logging.info(log_str) diff --git a/nemo/core/neural_modules.py b/nemo/core/neural_modules.py index 24abfb46264d..163db3ea3513 100644 --- a/nemo/core/neural_modules.py +++ b/nemo/core/neural_modules.py @@ -15,8 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This file contains NeuralModule and NmTensor classes.""" -__all__ = ['WeightShareTransform', 'NeuralModule'] +__all__ = ['WeightShareTransform', 'NeuralModule', 'ModuleType'] import uuid from abc import abstractmethod @@ -39,6 +38,16 @@ YAML = YAML(typ='safe') +class ModuleType(Enum): + """ Back-end independent module types """ + + module = 0 + datalayer = 1 + trainable = 2 + loss = 3 + nontrainable = 4 + + class WeightShareTransform(Enum): """When sharing parameters, what kind of transform to apply.""" @@ -53,7 +62,7 @@ class WeightShareTransform(Enum): class NeuralModule(NeuralInterface): """ - Abstract class that every Neural Module must inherit from. + Abstract class that every Neural Module must inherit from. """ def __init__(self, name=None): @@ -69,6 +78,9 @@ def __init__(self, name=None): # Register module and store the generated name. self._name = self._app_state.register_module(self, name) + # Set "module" type as default. + self._type = ModuleType.module + # Set "both" as default operation mode. 
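# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the freeze()/unfreeze(),
# save_to() and restore_from() methods added to NeuralGraph above act on the
# trainable modules recorded in a graph. `encoder`, `decoder` and their
# inputs stand for hypothetical, already-instantiated trainable modules, and
# the graph, module and checkpoint names are examples only.
from nemo.core.neural_graph import NeuralGraph

with NeuralGraph(name="asr_graph") as g:
    embeddings = encoder(audio=audio, length=length)    # hypothetical module call
    log_probs = decoder(encoder_output=embeddings)      # hypothetical module call

g.freeze()                              # freeze every trainable module in the graph
g.unfreeze(module_names=["decoder"])    # selectively unfreeze one module by name
g.save_to("asr_graph.chkpt")            # save weights of the trainable modules
g.restore_from("asr_graph.chkpt")       # load them back later
# ---------------------------------------------------------------------------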
self._operation_mode = OperationMode.both @@ -85,70 +97,68 @@ def __init__(self, name=None): @property def init_params(self) -> Dict[str, Any]: """ - Property returning parameters used to instantiate the module. + Property returning parameters used to instantiate the module. - Returns: - Dictionary containing parameters used to instantiate the module. + Returns: + Dictionary containing parameters used to instantiate the module. """ return self._init_params def __extract_init_params(self) -> Dict[str, Any]: """ - Retrieves the dictionary of of parameters (keys, values) passed to constructor of a class derived - (also indirectly) from the Neural Module class. + Retrieves the dictionary of of parameters (keys, values) passed to constructor of a class derived + (also indirectly) from the Neural Module class. - Returns: - Dictionary containing parameters passed to init(). + Returns: + Dictionary containing parameters passed to init(). """ # Get names of arguments of the original module init method. - init_keys = getfullargspec(type(self).__init__).args + to_set_params = getfullargspec(type(self).__init__).args + to_set_params.remove("self") - # Remove self. - if "self" in init_keys: - init_keys.remove("self") + # Create empty list of init params. + init_params = {} - # Create a list of params and initialize it with a special value. - init_params = {}.fromkeys(init_keys, "__UNSET__") - no_of_unset = len(init_params) - - # Retrieve values of those params from the call list. - # Do it by removing and analysing the calls from stack one by one. + # Get the frame "call context". for frame in stack()[1:]: - # Get call "context". - localvars = getargvalues(frame[0]).locals - # Check if we are in the "context" of the class call. - if "__class__" not in localvars.keys(): - continue - # Check if this is the context of the current "class". - if type(localvars["self"]).__name__ != localvars["__class__"].__name__: - # If own class is not equal to the call context class. - continue - # Ok, got the actual __init__() call!! - # Copy the keys. - for key in init_keys: - # Found the variable - and it is still unset! - if key in localvars.keys() and init_params[key] == "__UNSET__": - # Save the value. - init_params[key] = localvars[key] - no_of_unset -= 1 - # That should set all the init_params! - assert no_of_unset == 0 - # Ok, we can terminate. - break + # Get the call arguments. + localvars = getargvalues(frame[0]) + + # Fill the parameters with call_args. + for key in to_set_params: + if key in localvars.args: + init_params[key] = localvars.locals[key] + + # Remove all set keys. + for key in init_params.keys(): + if key in to_set_params: + to_set_params.remove(key) + + # Check if we have set everything. + if len(to_set_params) == 0: + break + + # Make sure that we collected ALL (and ONLY) the signature params - if not, then there is a BUG! + if len(to_set_params) != 0: + raise ValueError( + "Could not collect all the signature params! " + F"Please file a bug on GitHub with the current stacktrace so that it can be resolved." + ) + + # print("! init_params of {}: {}\n".format(type(self).__name__, init_params)) # Return parameters. return init_params def __validate_params(self, params: Dict[str, Any]) -> bool: """ - Checks whether dictionary contains parameters being primitive types (string, int, float etc.) - or (lists of)+ primitive types. - - Args: - params: dictionary of parameters. + Checks whether dictionary contains parameters being primitive types (string, int, float etc.) + or (lists of)+ primitive types. 
- Returns: - True if all parameters were ok, False otherwise. + Args: + params: dictionary of parameters. + Returns: + True if all parameters were ok, False otherwise. """ ok = True @@ -167,13 +177,12 @@ def __validate_params(self, params: Dict[str, Any]) -> bool: def __is_of_allowed_type(self, var) -> bool: """ - A recursive function that checks if a given variable is of allowed type. + A recursive function that checks if a given variable is of allowed type. - Args: - pretrained_model_name (str): name of pretrained model to use in order. - - Returns: - True if all parameters were ok, False otherwise. + Args: + pretrained_model_name (str): name of pretrained model to use in order. + Returns: + True if all parameters were ok, False otherwise. """ # Special case: None is also allowed. if var is None: @@ -201,13 +210,12 @@ def __is_of_allowed_type(self, var) -> bool: def export_to_config(self, config_file: str): """ - A function that exports module "configuration" (i.e. init parameters) to a YAML file. - - Args: - config_file: path (absolute or relative) and name of the config file (YML) + A function that exports module "configuration" (i.e. init parameters) to a YAML file. - Raises: - ValueError: An error occurred and parameters coudn't be exported. + Args: + config_file: path (absolute or relative) and name of the config file (YML) + Raises: + ValueError: An error occurred and parameters coudn't be exported. """ # Greate an absolute path. abs_path_file = path.expanduser(config_file) @@ -224,10 +232,11 @@ def export_to_config(self, config_file: str): ) def serialize(self) -> Dict[str, Any]: - """ A method serializing the whole Neural module (into a dictionary). - - Returns: - Dictionary containing a "serialized" module. + """ + A method serializing the whole Neural module (into a dictionary). + + Returns: + Dictionary containing a "serialized" module. """ # Create a dictionary representing the serialized object. serialized_module = {} @@ -242,10 +251,11 @@ def serialize(self) -> Dict[str, Any]: return serialized_module def __serialize_header(self) -> Dict[str, Any]: - """ A protected method that creates a header stored later in the configuration file. + """ + A protected method that creates a header stored later in the configuration file. - Returns: - Dictionary containing a header with module specification. + Returns: + Dictionary containing a header with module specification. """ # Get module "full specification". @@ -290,14 +300,15 @@ def __serialize_header(self) -> Dict[str, Any]: def _serialize_configuration(self) -> Dict[str, Any]: """ - A function that serializes the module "configuration (i.e. init parameters) to a dictionary. - Raises a ValueError exception in case then parameters coudn't be exported. + A function that serializes the module "configuration (i.e. init parameters) to a dictionary. - ..note: - Thus functions should be overloaded when writing a custom module import/export. + ..note: + Thus functions should be overloaded when writing a custom module import/export. - Returns: - A "serialized" dictionary with module configuration. + Returns: + A "serialized" dictionary with module configuration. + Raises: + A ValueError exception in case then parameters coudn't be exported. """ # Check if generic export will work. if not self.__validate_params(self._init_params): @@ -314,22 +325,18 @@ def import_from_config( cls, config_file: str, section_name: str = None, name: str = None, overwrite_params: Dict = {} ) -> 'NeuralModule': """ - Class method importing the configuration file. 
- Raises an ImportError exception when config file is invalid or - incompatible (when called from a particular class). + Class method importing the configuration file. + Raises an ImportError exception when config file is invalid or + incompatible (when called from a particular class). - Args: - config_file: path (absolute or relative) and name of the config file (YML) - - section_name: section in the configuration file storing module configuration (optional, DEFAULT: None) - - name: name of the module that will overwrite the name in the `init_params` (optional, DEFAULT: None) - - overwrite_params: Dictionary containing parameters that will be added to or overwrite (!) - the default init parameters loaded from the configuration file (the module "init_params" section). - - Returns: - Instance of the created NeuralModule object. + Args: + config_file: path (absolute or relative) and name of the config file (YML) + section_name: section in the configuration file storing module configuration (optional, DEFAULT: None) + name: name of the module that will overwrite the name in the `init_params` (optional, DEFAULT: None) + overwrite_params: Dictionary containing parameters that will be added to or overwrite (!) + the default init parameters loaded from the configuration file (the module "init_params" section). + Returns: + Instance of the created NeuralModule object. """ logging.info("Loading configuration of a new Neural Module from the `{}` file".format(config_file)) @@ -345,17 +352,15 @@ def import_from_config( @classmethod def __validate_config_file(cls, config_file: str, section_name: str = None) -> Dict[str, Any]: """ - Class method validating whether the config file has a proper content (sections, specification etc.). - Raises an ImportError exception when config file is invalid or - incompatible (when called from a particular class). - - Args: - config_file: path (absolute or relative) and name of the config file (YML) + Class method validating whether the config file has a proper content (sections, specification etc.). + Raises an ImportError exception when config file is invalid or + incompatible (when called from a particular class). - section_name: section in the configuration file storing module configuration (optional, DEFAULT: None) - - Returns: - A loaded configuration file (dictionary). + Args: + config_file: path (absolute or relative) and name of the config file (YML) + section_name: section in the configuration file storing module configuration (optional, DEFAULT: None) + Returns: + A loaded configuration file (dictionary). """ # Greate an absolute path. abs_path_file = path.expanduser(config_file) @@ -398,21 +403,21 @@ def __validate_config_file(cls, config_file: str, section_name: str = None) -> D @classmethod def deserialize( - cls, configuration: str, name: str = None, overwrite_params: Dict[str, Any] = {} + cls, configuration: Dict[str, Any], name: str = None, overwrite_params: Dict[str, Any] = {} ) -> 'NeuralModule': """ - Class method instantianting the neural module object based on the configuration (dictionary). + Class method instantianting the neural module object based on the configuration (dictionary). - Args: - configuration: Dictionary containing proper "header" and "init_params" sections. + Args: + configuration: Dictionary containing proper "header" and "init_params" sections. 
- name: name of the module that will overwrite the name in the `init_params` (optional, DEFAULT: None) + name: name of the module that will overwrite the name in the `init_params` (optional, DEFAULT: None) - overwrite_params: Dictionary containing parameters that will be added to or overwrite (!) - the default init parameters loaded from the configuration file (the module "init_params" section). + overwrite_params: Dictionary containing parameters that will be added to or overwrite (!) + the default init parameters loaded from the configuration file (the module "init_params" section). - Returns: - Instance of the created NeuralModule object. + Returns: + Instance of the created NeuralModule object. """ # Deserialize header - get object class. module_class = cls.__deserialize_header(configuration["header"]) @@ -442,13 +447,13 @@ def deserialize( @classmethod def __deserialize_header(cls, serialized_header: Dict[str, Any]): - """ Method deserializes the header and extracts the module class. - - Args: - serialized_header: Dictionary containing module header. + """ + Method deserializes the header and extracts the module class. - Returns: - Class of the module to be created. + Args: + serialized_header: Dictionary containing module header. + Returns: + Class of the module to be created. """ # Parse the "full specification". spec_list = serialized_header["full_spec"].split(".") @@ -464,45 +469,37 @@ def __deserialize_header(cls, serialized_header: Dict[str, Any]): @classmethod def _deserialize_configuration(cls, serialized_init_params: Dict[str, Any]): """ - A function that deserializes the module "configuration (i.e. init parameters). + A function that deserializes the module "configuration (i.e. init parameters). - ..note: - Thus functions should be overloaded when writing a custom module import/export. + ..note: + Thus functions should be overloaded when writing a custom module import/export. - Args: - serialized_init_params: List of init parameters loaded from the file. - - Returns: - A "deserialized" list with init parameters. + Args: + serialized_init_params: List of init parameters loaded from the file. + Returns: + A "deserialized" list with init parameters. """ # In this case configuration = init parameters. return serialized_init_params - @deprecated(version=0.11) - @staticmethod - def create_ports(**kwargs): - """ Deprecated method, to be remoted in the next release.""" - raise Exception( - 'Deprecated method. Please implement ``inputs`` and ``outputs`` \ - properties to define module ports instead' - ) - @property @abstractmethod def input_ports(self) -> Dict[str, NeuralType]: - """Returns definitions of module input ports + """ + Returns definitions of module input ports Returns: - A (dict) of module's input ports names to NeuralTypes mapping + A dictionary containing module's input ports (names, NeuralTypes) mapping. """ @property @abstractmethod def output_ports(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports + """ + Returns definitions of module output ports Returns: - A (dict) of module's output ports names to NeuralTypes mapping + A dictionary containing module's output ports (names, NeuralTypes) mapping. """ @property @@ -525,7 +522,6 @@ def _disabled_deployment_output_ports(self) -> Set[str]: def _prepare_for_deployment(self) -> None: """Patch the module if required to prepare for deployment - """ return @@ -534,6 +530,11 @@ def operation_mode(self): """ Returns the operation mode. 
""" return self._operation_mode + @property + def type(self): + """ Returns the type of module. """ + return self._type + @operation_mode.setter def operation_mode(self, operation_mode: OperationMode): """ Sets the operation mode. """ diff --git a/nemo/utils/neural_graph/graph_outputs.py b/nemo/utils/neural_graph/graph_outputs.py index 2210ff316215..6c494d986b7f 100644 --- a/nemo/utils/neural_graph/graph_outputs.py +++ b/nemo/utils/neural_graph/graph_outputs.py @@ -162,8 +162,8 @@ def bind(self, tensors_ref: List["NmTensor"], port_names: Optional[str] = None): str(tensor.producer_step_number) + "_" + tensor.producer_name + "_" + tensor.name ) # last = port name - logging.warning( - "Setting unigue name of the default output port `{}` produced in step {} by `{}` to `{}`".format( + logging.debug( + "Setting unique name of the default output port `{}` produced in step {} by `{}` to `{}`".format( tensor.name, tensor.producer_step_number, tensor.producer_name, name ) ) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index d05b9197b585..204ed8dbee7f 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -10,3 +10,4 @@ torchvision wget wrapt ruamel.yaml +sklearn diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 5c8d4d3626c4..0edb80411639 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -12,3 +12,4 @@ sox torch-stft unidecode webdataset +kaldi-python-io diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 32a857a35ac2..f923e8869ea9 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -8,5 +8,5 @@ unidecode youtokentome numpy tqdm -sklearn gdown +megatron-lm diff --git a/scripts/get_hi-mia_data.py b/scripts/get_hi-mia_data.py new file mode 100644 index 000000000000..145ab6780e3a --- /dev/null +++ b/scripts/get_hi-mia_data.py @@ -0,0 +1,164 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# USAGE: python get_aishell_data.py --data_root= + +import argparse +import json +import logging +import os +import tarfile +import urllib.request +from glob import glob + +import librosa as l +from sklearn.model_selection import StratifiedShuffleSplit +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='HI-MIA Data download') +parser.add_argument("--data_root", required=True, default=None, type=str) +args = parser.parse_args() + +URL = { + 'dev': "http://www.openslr.org/resources/85/dev.tar.gz", + 'test': "http://www.openslr.org/resources/85/test.tar.gz", + 'train': "http://www.openslr.org/resources/85/train.tar.gz", +} + + +def __maybe_download_file(destination: str, source: str): + """ + Downloads source to destination if it doesn't exist. 
+ If exists, skips download + Args: + destination: local filepath + source: url of resource + + Returns: + + """ + source = URL[source] + if not os.path.exists(destination): + logging.info("{0} does not exist. Downloading ...".format(destination)) + urllib.request.urlretrieve(source, filename=destination + '.tmp') + os.rename(destination + '.tmp', destination) + logging.info("Downloaded {0}.".format(destination)) + else: + logging.info("Destination {0} exists. Skipping.".format(destination)) + return destination + + +def __extract_all_files(filepath: str, data_root: str, data_dir: str): + if not os.path.exists(data_dir): + extract_file(filepath, data_root) + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in os.walk(audio_dir): + for ftar in filelist: + extract_file(os.path.join(subfolder, ftar), subfolder) + else: + logging.info('Skipping extracting. Data already there %s' % data_dir) + + +def extract_file(filepath: str, data_dir: str): + try: + tar = tarfile.open(filepath) + tar.extractall(data_dir) + tar.close() + except Exception: + logging.info('Not extracting. Maybe already there?') + + +def write_file(name, lines, idx): + with open(name, 'w') as fout: + for i in idx: + dic = lines[i] + json.dump(dic, fout) + fout.write('\n') + print("wrote", name) + + +def __process_data(data_folder: str, data_set: str): + """ + To generate manifest + Args: + data_folder: source with wav files + dst_folder: where manifest files will be stored + Returns: + + """ + fullpath = os.path.abspath(data_folder) + scp = glob(fullpath + '/**/*.wav', recursive=True) + out = os.path.join(fullpath, data_set + '_all.json') + utt2spk = os.path.join(fullpath, 'utt2spk') + utt2spk_file = open(utt2spk, 'w') + id = -2 # speaker id + + if os.path.exists(out): + os.remove(out) + + speakers = [] + lines = [] + with open(out, 'w') as outfile: + for line in tqdm(scp): + line = line.strip() + y, sr = l.load(line, sr=None) + if sr != 16000: + y, sr = l.load(line, sr=16000) + l.output.write_wav(line, y, sr) + dur = l.get_duration(y=y, sr=sr) + if data_set == 'test': + speaker = line.split('/')[-1].split('.')[0].split('_')[0] + else: + speaker = line.split('/')[id] + speaker = list(speaker) + speaker = ''.join(speaker) + speakers.append(speaker) + meta = {"audio_filepath": line, "duration": float(dur), "label": speaker} + lines.append(meta) + json.dump(meta, outfile) + outfile.write("\n") + utt2spk_file.write(line.split('/')[-1] + "\t" + speaker + "\n") + + utt2spk_file.close() + + if data_set != 'test': + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42) + for train_idx, test_idx in sss.split(speakers, speakers): + print(len(train_idx)) + + out = os.path.join(fullpath, 'train.json') + write_file(out, lines, train_idx) + out = os.path.join(fullpath, 'dev.json') + write_file(out, lines, test_idx) + + +def main(): + data_root = args.data_root + for data_set in URL.keys(): + + # data_set = 'data_aishell' + logging.info("\n\nWorking on: {0}".format(data_set)) + file_path = os.path.join(data_root, data_set + ".tgz") + logging.info("Getting {0}".format(data_set)) + __maybe_download_file(file_path, data_set) + logging.info("Extracting {0}".format(data_set)) + data_folder = os.path.join(data_root, data_set) + __extract_all_files(file_path, data_root, data_folder) + logging.info("Processing {0}".format(data_set)) + __process_data(data_folder, data_set) + logging.info('Done!') + + +if __name__ == "__main__": + main() diff --git a/scripts/scp_to_manifest.py b/scripts/scp_to_manifest.py new 
file mode 100644 index 000000000000..ec297bcc739a --- /dev/null +++ b/scripts/scp_to_manifest.py @@ -0,0 +1,96 @@ +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import logging +import os + +import librosa as l +from sklearn.model_selection import StratifiedShuffleSplit +from tqdm import tqdm + + +""" +This scipt converts a scp file where each line contains + +to a manifest json file. +Args: +--scp: scp file name +--id: index of speaker label in filename present in scp file that is separated by '/' +--out: output manifest file name +--split: True / False if you would want to split the manifest file for training purposes + you may not need this for test set. output file names is _.json + Defaults to False +""" + + +def write_file(name, lines, idx): + with open(name, 'w') as fout: + for i in idx: + dic = lines[i] + json.dump(dic, fout) + fout.write('\n') + logging.info("wrote", name) + + +def main(scp, id, out, split=False): + if os.path.exists(out): + os.remove(out) + scp_file = open(scp, 'r').readlines() + + lines = [] + speakers = [] + with open(out, 'w') as outfile: + for line in tqdm(scp_file): + line = line.strip() + y, sr = l.load(line, sr=None) + dur = l.get_duration(y=y, sr=sr) + speaker = line.split('/')[id] + speaker = list(speaker) + speaker = ''.join(speaker) + speakers.append(speaker) + meta = {"audio_filepath": line, "duration": float(dur), "label": speaker} + lines.append(meta) + json.dump(meta, outfile) + outfile.write("\n") + + path = os.path.dirname(out) + if split: + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42) + for train_idx, test_idx in sss.split(speakers, speakers): + logging.info(len(train_idx)) + + out = os.path.join(path, 'train.json') + write_file(out, lines, train_idx) + out = os.path.join(path, 'dev.json') + write_file(out, lines, test_idx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--scp", help="scp file name", type=str, required=True) + parser.add_argument( + "--id", help="field num seperated by '/' to be considered as speaker label", type=int, required=True + ) + parser.add_argument("--out", help="manifest_file name", type=str, required=True) + parser.add_argument( + "--split", + help="bool if you would want to split the manifest file for training purposes", + required=False, + action='store_true', + ) + args = parser.parse_args() + + main(args.scp, args.id, args.out, args.split) diff --git a/setup.cfg b/setup.cfg index df3f4f19217e..b64285ec6d07 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,7 +24,7 @@ addopts = --verbose --pyargs --durations=0 markers = unit: marks unit test, i.e. testing a single, well isolated functionality (deselect with '-m "not unit"') integration: marks test checking the elements when integrated into subsystems (deselect with '-m "not integration"') - system: marks test working at the highest integration level. 
(deselect with '-m "not system"') + system: marks test working at the highest integration level (deselect with '-m "not system"') acceptance: marks test checking whether the developed product/model passes the user defined acceptance criteria (deselect with '-m "not acceptance"') docs: mark tests related to documentation (deselect with '-m "not docs"') skipduringci: marks tests that are skipped ci as they are addressed by Jenkins jobs but should be run to test user setups diff --git a/tests/data/an4_speaker.tar.gz b/tests/data/an4_speaker.tar.gz new file mode 100644 index 000000000000..912bde741626 Binary files /dev/null and b/tests/data/an4_speaker.tar.gz differ diff --git a/tests/data/contextnet_32.yaml b/tests/data/contextnet_32.yaml new file mode 100644 index 000000000000..5e56e0d44048 --- /dev/null +++ b/tests/data/contextnet_32.yaml @@ -0,0 +1,77 @@ +model: "ContextNet" +sample_rate: 16000 +repeat: &repeat 2 +dropout: &dropout 0.0 +stride: &stride 2 + + +AudioToTextDataLayer: + max_duration: 16.7 + trim_silence: true + + train: + shuffle: true + + eval: + shuffle: false + max_duration: null + +AudioToMelSpectrogramPreprocessor: + window_size: 0.025 + window_stride: 0.01 + window: "hann" + normalize: "per_feature" + n_fft: 512 + features: 80 + dither: 0.00001 + pad_to: 16 + stft_conv: false + +SpectrogramAugmentation: + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 + +ContextNetEncoder: + activation: "relu" + conv_mask: true + + jasper: + - filters: 32 + repeat: 1 + kernel: [5] + stride: [1] + dilation: [1] + dropout: 0.0 + residual: false + separable: true + se: true + se_context_size: -1 + + - filters: 32 + repeat: *repeat + kernel: [5] + stride: [1] + dilation: [1] + dropout: *dropout + residual: true + separable: true + se: true + se_context_size: 256 + + - filters: 32 + repeat: *repeat + kernel: [5] + stride: [*stride] + dilation: [1] + dropout: *dropout + residual: true + separable: true + se: true + se_context_size: -1 + stride_last: true + residual_mode: "stride_add" + +labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/tests/data/quartznet_spkr_test.yaml b/tests/data/quartznet_spkr_test.yaml new file mode 100644 index 000000000000..d0ec8b33d7e2 --- /dev/null +++ b/tests/data/quartznet_spkr_test.yaml @@ -0,0 +1,81 @@ +model: "GramVoxNet" +sample_rate: &sample_rate 16000 +dropout: &drop 0.5 +repeat: &rep 1 +time_length: 8 +n_filters: &n_filters 512 + +AudioToSpeechLabelDataLayer: + sample_rate: *sample_rate + train: + min_duration: 0.1 + shuffle: true + eval: + min_duration: 0.01 + shuffle: false + +AudioToMelSpectrogramPreprocessor: + normalize: "per_feature" + window_size: 0.02 + window_stride: 0.01 + window: "hann" + features: &n_mels 64 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + stft_conv: false + +JasperEncoder: + feat_in: *n_mels + activation: "relu" + + jasper: + - filters: *n_filters + repeat: 1 + kernel: [3] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [5] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [7] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: *n_filters + repeat: *rep + kernel: [9] + stride: [1] + dilation: [1] + dropout: *drop + residual: true + seperable: true + + - filters: &enc_feat_out 1500 + 
repeat: 1 + kernel: [1] + stride: [1] + dilation: [1] + dropout: 0.0 + residual: false + seperable: true + +JasperDecoderForSpkrClass: + feat_in: *enc_feat_out + pool_mode: 'xvector' + emb_sizes: 128,128 diff --git a/tests/integration/test_asr_gradient_step_and_eval.py b/tests/integration/test_asr_gradient_step_and_eval.py index 66688794ec72..d68898c076b6 100644 --- a/tests/integration/test_asr_gradient_step_and_eval.py +++ b/tests/integration/test_asr_gradient_step_and_eval.py @@ -212,6 +212,64 @@ def test_quartznet_training(self): # Assert that training loss went down assert loss_list[-1] < loss_list[0] + @pytest.mark.integration + def test_contextnet_ctc_training(self): + """Integtaion test that instantiates a small ContextNet model and tests training with the sample asr data. + Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss + at the first step. + Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss + Checks SE-block with fixed context size and global context, residual_mode='stride_add' and 'stride_last' flags + """ + with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/contextnet_32.yaml"))) as f: + contextnet_model_definition = self.yaml.load(f) + dl = nemo_asr.AudioToTextDataLayer(manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=30) + pre_process_params = { + 'frame_splicing': 1, + 'features': 80, + 'window_size': 0.025, + 'n_fft': 512, + 'dither': 1e-05, + 'window': 'hann', + 'sample_rate': 16000, + 'normalize': 'per_feature', + 'window_stride': 0.01, + } + preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params) + + spec_aug = nemo_asr.SpectrogramAugmentation(**contextnet_model_definition['SpectrogramAugmentation']) + + contextnet_encoder = nemo_asr.ContextNetEncoder( + feat_in=contextnet_model_definition['AudioToMelSpectrogramPreprocessor']['features'], + **contextnet_model_definition['ContextNetEncoder'], + ) + contextnet_decoder = nemo_asr.ContextNetDecoderForCTC(feat_in=32, hidden_size=16, num_classes=len(self.labels)) + ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels)) + + # DAG + audio_signal, a_sig_length, transcript, transcript_len = dl() + processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length) + + processed_signal = spec_aug(input_spec=processed_signal) + + encoded, encoded_len = contextnet_encoder(audio_signal=processed_signal, length=p_length) + log_probs = contextnet_decoder(encoder_output=encoded) + loss = ctc_loss( + log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, + ) + + loss_list = [] + callback = nemo.core.SimpleLossLoggerCallback( + tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1 + ) + + self.nf.train( + [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.001}, + ) + self.nf.reset_trainer() + + # Assert that training loss went down + assert loss_list[-1] < loss_list[0] + @pytest.mark.integration def test_stft_conv_training(self): """Integtaion test that instantiates a small Jasper model and tests training with the sample asr data. 
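Both data-preparation scripts added above write the same one-JSON-object-per-line manifest that the speaker-recognition data layer consumes. A minimal sketch of that format, with made-up paths and speaker labels, is:

.. code-block:: python

    # Illustrative sketch of the manifest lines emitted by get_hi-mia_data.py and
    # scp_to_manifest.py: one JSON object per line with audio_filepath, duration
    # and label. The paths and labels below are made up.
    import json

    entries = [
        {"audio_filepath": "/data/wav/spk1_utt001.wav", "duration": 2.13, "label": "spk1"},
        {"audio_filepath": "/data/wav/spk2_utt007.wav", "duration": 1.78, "label": "spk2"},
    ]

    with open("train.json", "w") as fout:
        for entry in entries:
            json.dump(entry, fout)
            fout.write("\n")

The ``label`` field is the speaker class that ``AudioToSpeechLabelDataLayer`` groups on in the integration test that follows.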
diff --git a/tests/integration/test_speaker_recognition_gradient_step.py b/tests/integration/test_speaker_recognition_gradient_step.py new file mode 100644 index 000000000000..ab062ddbad81 --- /dev/null +++ b/tests/integration/test_speaker_recognition_gradient_step.py @@ -0,0 +1,116 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import os +import shutil +import tarfile +from functools import partial + +import pytest +from ruamel.yaml import YAML + +import nemo +import nemo.collections.asr as nemo_asr + +logging = nemo.logging + + +@pytest.mark.usefixtures("neural_factory") +class TestSpeakerRecognitonPytorch: + manifest_filepath = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/an4_speaker/train.json")) + yaml = YAML(typ="safe") + + @classmethod + def setup_class(cls) -> None: + data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/")) + logging.info("Looking up for speaker related data") + if not os.path.exists(os.path.join(data_folder, "an4_speaker")): + logging.info("Extracting speaker related files to: {0}".format(os.path.join(data_folder, "an4_speaker"))) + tar = tarfile.open(os.path.join(data_folder, "an4_speaker.tar.gz"), "r:gz") + tar.extractall(path=data_folder) + tar.close() + else: + logging.info("Speech Command data found in: {0}".format(os.path.join(data_folder, "an4_speaker"))) + + @classmethod + def teardown_class(cls) -> None: + data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/")) + logging.info("Looking up for test an4 data") + if os.path.exists(os.path.join(data_folder, "an4_speaker")): + shutil.rmtree(os.path.join(data_folder, "an4_speaker")) + + @staticmethod + def print_and_log_loss(loss_tensor, loss_log_list): + """A helper function that is passed to SimpleLossLoggerCallback. It prints loss_tensors and appends to + the loss_log_list list. + + Args: + loss_tensor (NMTensor): tensor representing loss. Loss should be a scalar + loss_log_list (list): empty list + """ + logging.info(f'Train Loss: {str(loss_tensor[0].item())}') + loss_log_list.append(loss_tensor[0].item()) + + @pytest.mark.integration + def test_quartznet_speaker_reco_training(self): + """Integtaion test that instantiates a small QuartzNet model for speaker recognition and tests training with the + sample an4 data. + Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss + at the first step. 
+ """ + with open( + os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/quartznet_spkr_test.yaml")) + ) as file: + spkr_params = self.yaml.load(file) + dl = nemo_asr.AudioToSpeechLabelDataLayer( + manifest_filepath=self.manifest_filepath, labels=None, batch_size=10, + ) + sample_rate = 16000 + + preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor( + sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"], + ) + jasper_encoder = nemo_asr.JasperEncoder(**spkr_params['JasperEncoder']) + jasper_decoder = nemo_asr.JasperDecoderForSpkrClass( + feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'], + num_classes=dl.num_classes, + pool_mode=spkr_params['JasperDecoderForSpkrClass']['pool_mode'], + emb_sizes=spkr_params["JasperDecoderForSpkrClass"]["emb_sizes"].split(","), + ) + ce_loss = nemo_asr.CrossEntropyLossNM() + + # DAG + audio_signal, a_sig_length, targets, targets_len = dl() + processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length) + + encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length) + # logging.info(jasper_encoder) + log_probs, _ = jasper_decoder(encoder_output=encoded) + loss = ce_loss(logits=log_probs, labels=targets) + + loss_list = [] + callback = nemo.core.SimpleLossLoggerCallback( + tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1 + ) + self.nf.random_seed = 42 + self.nf.train( + [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 4, "lr": 0.002}, + ) + self.nf.reset_trainer() + + # Assert that training loss went down + assert loss_list[-1] < loss_list[0] diff --git a/tests/unit/core/neural_graph/test_neural_graphs.py b/tests/unit/core/neural_graph/test_neural_graphs.py index 982f42afe0b0..96fac0f620c9 100644 --- a/tests/unit/core/neural_graph/test_neural_graphs.py +++ b/tests/unit/core/neural_graph/test_neural_graphs.py @@ -19,7 +19,9 @@ import pytest +from numpy import array_equal +from nemo.backends import get_state_dict from nemo.backends.pytorch.tutorials import MSELoss, RealFunctionDataLayer, TaylorNet from nemo.core import NeuralGraph from nemo.core.neural_types import NeuralTypeComparisonResult @@ -83,18 +85,35 @@ def test_explicit_graph_manual_activation(self): assert len(g0) == 2 @pytest.mark.unit - def test_default_output_ports(self): - """ Tests automatic binding of default output ports. """ + def test_graph_save_load(self, tmpdir): + """ + Tests graph saving and loading. + + Args: + tmpdir: Fixture which will provide a temporary directory. + """ + dl = RealFunctionDataLayer(n=10, batch_size=1) - m2 = TaylorNet(dim=4) - loss = MSELoss() + tn = TaylorNet(dim=4) + # Get the "original" weights. + weights1 = get_state_dict(tn) + # Create a simple graph. with NeuralGraph() as g1: x, t = dl() - p = m2(x=x) + p = tn(x=x) + + # Generate filename in the temporary directory. + tmp_file_name = str(tmpdir.join("tgsl_g1.chkpt")) + # Save graph. + g1.save_to(tmp_file_name) + + # Load graph. + g1.restore_from(tmp_file_name) + + # Get the "restored" weights. + weights2 = get_state_dict(tn) - # Tests output ports. - assert len(g1.output_ports) == 3 - assert g1.output_ports["x"].compare(x) == NeuralTypeComparisonResult.SAME - assert g1.output_ports["y"].compare(t) == NeuralTypeComparisonResult.SAME - assert g1.output_ports["y_pred"].compare(p) == NeuralTypeComparisonResult.SAME + # Compare state dicts. 
+ for key in weights1: + assert array_equal(weights1[key].cpu().numpy(), weights2[key].cpu().numpy()) diff --git a/tests/unit/core/neural_module/test_module_configuration_export.py b/tests/unit/core/neural_module/test_module_configuration_export.py index e64f48ce65e4..689d621c9c5c 100644 --- a/tests/unit/core/neural_module/test_module_configuration_export.py +++ b/tests/unit/core/neural_module/test_module_configuration_export.py @@ -36,7 +36,7 @@ class MockupSimpleModule(NeuralModule): Mockup component class. """ - def __init__(self, a, b, c, d): + def __init__(self, a, b, c, d=False): super().__init__() def setup_method(self, method): @@ -57,7 +57,7 @@ def test_simple_export(self, tmpdir): """ # Set params: {"int": 123, "float": 12.4, "string": "ala ma kota", "bool": True} - params = {"a": 123, "b": 12.4, "c": "ala ma kota", "d": True} + params = {"a": 123, "b": 12.4, "c": "ala ma kota"} module = TestNeuralModuleExport.MockupSimpleModule(**params) # Generate filename in the temporary directory. @@ -81,7 +81,7 @@ def test_simple_export(self, tmpdir): assert int(exported_init_params["a"]) == 123 assert float(exported_init_params["b"]) == 12.4 assert exported_init_params["c"] == "ala ma kota" - assert bool(exported_init_params["d"]) == True + assert bool(exported_init_params["d"]) == False @pytest.mark.unit def test_nested_list_export(self, tmpdir): diff --git a/tests/unit/core/test_deploy_export.py b/tests/unit/core/test_deploy_export.py index f44006d13095..9f4f29ff3e3f 100644 --- a/tests/unit/core/test_deploy_export.py +++ b/tests/unit/core/test_deploy_export.py @@ -18,6 +18,7 @@ import copy import os +from inspect import signature from collections import OrderedDict from pathlib import Path import urllib.request @@ -85,24 +86,19 @@ def __test_export_route(self, module, out_name, mode, input_example=None): module.eval() torch.manual_seed(1) + deploy_input_example = input_example if isinstance(input_example, OrderedDict): - outputs_fwd = module.forward(*tuple(input_example.values())) + deploy_input_example = tuple(input_example.values()) + if len(deploy_input_example) == 1: + deploy_input_example = deploy_input_example[0] elif isinstance(input_example, tuple): - outputs_fwd = module.forward(*input_example) - elif input_example is not None: - outputs_fwd = module.forward(input_example) - else: - outputs_fwd = None - - deploy_input_example = ( - tuple(input_example.values()) if isinstance(input_example, OrderedDict) else input_example - ) + deploy_input_example = input_example if len(input_example) > 1 else input_example[0] + + sig = signature(module.forward) + pnum = len(sig.parameters) + outputs_fwd = module.forward(*deploy_input_example) if pnum > 2 else module.forward(deploy_input_example) self.nf.deployment_export( - module=module, - output=out_name, - input_example=deploy_input_example, - d_format=mode, - output_example=outputs_fwd, + module=module, output=out_name, input_example=deploy_input_example, d_format=mode, output_example=None, ) assert out.exists() == True @@ -198,14 +194,8 @@ def __test_export_route(self, module, out_name, mode, input_example=None): elif mode == DF.ONNX: # Must recompute because *module* might be different now - if isinstance(input_example, OrderedDict): - outputs_fwd = module.forward(*tuple(input_example.values())) - elif isinstance(input_example, tuple): # or isinstance(input_example, list) - outputs_fwd = module.forward(*input_example) - elif input_example is not None: - outputs_fwd = module.forward(input_example) - else: - outputs_fwd = None + 
torch.manual_seed(1) + outputs_fwd = module.forward(*deploy_input_example) if pnum > 2 else module.forward(deploy_input_example) sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC @@ -236,6 +226,7 @@ def __test_export_route(self, module, out_name, mode, input_example=None): outputs_scr = torch.from_numpy(outputs_scr[0]).cuda() elif mode == DF.TORCHSCRIPT: tscr = torch.jit.load(out_name) + torch.manual_seed(1) outputs_scr = ( tscr.forward(*tuple(input_example.values())) if isinstance(input_example, OrderedDict) @@ -244,7 +235,8 @@ def __test_export_route(self, module, out_name, mode, input_example=None): ) ) elif mode == DF.PYTORCH: - module.load_state_dict(torch.load(out_name)) + module.restore_from(out_name) + torch.manual_seed(1) if isinstance(input_example, OrderedDict): outputs_scr = module.forward(*tuple(input_example.values())) elif isinstance(input_example, tuple) or isinstance(input_example, list): @@ -344,7 +336,18 @@ def test_hf_bert(self, tmpdir, df_type): @pytest.mark.unit @pytest.mark.run_only_on('GPU') - @pytest.mark.parametrize("df_type", [DF.ONNX, DF.TORCHSCRIPT, DF.PYTORCH]) + @pytest.mark.parametrize("df_type", [DF.TORCHSCRIPT, DF.PYTORCH]) + # + # TODO WaveGlow.infer uses torch.randn which is required to be seeded + # for deterministic results. It gets translated to ONNX op like this: + # + # %16020 = RandomNormalLike[dtype = 1](%16019) + # + # There is no way to seed it, thus to validate ONNX test flow + # please use torch.ones + # + # @pytest.mark.parametrize("df_type", [DF.ONNX, DF.TORCHSCRIPT, DF.PYTORCH]) + # def test_waveglow(self, tmpdir, df_type): url = "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ljspeech/versions/2/files/WaveGlowNM.pt" ptfile = "./WaveGlowNM.pt" @@ -357,12 +360,8 @@ def test_waveglow(self, tmpdir, df_type): torch.manual_seed(1) mel = torch.randn(1, 80, 96).cuda() - stride = 256 # value from waveglow upsample - n_group = 8 - z_size2 = (mel.size(2) * stride) // n_group - z = torch.randn(1, z_size2).cuda() - input_example = OrderedDict([("mel_spectrogram", mel), ("z", z)]) + input_example = OrderedDict([("mel_spectrogram", mel)]) tmp_file_name = str(tmpdir.mkdir("export").join("waveglow")) self.__test_export_route(module=module, out_name=tmp_file_name, mode=df_type, input_example=input_example) diff --git a/tests/unit/test_torch_backend.py b/tests/unit/test_torch_backend.py new file mode 100644 index 000000000000..f8ba63cf86b7 --- /dev/null +++ b/tests/unit/test_torch_backend.py @@ -0,0 +1,71 @@ +# ! /usr/bin/python +# -*- coding: utf-8 -*- + +# ============================================================================= +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import pytest +from numpy import array_equal + +from nemo.backends import get_state_dict, load, save, set_state_dict +from nemo.backends.pytorch.tutorials import TaylorNet + + +@pytest.mark.usefixtures("neural_factory") +class TestTorchBackend: + @pytest.mark.unit + def test_state_dict(self): + """ + Tests whether the get/set_state_dict proxy functions work properly. + """ + # Module. + fx = TaylorNet(dim=4) + + # Get state dict. + state_dict1 = get_state_dict(fx) + + # Set state dict. + set_state_dict(fx, state_dict1) + + # Compare state dicts. + state_dict2 = get_state_dict(fx) + for key in state_dict1.keys(): + assert array_equal(state_dict1[key].cpu().numpy(), state_dict2[key].cpu().numpy()) + + @pytest.mark.unit + def test_save_load(self, tmpdir): + """ + Tests whether the save and load proxy functions work properly. + + Args: + tmpdir: Fixture which will provide a temporary directory. + """ + # Module. + fx = TaylorNet(dim=4) + + # Generate filename in the temporary directory. + tmp_file_name = str(tmpdir.join("tsl_taylornet.chkpt")) + + # Save. + weights = get_state_dict(fx) + save(weights, tmp_file_name) + + # Load. + loaded_weights = load(tmp_file_name) + + # Compare state dicts. + for key in weights: + assert array_equal(weights[key].cpu().numpy(), loaded_weights[key].cpu().numpy())
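Taken together, the proxy functions exercised by this test give a backend-agnostic save/restore round trip. The sketch below mirrors the test as plain usage; the checkpoint path is a placeholder.

.. code-block:: python

    # Minimal usage sketch of the nemo.backends proxies covered by the test above.
    # The checkpoint path is hypothetical.
    from nemo.backends import get_state_dict, load, save, set_state_dict
    from nemo.backends.pytorch.tutorials import TaylorNet

    fx = TaylorNet(dim=4)

    # Save the module weights through the backend-agnostic helpers.
    save(get_state_dict(fx), "/tmp/taylornet.chkpt")

    # Restore them into a fresh instance of the same module.
    fx_restored = TaylorNet(dim=4)
    set_state_dict(fx_restored, load("/tmp/taylornet.chkpt"))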