Merge branch 'main' into streaming_asr

NVIDIA · Jun 18, 2021 · 4351694 · 4351694
2 parents c5826a2 + e070e04
commit 4351694
Show file tree

Hide file tree

Showing 134 changed files with 4,190 additions and 365 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:21.03-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:21.05-py3
 
 
 # build an image that includes only the nemo dependencies, ensures that dependencies

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -1,7 +1,7 @@
 pipeline {
   agent {
         docker {
-      image 'nvcr.io/nvidia/pytorch:21.03-py3'
+      image 'nvcr.io/nvidia/pytorch:21.05-py3'
       args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache/torch:/root/.cache/torch --shm-size=8g'
         }
   }
@@ -66,7 +66,7 @@ pipeline {
 
     stage('L0: Unit Tests GPU') {
       steps {
-        sh 'pytest -m "not pleasefixme" --with_downloads'
+        sh 'pytest -m "not pleasefixme" --with_downloads --relax_numba_compat'
       }
     }
 
@@ -78,7 +78,7 @@ pipeline {
         }
       }
       steps {
-        sh 'CUDA_VISIBLE_DEVICES="" pytest -m "not pleasefixme" --cpu --with_downloads'
+        sh 'CUDA_VISIBLE_DEVICES="" pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat'
       }
     }
 
@@ -288,8 +288,8 @@ pipeline {
             model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
             model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
             model.tokenizer.type="wpe" \
-            model.train_ds.batch_size=10 \
-            model.validation_ds.batch_size=10 \
+            model.train_ds.batch_size=4 \
+            model.validation_ds.batch_size=4 \
             trainer.gpus=[1] \
             +trainer.fast_dev_run=True \
             exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results'
@@ -348,43 +348,47 @@ pipeline {
     }
 
 //  TODO: UNCOMMENT TESTS AFTER 21.04 release (numba 0.53 min requirement)
-//     stage('L2: ASR RNNT dev run') {
-//       when {
-//         anyOf {
-//           branch 'main'
-//           changeRequest target: 'main'
-//         }
-//       }
-//       failFast true
-//       parallel {
-//         stage('Speech to Text - RNNT') {
-//           steps {
-//             sh 'python examples/asr/speech_to_text_rnnt.py \
-//             model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-//             model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-//             model.train_ds.batch_size=8 \
-//             trainer.gpus=[0] \
-//             +trainer.fast_dev_run=True \
-//             exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_results'
-//             sh 'rm -rf examples/asr/speech_to_text_rnnt_results'
-//           }
-//         }
-//         stage('L2: Speech to Text RNNT WPE') {
-//           steps {
-//             sh 'python examples/asr/speech_to_text_rnnt_bpe.py \
-//             --config-path="experimental/contextnet_rnnt/" --config-name="config_rnnt_bpe.yaml" \
-//             model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
-//             model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
-//             model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
-//             model.tokenizer.type="wpe" \
-//             trainer.gpus=[0] \
-//             +trainer.fast_dev_run=True \
-//             exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_wpe_results'
-//             sh 'rm -rf examples/asr/speech_to_text_rnnt_wpe_results'
-//           }
-//         }
-//       }
-//     }
+    stage('L2: ASR RNNT dev run') {
+      when {
+        anyOf {
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      parallel {
+        stage('Speech to Text - RNNT') {
+          steps {
+            sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/speech_to_text_rnnt.py \
+            --config-path="experimental/contextnet_rnnt/" --config-name="config_rnnt.yaml" \
+            model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+            model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+            model.train_ds.batch_size=2 \
+            model.validation_ds.batch_size=2 \
+            trainer.gpus=[0] \
+            +trainer.fast_dev_run=True \
+            exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_results'
+            sh 'rm -rf examples/asr/speech_to_text_rnnt_results'
+          }
+        }
+        stage('L2: Speech to Text RNNT WPE') {
+          steps {
+            sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/speech_to_text_rnnt_bpe.py \
+            --config-path="experimental/contextnet_rnnt/" --config-name="config_rnnt_bpe.yaml" \
+            model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+            model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+            model.train_ds.batch_size=2 \
+            model.validation_ds.batch_size=2 \
+            model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
+            model.tokenizer.type="wpe" \
+            trainer.gpus=[0] \
+            +trainer.fast_dev_run=True \
+            exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_wpe_results'
+            sh 'rm -rf examples/asr/speech_to_text_rnnt_wpe_results'
+          }
+        }
+      }
+    }
 
     stage('L2: ASR Multi-dataloader dev run') {
       when {
@@ -1261,6 +1265,39 @@ pipeline {
       }
     }
 
+    stage('L2: NMT Megatron Model Parallel Size 2 Encoder') {
+      when {
+        anyOf{
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      steps{
+        sh 'cd examples/nlp/machine_translation && \
+        python enc_dec_nmt.py \
+        --config-path=conf \
+        --config-name=megatron \
+        model.encoder.model_name=megatron-bert-uncased \
+        model.encoder.checkpoint_file=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
+        model.encoder.hidden_size=1024 \
+        model.encoder.num_attention_heads=16 \
+        model.encoder.num_layers=24 \
+        model.encoder.max_position_embeddings=512 \
+        model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+        model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+        model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+        model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+        model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+        model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+        model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
+        trainer.gpus=[0,1] \
+        +trainer.fast_dev_run=true \
+        exp_manager=null \
+        '
+      }
+    }
+
     stage('L2: NMT Tarred Dataset Creation') {
       when {
         anyOf {
@@ -1302,7 +1339,7 @@ pipeline {
               python create_tarred_parallel_dataset.py \
               --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
               --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
-              --out_dir $PWD/preproc_out_dir \
+              --out_dir $PWD/out_dir \
               --encoder_tokenizer_vocab_size=2000 \
               --decoder_tokenizer_vocab_size=2000 \
               --tokens_in_batch=1000 \

diff --git a/README.rst b/README.rst
@@ -150,13 +150,13 @@ Use this installation mode if you are contributing to NeMo.
 Docker containers:
 ~~~~~~~~~~~~~~~~~~
 
-If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 21.03-py3 and then installing from GitHub.
+If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 21.05-py3 and then installing from GitHub.
 
 .. code-block:: bash
 
     docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
     -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
-    stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:21.03-py3
+    stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:21.05-py3
 
 Examples
 --------

diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css
@@ -42,7 +42,7 @@ a
 
 a:visited
 {
-    color: #b6b6b6;
+    color: #218219;
 }
 
 

diff --git a/docs/source/asr/asr_language_modeling.rst b/docs/source/asr/asr_language_modeling.rst
@@ -169,7 +169,7 @@ Width of the beam search (`--beam_width`) specifies the number of top candidates
 would search for. Larger beams result in more accurate but slower predictions.
 
 There is also a tutorial to learn more about evaluating the ASR models with N-gram LM here:
-`Offline ASR Inference with Beam Search and External Language Model Rescoring <https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.0/tutorials/asr/Offline_ASR.ipynb>`_
+`Offline ASR Inference with Beam Search and External Language Model Rescoring <https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/Offline_ASR.ipynb>`_
 
 Hyperparameter Grid Search
 --------------------------

diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst
@@ -10,7 +10,7 @@ for audio files, parameters for any augmentation being performed, as well as the
 this page cover each of these in more detail.
 
 Example configuration files for all of the NeMo ASR scripts can be found in the
-`config directory of the examples <https://github.com/NVIDIA/NeMo/tree/v1.0.0/examples/asr/conf>`_.
+`config directory of the examples <https://github.com/NVIDIA/NeMo/tree/v1.0.2/examples/asr/conf>`_.
 
 
 Dataset Configuration

diff --git a/docs/source/asr/data/asr_results.csv b/docs/source/asr/data/asr_results.csv
diff --git a/docs/source/asr/data/benchmark_en.csv b/docs/source/asr/data/benchmark_en.csv
@@ -1,11 +1,12 @@
 Model Name,Model Base Class,Model Card
 QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
-stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
 stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
 stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
 stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
 stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
-stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
+stt_en_citrinet_256_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25"
+stt_en_citrinet_512_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25"
+stt_en_citrinet_1024_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25"
 stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
 stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
 stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"

diff --git a/docs/source/asr/data/benchmark_es.csv b/docs/source/asr/data/benchmark_es.csv
@@ -1,3 +1,3 @@
 Model,Model Base Class,Model Card
 stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
-
+stt_es_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512"
diff --git a/docs/source/asr/data/benchmark_zh.csv b/docs/source/asr/data/benchmark_zh.csv
@@ -1,3 +1,2 @@
 Model,Model Base Class,Model Card
-stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
 stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
@@ -235,7 +235,7 @@ Conversion to Tarred Datasets
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 You can easily convert your existing NeMo-compatible ASR datasets using the
-`conversion script here <https://github.com/NVIDIA/NeMo/blob/v1.0.0/scripts/speech_recognition/convert_to_tarred_audio_dataset.py>`_.
+`conversion script here <https://github.com/NVIDIA/NeMo/blob/v1.0.2/scripts/speech_recognition/convert_to_tarred_audio_dataset.py>`_.
 
 .. code::
 

diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst
@@ -43,8 +43,8 @@ The full documentation tree is as follows:
 Resources and Documentation
 ---------------------------
 
-Hands-on speech recognition tutorial notebooks can be found under `the ASR tutorials folder <https://github.com/NVIDIA/NeMo/tree/v1.0.0/tutorials/asr/>`_.
-If you are a beginner to NeMo, consider trying out the `ASR with NeMo <https://github.com/NVIDIA/NeMo/tree/v1.0.0/tutorials/asr/01_ASR_with_NeMo.ipynb>`_ tutorial.
+Hands-on speech recognition tutorial notebooks can be found under `the ASR tutorials folder <https://github.com/NVIDIA/NeMo/tree/v1.0.2/tutorials/asr/>`_.
+If you are a beginner to NeMo, consider trying out the `ASR with NeMo <https://github.com/NVIDIA/NeMo/tree/v1.0.2/tutorials/asr/01_ASR_with_NeMo.ipynb>`_ tutorial.
 This and most other tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab.
 
 If you are looking for information about a particular ASR model, or would like to find out more about the model

diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst
@@ -71,14 +71,16 @@ To perform inference and transcribe a sample of speech after loading the model,
 Setting the argument ``logprobs`` to ``True`` returns the log probabilities instead of transcriptions. For more information, see :doc:`./api.html#modules`.
 The audio files should be 16KHz monochannel wav files.
 
+Fine-tuning on Different Datasets
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are multiple ASR tutorials provided in the :ref:`Tutorials <tutorials>` section. Most of these tutorials explain how to instantiate a pre-trained model and prepare it for fine-tuning on some dataset (in the same language) as a demonstration.
+
+
 Automatic Speech Recognition Models
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-----------------------------------
 
-.. csv-table::
-   :file: data/asr_results.csv
-   :align: left
-   :widths: 30, 30, 40
-   :header-rows: 1
+Below is a list of all the ASR models that are available in NeMo for specific languages, as well as auxiliary language models for certain languages.
 
 Language Models for ASR
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -89,7 +91,8 @@ Language Models for ASR
    :widths: 30, 30, 40
    :header-rows: 1
 
-
+|
+
 Speech Recognition (Languages)
 ------------------------------
 

diff --git a/docs/source/asr/speaker_recognition/intro.rst b/docs/source/asr/speaker_recognition/intro.rst
@@ -28,7 +28,7 @@ Resource and Documentation Guide
 --------------------------------
 
 Hands-on speaker recognition tutorial notebooks can be found under
-`the speaker recognition tutorials folder <https://github.com/NVIDIA/NeMo/tree/v1.0.0/tutorials/speaker_recognition/>`_. This and most other tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab.
+`the speaker recognition tutorials folder <https://github.com/NVIDIA/NeMo/tree/v1.0.2/tutorials/speaker_recognition/>`_. This and most other tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab.
 
 If you are looking for information about a particular SpeakerNet model, or would like to find out more about the model
 architectures available in the ``nemo_asr`` collection, check out the :doc:`Models <./models>` page.
@@ -44,5 +44,5 @@ Documentation for configuration files specific to the ``nemo_asr`` models can be
 :doc:`Configuration Files <./configs>` page.
 
 
-For a clear step-by-step tutorial we advice you to refer tutorials found in `folder <https://github.com/NVIDIA/NeMo/tree/v1.0.0/tutorials/speaker_recognition/>`_.
+For a clear step-by-step tutorial, we advise you to refer to the tutorials found in `folder <https://github.com/NVIDIA/NeMo/tree/v1.0.2/tutorials/speaker_recognition/>`_.
-Original file line number
+Diff line change
@@ Expand Up / @@ -42,7 +42,7 @@ a @@
     a:visited
     {
-        color: #b6b6b6;
+        color: #218219;
     }
@@ Expand Down @@