Merge branch 'master' of github.com:NVIDIA/NeMo into dev-cv-image-classification
tkornuta-nvidia committed May 23, 2020
2 parents b165288 + 9127797 commit a58d3dd
Showing 133 changed files with 8,189 additions and 256 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -157,3 +157,7 @@ wandb
dump.py

docs/sources/source/test_build/

# Checkpoints, config files and temporary files created in tutorials.
examples/neural_graphs/*.chkpt
examples/neural_graphs/*.yml
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -78,6 +78,8 @@ To release a new version, please update the changelog as followed:
- Online audio augmentation notebook in ASR examples ([PR #605](https://github.com/NVIDIA/NeMo/pull/605)) - @titu1994
- ContextNet Encoder + Decoder Initial Support ([PR #630](https://github.com/NVIDIA/NeMo/pull/630)) - @titu1994
- Added finetuning with Megatron-LM ([PR #601](https://github.com/NVIDIA/NeMo/pull/601)) - @ekmb
- Added documentation for 8 kHz model ([PR #632](https://github.com/NVIDIA/NeMo/pull/632)) - @jbalam-nv


### Changed
- Syncs across workers at each step to check for NaN or inf loss. Terminates all workers if stop\_on\_nan\_loss is set (as before), lets Apex deal with it if apex.amp optimization level is O1 or higher, and skips the step across workers otherwise. ([PR #637](https://github.com/NVIDIA/NeMo/pull/637)) - @redoctopus
18 changes: 15 additions & 3 deletions Jenkinsfile
@@ -2,7 +2,7 @@ pipeline {
agent {
docker {
image 'nvcr.io/nvidia/pytorch:20.01-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home:/home -v $HOME/.cache/torch:/root/.cache/torch --shm-size=8g'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache/torch:/root/.cache/torch --shm-size=8g'
}
}
options {
@@ -193,11 +193,22 @@ pipeline {
}
stage ('Punctuation and Classification Training/Inference Test') {
steps {
sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=1 python punctuation_capitalization.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --work_dir punctuation_output --save_epoch_freq 1 --num_epochs 1 --save_step_freq -1 --batch_size 2'
sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=1 python punctuation_capitalization.py \
--data_dir /home/TestData/nlp/token_classification_punctuation/ --work_dir punctuation_output --save_epoch_freq 1 \
--num_epochs 1 --save_step_freq -1 --batch_size 2'
sh 'cd examples/nlp/token_classification && DATE_F=$(ls punctuation_output/) && DATA_DIR="/home/TestData/nlp/token_classification_punctuation" && CUDA_VISIBLE_DEVICES=1 python punctuation_capitalization_infer.py --checkpoint_dir punctuation_output/$DATE_F/checkpoints/ --punct_labels_dict $DATA_DIR/punct_label_ids.csv --capit_labels_dict $DATA_DIR/capit_label_ids.csv'
sh 'rm -rf examples/nlp/token_classification/punctuation_output'
}
}
stage('SGD Test') {
steps {
sh 'cd examples/nlp/dialogue_state_tracking && CUDA_VISIBLE_DEVICES=0 python dialogue_state_tracking_sgd.py \
--data_dir /home/TestData/nlp/sgd/ --schema_embedding_dir /home/TestData/nlp/sgd/embeddings/ --eval_dataset dev \
--dialogues_example_dir /home/TestData/nlp/sgd/dialogue_example_dir/ --work_dir sgd_output --task DEBUG \
--num_epochs 1 --save_epoch_freq=0'
sh 'rm -rf examples/nlp/dialogue_state_tracking/sgd_output'
}
}
}
}

@@ -355,7 +366,8 @@ pipeline {

post {
always {
sh "chmod -R 777 ."
cleanWs()
}
}
}
}
4 changes: 2 additions & 2 deletions docs/docs_zh/sources/source/speech_command/tutorial.rst
@@ -90,7 +90,7 @@ The QuartzNet model uses a fixed model definition pattern: QuartzNet-[BxR], where
process_classification_evaluation_epoch,
)
logging = nemo.logging
from nemo.utils import logging
# Lets define some hyper parameters
lr = 0.05
@@ -420,7 +420,7 @@ The QuartzNet model uses a fixed model definition pattern: QuartzNet-[BxR], where
import nemo
import nemo.collections.asr as nemo_asr
logging = nemo.logging
from nemo.utils import logging
# We add some
data_dir = '<path to the data directory>'
39 changes: 39 additions & 0 deletions docs/sources/source/asr/8kHz_models.rst
@@ -0,0 +1,39 @@
8kHz Models
===========

For applications based on telephony speech, models trained on narrowband audio sampled at 8 kHz may perform better than models built with
audio at a higher sampling rate. (Note that to use a model whose sample rate differs from your data, you would need to resample your data to match the sampling rate in the
model's config file.) One approach to creating a large dataset for training a model suited to your application is to convert all audio data
to the formats prevalent in that application. Here we detail one such approach that we took to train a model on 8 kHz data.

To train a model suitable for recognizing telephony speech, we converted some of our datasets to G.711 :cite:`8kHz-mod-itu1988g711`. G.711 is a popular speech codec used in VoIP products that encodes speech
at 64 kbps using PCM u-law companding. We converted audio from the LibriSpeech, Mozilla Common Voice and WSJ datasets to G.711 format and combined them with the Fisher and Switchboard datasets to
train a :ref:`Quartznet15x5 <Quartznet_model>` model on about 4000 hours of data. To convert your audio to G.711 format you can use the script `convert_wav_to_g711wav.py` found in the `scripts` sub-directory of the NeMo base directory.
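
If you want a quick way to experiment before using that script, the following is a minimal sketch (it is not NeMo's `convert_wav_to_g711wav.py`, whose options we do not reproduce here) that resamples a recording to 8 kHz and writes it as an 8-bit u-law WAV, approximating G.711 u-law companding. It assumes the `librosa` and `soundfile` packages are installed; the file names are placeholders.

.. code-block:: python

    # Minimal illustrative sketch: resample to 8 kHz and write a u-law WAV.
    # This is NOT NeMo's convert_wav_to_g711wav.py; file names are placeholders.
    import librosa
    import soundfile as sf

    audio, _ = librosa.load("input.wav", sr=8000, mono=True)   # resample to 8 kHz mono
    sf.write("output_ulaw.wav", audio, 8000, subtype="ULAW")   # 8-bit u-law encoded WAV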

Among the experiments we ran, we got the best accuracy with a model that used our 16 kHz Quartznet15x5 model's weights as pre-trained weights. We then
trained the model for 250 epochs on the five datasets mentioned above. Here are some results for our best model so far (note that all the test sets
were converted to G.711 format for the results below):

====================== =====================
Test set WER (%)
====================== =====================
LibriSpeech dev-clean 4.35
LibriSpeech dev-other 11.89
LibriSpeech test-clean 4.45
LibriSpeech test-other 12.02
Switchboard test 10.74
Switchboard dev 10.59
====================== =====================

The model was first pretrained with 8 kHz LibriSpeech data for 134 epochs and then trained for another 250 epochs using G.711 audio from all five datasets listed above. For best accuracy
in your application, you may choose to :ref:`fine-tune <fine-tune>` this model using data collected from your application.

..
The pre-trained model is available for download `here <https://ngc.nvidia.com/models/nvidian:nemo:quartznet_15x5_8_khz_for_nemo>`_.
References
----------
.. bibliography:: asr_all.bib
:style: plain
:labelprefix: 8kHz-mod
:keyprefix: 8kHz-mod-
12 changes: 8 additions & 4 deletions docs/sources/source/asr/asr_all.bib
@@ -60,8 +60,6 @@ @misc{ardila2019common
primaryClass={cs.CL}
}



@article{graves2012,
title={Sequence Transduction with Recurrent Neural Networks},
author={Graves, Alex},
@@ -927,8 +925,14 @@ @article{novograd2019
}

@article{kriman2019quartznet,
title={Quartznet: Deep automatic speech recognition with 1d time-channel separable convolutions},
title={Quartznet: {Deep} automatic speech recognition with 1d time-channel separable convolutions},
author={Kriman, Samuel and Beliaev, Stanislav and Ginsburg, Boris and Huang, Jocelyn and Kuchaiev, Oleksii and Lavrukhin, Vitaly and Leary, Ryan and Li, Jason and Zhang, Yang},
journal={arXiv preprint arXiv:1910.10261},
year={2019}
}
}

@misc{itu1988g711,
title={{ITU-T} {G.711} - {Pulse} code modulation ({PCM}) of voice frequencies},
author={ITU-T Geneva Switzerland},
year={1988},
}
2 changes: 2 additions & 0 deletions docs/sources/source/asr/intro.rst
@@ -10,6 +10,8 @@ Speech Recognition
tutorial
datasets
models
8kHz_models




7 changes: 7 additions & 0 deletions docs/sources/source/asr/jasper.rst
@@ -23,3 +23,10 @@ Jasper10x5dr | Librispeech, `here <https://ngc.nvidia.com/catalog/mode
| Switchboard
Jasper15x5SEP Aishell2 `here <https://ngc.nvidia.com/catalog/models/nvidia:aishell2_jasper10x5dr>`__
============= ======================= =================================================================================

References
^^^^^^^^^^
.. bibliography:: asr_all.bib
:style: plain
:labelprefix: ASR-MODELS
:keyprefix: asr-models-
7 changes: 0 additions & 7 deletions docs/sources/source/asr/models.rst
@@ -7,10 +7,3 @@ Models
jasper
quartznet

References
-------------

.. bibliography:: asr_all.bib
:style: plain
:labelprefix: ASR-MODELS
:keyprefix: asr-models-
13 changes: 8 additions & 5 deletions docs/sources/source/asr/quartznet.rst
@@ -1,19 +1,19 @@
.. _Quartznet_model:

QuartzNet
---------

QuartzNet is a version of Jasper :cite:`asr-models-li2019jasper` model with separable convolutions and larger filters. It can achieve performance
QuartzNet :cite:`qtz-models-kriman2019quartznet` is a version of Jasper :cite:`qtz-models-li2019jasper` model with separable convolutions and larger filters. It can achieve performance
similar to Jasper but with an order of magnitude fewer parameters.
Similarly to Jasper, the QuartzNet family of models is denoted as QuartzNet_[BxR], where B is the number of blocks and R is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, ReLU, and dropout:

.. image:: quartz_vertical.png
:align: center
:alt: quartznet model
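
To make the sub-block structure concrete, here is a rough PyTorch sketch of one such sub-block: a depthwise 1-D convolution followed by a pointwise convolution, then batch normalization, ReLU, and dropout. It is an illustration only, not the NeMo implementation, and the channel count, kernel size, and dropout rate are placeholder values.

.. code-block:: python

    import torch.nn as nn

    class SeparableConvSubBlock(nn.Module):
        # One QuartzNet-style sub-block: 1-D separable conv -> BN -> ReLU -> dropout.
        def __init__(self, channels: int, kernel_size: int, dropout: float = 0.2):
            super().__init__()
            # Depthwise conv (one filter per channel) + pointwise 1x1 conv = separable conv.
            self.depthwise = nn.Conv1d(channels, channels, kernel_size,
                                       padding=kernel_size // 2, groups=channels)
            self.pointwise = nn.Conv1d(channels, channels, kernel_size=1)
            self.bn = nn.BatchNorm1d(channels)
            self.act = nn.ReLU()
            self.drop = nn.Dropout(dropout)

        def forward(self, x):
            x = self.pointwise(self.depthwise(x))
            return self.drop(self.act(self.bn(x)))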

.. note:: This checkpoint was trained on LibriSpeech :cite:`panayotov2015librispeech` and full "validated" part of En Mozilla Common Voice :cite:`ardila2019common`

`QuartzNet paper <https://arxiv.org/abs/1910.10261>`_.

Pretrained models can be found, `here <https://ngc.nvidia.com/catalog/models/nvidia:quartznet15x5>`_.
Pretrained models can be found at the following links:

============= ===================== ==============================================================================
Network Dataset Download Link
@@ -24,7 +24,10 @@ QuartzNet15x5 Aishell2 `here <https://ngc.nvidia.com/catalog/models
============= ===================== ==============================================================================

References
----------
^^^^^^^^^^

.. bibliography:: asr_all.bib
:style: plain
:labelprefix: QTZ-MODELS
:keyprefix: qtz-models-

1 change: 1 addition & 0 deletions docs/sources/source/asr/tutorial.rst
@@ -288,6 +288,7 @@ The command above should trigger 8-GPU training with mixed precision. In the com
.. tip::
You can pass several manifests (comma-separated) to train on a combined dataset like this: `--train_manifest=/manifests/librivox-train-all.json,/manifests/librivox-train-all-sp10pcnt.json,/manifests/cv/validated.json`. Here it combines 3 data sets: LibriSpeech, Mozilla Common Voice and LibriSpeech speed perturbed.

.. _fine-tune:

Fine-tuning
-----------
1 change: 1 addition & 0 deletions docs/sources/source/conf.py
@@ -48,6 +48,7 @@
'soundfile',
'sentencepiece',
'youtokentome',
'megatron-lm',
]

# -- General configuration ------------------------------------------------
7 changes: 0 additions & 7 deletions docs/sources/source/speaker_recognition/models.rst
@@ -6,10 +6,3 @@ Models

quartznet

References
----------

.. bibliography:: speaker.bib
:style: plain
:labelprefix: SPEAKER-TUT
:keyprefix: speaker-tut-
8 changes: 0 additions & 8 deletions docs/sources/source/speaker_recognition/quartznet.rst
@@ -29,11 +29,3 @@ QuartzNet3x2 voxceleb1 ffsvc-dev 14.22% 7
voxceleb2
============== ================= ===================== ====================== ==========


References
----------

.. bibliography:: speaker.bib
:style: plain
:labelprefix: SPEAKER-TUT
:keyprefix: speaker-tut-
7 changes: 0 additions & 7 deletions docs/sources/source/speech_command/models.rst
@@ -6,10 +6,3 @@ Models

quartznet

References
-------------

.. bibliography:: speech_recognition_all.bib
:style: plain
:labelprefix: SPEECH-RECOGNITION-MODELS
:keyprefix: speech-recognition-models-
6 changes: 2 additions & 4 deletions docs/sources/source/speech_command/quartznet.rst
@@ -1,7 +1,7 @@
QuartzNet
---------

QuartzNet is a version of Jasper :cite:`asr-models-li2019jasper` model with separable convolutions and larger filters. It can achieve performance
QuartzNet is a version of Jasper :cite:`speech-recognition-models-li2019jasper` model with separable convolutions and larger filters. It can achieve performance
similar to Jasper but with an order of magnitude fewer parameters.
Similarly to Jasper, the QuartzNet family of models is denoted as QuartzNet_[BxR], where B is the number of blocks and R is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, ReLU, and dropout:

@@ -11,8 +11,6 @@ These models are trained on Google Speech Commands dataset (V1 - all 30 classes)
:align: center
:alt: quartznet model

.. note:: This checkpoint was trained on LibriSpeech :cite:`panayotov2015librispeech` and full "validated" part of En Mozilla Common Voice :cite:`ardila2019common`

`QuartzNet paper <https://arxiv.org/abs/1910.10261>`_.

These QuartzNet models were trained for 200 epochs using mixed precision on 2 GPUs with a batch size of 128.
@@ -32,7 +30,7 @@ QuartzNet3x2 (93k params) Speech Commands V2 97.29% Test


References
----------
^^^^^^^^^^

.. bibliography:: speech_recognition_all.bib
:style: plain
7 changes: 7 additions & 0 deletions docs/sources/source/speech_command/speech_recognition_all.bib
@@ -40,4 +40,11 @@ @article{park2019
year = "2019",
eid = {arXiv:1904.08779},
eprint = {1904.08779},
}

@article{li2019jasper,
title={Jasper: An End-to-End Convolutional Neural Acoustic Model},
author={Li, Jason and Lavrukhin, Vitaly and Ginsburg, Boris and Leary, Ryan and Kuchaiev, Oleksii and Cohen, Jonathan M and Nguyen, Huyen and Gadde, Ravi Teja},
journal={arXiv preprint arXiv:1904.03288},
year={2019}
}
4 changes: 2 additions & 2 deletions docs/sources/source/speech_command/tutorial.rst
@@ -111,7 +111,7 @@ The script below does both training and evaluation (on V1 dataset) on single GPU
process_classification_evaluation_epoch,
)
logging = nemo.logging
from nemo.utils import logging
# Lets define some hyper parameters
lr = 0.05
@@ -447,7 +447,7 @@ but they can similarly be used for v2 dataset.
import nemo
import nemo.collections.asr as nemo_asr
logging = nemo.logging
from nemo.utils import logging
# We add some
data_dir = '<path to the data directory>'
1 change: 1 addition & 0 deletions docs/sources/source/tutorials/intro.rst
@@ -12,3 +12,4 @@ Getting started
weightsharing
callbacks
complex_training
neural_graphs
54 changes: 54 additions & 0 deletions docs/sources/source/tutorials/neural_graphs.rst
@@ -0,0 +1,54 @@
Neural Graphs
=============

The Neural Graph is a high-level abstraction that empowers the user to build graphs consisting of many
interconnected Neural Modules.
Once the user defines a graph, its topology is “frozen”, i.e. connections between modules cannot change.
If the user wants to change the topology, they can build another graph, potentially spanning the same modules.
At the same time, one graph can be reused and nested inside another.


.. figure:: neural_graphs_general.png

The import/export/save/restore options combined with the lightweight API make Neural Graphs
a perfect tool for rapid prototyping and experimentation.

There are two Jupyter Notebook tutorials focusing on different aspects of the Neural Graphs functionality.

Tutorial I: The basic functionality
-----------------------------------

In this first part of the Neural Graphs (NGs) tutorial we focus on a simple example:
training a TaylorNet module to approximate a sine wave function.
We will build a simple "model graph" and show how we can nest it into other graphs.


.. figure:: neural_graphs_nesting.png

This part covers the following (a minimal illustrative sketch follows the list):
* how to create a Neural Graph object
* how to activate/deactivate the graph context (in various ways)
* how to bind NG inputs and outputs (in various ways)
* how to nest one graph (representing our "trainable model") into training and validation graphs
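
The sketch below shows, in rough form, what composing such a training graph could look like. It is illustrative only: the import paths, class names, and toy modules used here (`NeuralGraph`, `OperationMode`, `nemo.tutorials.RealFunctionDataLayer`, `nemo.tutorials.TaylorNet`, `nemo.tutorials.MSELoss`) are assumptions based on the NeMo 0.x tutorial collection, so please refer to the notebooks for the authoritative API.

.. code-block:: python

    # Illustrative sketch only -- class and module names are assumptions;
    # see the notebooks in examples/neural_graphs for the actual API.
    import nemo
    from nemo.core import NeuralGraph, OperationMode  # assumed import path

    nf = nemo.core.NeuralModuleFactory()

    # Toy modules from the NeMo tutorial collection (assumed names).
    dl = nemo.tutorials.RealFunctionDataLayer(n=10000, batch_size=128)
    fx = nemo.tutorials.TaylorNet(dim=4)
    loss = nemo.tutorials.MSELoss()

    # Connections made inside the context become the graph's (frozen) topology.
    with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
        x, t = dl()
        p = fx(x=x)
        lss = loss(predictions=p, target=t)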


Tutorial II: The advanced functionality
---------------------------------------

In this second part of the Neural Graphs (NGs) tutorial we focus on a more complex example:
training an End-to-End Convolutional Neural Acoustic Model called JASPER.
We will build a "model graph" and show how we can nest it into other graphs, how we can freeze/unfreeze modules,
use the graph configuration, and save/load graph checkpoints.

This part covers the following:
* how to nest one graph into another
* how to serialize and deserialize a graph
* how to export and import serialized graph configuration to/from YAML files
* how to save and load graph checkpoints (containing weights of the Trainable NMs)
* how to freeze/unfreeze modules in a graph

Additionally, we will show how to use `AppState` to list all the modules and graphs we have created in the scope of
our application.

.. note::
Both tutorial notebooks can be found in the `nemo/examples/neural_graphs` folder.
3 changes: 1 addition & 2 deletions examples/applications/asr_service/app/__init__.py
@@ -7,8 +7,7 @@

import nemo
import nemo.collections.asr as nemo_asr

logging = nemo.logging
from nemo.utils import logging

app = Flask(__name__)
# make sure WORK_DIR exists before calling your service